diff --git a/.dockerignore b/.dockerignore index 7ead48db7c..9fafc2e4ba 100644 --- a/.dockerignore +++ b/.dockerignore @@ -14,6 +14,7 @@ !compute/ !compute_tools/ !control_plane/ +!docker-compose/ext-src !libs/ !pageserver/ !pgxn/ diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md deleted file mode 100644 index 44b3094c24..0000000000 --- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md +++ /dev/null @@ -1,21 +0,0 @@ -## Release 202Y-MM-DD - -**NB: this PR must be merged only by 'Create a merge commit'!** - -### Checklist when preparing for release -- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b) -- [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers? -- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan? - - - -### Checklist after release -- [ ] Make sure instructions from PRs included in this release and labeled `manual_release_instructions` are executed (either by you or by people who wrote them). 
-- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files)) -- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel -- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true) -- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some) -- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1) -- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time) - - diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1e6c2d0aa2..39a30d9a39 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -32,3 +32,6 @@ config-variables: - NEON_DEV_AWS_ACCOUNT_ID - NEON_PROD_AWS_ACCOUNT_ID - AWS_ECR_REGION + - BENCHMARK_LARGE_OLTP_PROJECTID + - SLACK_ON_CALL_DEVPROD_STREAM + - SLACK_RUST_CHANNEL_ID diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index 9f752d5a89..71dd6f3af2 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -84,7 +84,13 @@ runs: --header "Authorization: Bearer ${API_KEY}" ) - role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name') + role_name=$(echo "$roles" | jq --raw-output ' + (.roles | map(select(.protected == false))) as $roles | + if any($roles[]; .name == "neondb_owner") + then "neondb_owner" + else $roles[0].name + end + ') echo "role_name=${role_name}" >> $GITHUB_OUTPUT env: API_HOST: ${{ inputs.api_host }} @@ -107,13 +113,13 @@ runs: ) if [ -z "${reset_password}" ]; then - 
sleep 1 + sleep $i continue fi password=$(echo $reset_password | jq --raw-output '.role.password') if [ "${password}" == "null" ]; then - sleep 1 + sleep $i # increasing backoff continue fi diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 122fe48b68..fa6f882161 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -44,6 +44,11 @@ inputs: description: 'Postgres version to use for tests' required: false default: 'v16' + sanitizers: + description: 'enabled or disabled' + required: false + default: 'disabled' + type: string benchmark_durations: description: 'benchmark durations JSON' required: false @@ -59,7 +64,7 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} @@ -112,6 +117,7 @@ runs: ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') RERUN_FAILED: ${{ inputs.rerun_failed }} PG_VERSION: ${{ inputs.pg_version }} + SANITIZERS: ${{ inputs.sanitizers }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report diff --git a/.github/scripts/generate_image_maps.py b/.github/scripts/generate_image_maps.py new file mode 100644 index 0000000000..f67e07024c --- /dev/null +++ b/.github/scripts/generate_image_maps.py @@ -0,0 +1,63 @@ +import itertools +import json +import os +import sys + +source_tag = os.getenv("SOURCE_TAG") +target_tag = os.getenv("TARGET_TAG") +branch = os.getenv("BRANCH") +dev_acr = os.getenv("DEV_ACR") +prod_acr = os.getenv("PROD_ACR") +dev_aws = os.getenv("DEV_AWS") +prod_aws = os.getenv("PROD_AWS") +aws_region 
= os.getenv("AWS_REGION") + +components = { + "neon": ["neon"], + "compute": [ + "compute-node-v14", + "compute-node-v15", + "compute-node-v16", + "compute-node-v17", + "vm-compute-node-v14", + "vm-compute-node-v15", + "vm-compute-node-v16", + "vm-compute-node-v17", + ], +} + +registries = { + "dev": [ + "docker.io/neondatabase", + "ghcr.io/neondatabase", + f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", + f"{dev_acr}.azurecr.io/neondatabase", + ], + "prod": [ + f"{prod_aws}.dkr.ecr.{aws_region}.amazonaws.com", + f"{prod_acr}.azurecr.io/neondatabase", + ], +} + +outputs: dict[str, dict[str, list[str]]] = {} + +target_tags = [target_tag, "latest"] if branch == "main" else [target_tag] +target_stages = ( + ["dev", "prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"] +) + +for component_name, component_images in components.items(): + for stage in target_stages: + outputs[f"{component_name}-{stage}"] = { + f"docker.io/neondatabase/{component_image}:{source_tag}": [ + f"{registry}/{component_image}:{tag}" + for registry, tag in itertools.product(registries[stage], target_tags) + if not (registry == "docker.io/neondatabase" and tag == source_tag) + ] + for component_image in component_images + } + +with open(os.getenv("GITHUB_OUTPUT", "/dev/null"), "a") as f: + for key, value in outputs.items(): + f.write(f"{key}={json.dumps(value)}\n") + print(f"Image map for {key}:\n{json.dumps(value, indent=2)}\n\n", file=sys.stderr) diff --git a/.github/scripts/lint-release-pr.sh b/.github/scripts/lint-release-pr.sh new file mode 100755 index 0000000000..6dc5b99f0e --- /dev/null +++ b/.github/scripts/lint-release-pr.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash + +set -euo pipefail + +DOCS_URL="https://docs.neon.build/overview/repositories/neon.html" + +message() { + if [[ -n "${GITHUB_PR_NUMBER:-}" ]]; then + gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --edit-last --body "$1" \ + || gh pr comment --repo "${GITHUB_REPOSITORY}" 
"${GITHUB_PR_NUMBER}" --body "$1" + fi + echo "$1" +} + +report_error() { + message "❌ $1 + For more details, see the documentation: ${DOCS_URL}" + + exit 1 +} + +case "$RELEASE_BRANCH" in + "release") COMPONENT="Storage" ;; + "release-proxy") COMPONENT="Proxy" ;; + "release-compute") COMPONENT="Compute" ;; + *) + report_error "Unknown release branch: ${RELEASE_BRANCH}" + ;; +esac + + +# Identify main and release branches +MAIN_BRANCH="origin/main" +REMOTE_RELEASE_BRANCH="origin/${RELEASE_BRANCH}" + +# Find merge base +MERGE_BASE=$(git merge-base "${MAIN_BRANCH}" "${REMOTE_RELEASE_BRANCH}") +echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}" + +# Get the HEAD commit (last commit in PR, expected to be the merge commit) +LAST_COMMIT=$(git rev-parse HEAD) + +MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}") +EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$" + +if ! [[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then + report_error "Merge commit message does not match expected pattern: ' release YYYY-MM-DD' + Expected component: ${COMPONENT} + Found: '${MERGE_COMMIT_MESSAGE}'" +fi +echo "✅ Merge commit message is correctly formatted: '${MERGE_COMMIT_MESSAGE}'" + +LAST_COMMIT_PARENTS=$(git cat-file -p "${LAST_COMMIT}" | jq -sR '[capture("parent (?[0-9a-f]{40})"; "g") | .parent]') + +if [[ "$(echo "${LAST_COMMIT_PARENTS}" | jq 'length')" -ne 2 ]]; then + report_error "Last commit must be a merge commit with exactly two parents" +fi + +EXPECTED_RELEASE_HEAD=$(git rev-parse "${REMOTE_RELEASE_BRANCH}") +if echo "${LAST_COMMIT_PARENTS}" | jq -e --arg rel "${EXPECTED_RELEASE_HEAD}" 'index($rel) != null' > /dev/null; then + LINEAR_HEAD=$(echo "${LAST_COMMIT_PARENTS}" | jq -r '[.[] | select(. 
!= $rel)][0]' --arg rel "${EXPECTED_RELEASE_HEAD}") +else + report_error "Last commit must merge the release branch (${RELEASE_BRANCH})" +fi +echo "✅ Last commit correctly merges the previous commit and the release branch" +echo "Top commit of linear history: ${LINEAR_HEAD}" + +MERGE_COMMIT_TREE=$(git rev-parse "${LAST_COMMIT}^{tree}") +LINEAR_HEAD_TREE=$(git rev-parse "${LINEAR_HEAD}^{tree}") + +if [[ "${MERGE_COMMIT_TREE}" != "${LINEAR_HEAD_TREE}" ]]; then + report_error "Tree of merge commit (${MERGE_COMMIT_TREE}) does not match tree of linear history head (${LINEAR_HEAD_TREE}) + This indicates that the merge of ${RELEASE_BRANCH} into this branch was not performed using the merge strategy 'ours'" +fi +echo "✅ Merge commit tree matches the linear history head" + +EXPECTED_PREVIOUS_COMMIT="${LINEAR_HEAD}" + +# Now traverse down the history, ensuring each commit has exactly one parent +CURRENT_COMMIT="${EXPECTED_PREVIOUS_COMMIT}" +while [[ "${CURRENT_COMMIT}" != "${MERGE_BASE}" && "${CURRENT_COMMIT}" != "${EXPECTED_RELEASE_HEAD}" ]]; do + CURRENT_COMMIT_PARENTS=$(git cat-file -p "${CURRENT_COMMIT}" | jq -sR '[capture("parent (?[0-9a-f]{40})"; "g") | .parent]') + + if [[ "$(echo "${CURRENT_COMMIT_PARENTS}" | jq 'length')" -ne 1 ]]; then + report_error "Commit ${CURRENT_COMMIT} must have exactly one parent" + fi + + NEXT_COMMIT=$(echo "${CURRENT_COMMIT_PARENTS}" | jq -r '.[0]') + + if [[ "${NEXT_COMMIT}" == "${MERGE_BASE}" ]]; then + echo "✅ Reached merge base (${MERGE_BASE})" + PR_BASE="${MERGE_BASE}" + elif [[ "${NEXT_COMMIT}" == "${EXPECTED_RELEASE_HEAD}" ]]; then + echo "✅ Reached release branch (${EXPECTED_RELEASE_HEAD})" + PR_BASE="${EXPECTED_RELEASE_HEAD}" + elif [[ -z "${NEXT_COMMIT}" ]]; then + report_error "Unexpected end of commit history before reaching merge base" + fi + + # Move to the next commit in the chain + CURRENT_COMMIT="${NEXT_COMMIT}" +done + +echo "✅ All commits are properly ordered and linear" +echo "✅ Release PR structure is valid" + +echo + 
+message "Commits that are part of this release: +$(git log --oneline "${PR_BASE}..${LINEAR_HEAD}")" diff --git a/.github/scripts/previous-releases.jq b/.github/scripts/previous-releases.jq new file mode 100644 index 0000000000..51204da099 --- /dev/null +++ b/.github/scripts/previous-releases.jq @@ -0,0 +1,31 @@ +# Expects response from https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#list-releases as input, +# with tag names `release` for storage, `release-compute` for compute and `release-proxy` for proxy releases. +# Extract only the `tag_name` field from each release object +[ .[].tag_name ] + +# Transform each tag name into a structured object using regex capture +| reduce map( + capture("^(?release(-(?proxy|compute))?-(?\\d+))$") + | { + component: (.component // "storage"), # Default to "storage" if no component is specified + version: (.version | tonumber), # Convert the version number to an integer + full: .full # Store the full tag name for final output + } + )[] as $entry # Loop over the transformed list + +# Accumulate the latest (highest-numbered) version for each component +({}; + .[$entry.component] |= (if . == null or $entry.version > .version then $entry else . end)) + +# Ensure that each component exists, or fail +| (["storage", "compute", "proxy"] - (keys)) as $missing +| if ($missing | length) > 0 then + "Error: Found no release for \($missing | join(", "))!\n" | halt_error(1) + else . 
end + +# Convert the resulting object into an array of formatted strings +| to_entries +| map("\(.key)=\(.value.full)") + +# Output each string separately +| .[] diff --git a/scripts/push_with_image_map.py b/.github/scripts/push_with_image_map.py similarity index 100% rename from scripts/push_with_image_map.py rename to .github/scripts/push_with_image_map.py diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 3740e6dc9c..6a2070424a 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -280,7 +280,7 @@ jobs: - name: Upload Neon artifact uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -337,7 +337,7 @@ jobs: - name: Pytest regression tests continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set - timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 60 || 180 }} + timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress @@ -347,6 +347,7 @@ jobs: real_s3_region: eu-central-1 rerun_failed: true pg_version: ${{ matrix.pg_version }} + sanitizers: ${{ inputs.sanitizers }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. 
# Attempt to stop tests gracefully to generate test reports @@ -359,7 +360,6 @@ jobs: PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} - SANITIZERS: ${{ inputs.sanitizers }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml index 3c130c8229..9b1d1aa454 100644 --- a/.github/workflows/_create-release-pr.yml +++ b/.github/workflows/_create-release-pr.yml @@ -7,8 +7,8 @@ on: description: 'Component name' required: true type: string - release-branch: - description: 'Release branch' + source-branch: + description: 'Source branch' required: true type: string secrets: @@ -30,17 +30,25 @@ jobs: steps: - uses: actions/checkout@v4 with: - ref: main + ref: ${{ inputs.source-branch }} + fetch-depth: 0 - name: Set variables id: vars env: COMPONENT_NAME: ${{ inputs.component-name }} - RELEASE_BRANCH: ${{ inputs.release-branch }} + RELEASE_BRANCH: >- + ${{ + false + || inputs.component-name == 'Storage' && 'release' + || inputs.component-name == 'Proxy' && 'release-proxy' + || inputs.component-name == 'Compute' && 'release-compute' + }} run: | today=$(date +'%Y-%m-%d') echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT} echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT} + echo "release-branch=${RELEASE_BRANCH}" | tee -a ${GITHUB_OUTPUT} - name: Configure git run: | @@ -49,31 +57,36 @@ jobs: - name: Create RC branch env: + RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} TITLE: ${{ steps.vars.outputs.title }} run: | - git checkout -b "${RC_BRANCH}" + git switch -c "${RC_BRANCH}" - # create an empty commit to distinguish workflow runs - # from other possible releases from the same commit - git 
commit --allow-empty -m "${TITLE}" + # Manually create a merge commit on the current branch, keeping the + # tree and setting the parents to the current HEAD and the HEAD of the + # release branch. This commit is what we'll fast-forward the release + # branch to when merging the release branch. + # For details on why, look at + # https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs + current_tree=$(git rev-parse 'HEAD^{tree}') + release_head=$(git rev-parse "origin/${RELEASE_BRANCH}") + current_head=$(git rev-parse HEAD) + merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}") + + # Fast-forward the current branch to the newly created merge_commit + git merge --ff-only ${merge_commit} git push origin "${RC_BRANCH}" - - name: Create a PR into ${{ inputs.release-branch }} + - name: Create a PR into ${{ steps.vars.outputs.release-branch }} env: GH_TOKEN: ${{ secrets.ci-access-token }} RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} - RELEASE_BRANCH: ${{ inputs.release-branch }} + RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} TITLE: ${{ steps.vars.outputs.title }} run: | - cat << EOF > body.md - ## ${TITLE} - - **Please merge this Pull Request using 'Create a merge commit' button** - EOF - gh pr create --title "${TITLE}" \ - --body-file "body.md" \ + --body "" \ --head "${RC_BRANCH}" \ --base "${RELEASE_BRANCH}" diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml new file mode 100644 index 0000000000..bb2f9fa5d9 --- /dev/null +++ b/.github/workflows/_meta.yml @@ -0,0 +1,129 @@ +name: Generate run metadata +on: + workflow_call: + inputs: + github-event-name: + type: string + required: true + outputs: + build-tag: + description: "Tag for the current workflow run" + value: ${{ jobs.tags.outputs.build-tag }} + previous-storage-release: + description: "Tag of the last storage release" + value: ${{ jobs.tags.outputs.storage }} + 
previous-proxy-release: + description: "Tag of the last proxy release" + value: ${{ jobs.tags.outputs.proxy }} + previous-compute-release: + description: "Tag of the last compute release" + value: ${{ jobs.tags.outputs.compute }} + run-kind: + description: "The kind of run we're currently in. Will be one of `push-main`, `storage-release`, `compute-release`, `proxy-release`, `storage-rc-pr`, `compute-rc-pr`, `proxy-rc-pr`, `pr`, or `workflow-dispatch`" + value: ${{ jobs.tags.outputs.run-kind }} + release-pr-run-id: + description: "Only available if `run-kind in [storage-release, proxy-release, compute-release]`. Contains the run ID of the `Build and Test` workflow, assuming one with the current commit can be found." + value: ${{ jobs.tags.outputs.release-pr-run-id }} + +permissions: {} + +defaults: + run: + shell: bash -euo pipefail {0} + +jobs: + tags: + runs-on: ubuntu-22.04 + outputs: + build-tag: ${{ steps.build-tag.outputs.tag }} + compute: ${{ steps.previous-releases.outputs.compute }} + proxy: ${{ steps.previous-releases.outputs.proxy }} + storage: ${{ steps.previous-releases.outputs.storage }} + run-kind: ${{ steps.run-kind.outputs.run-kind }} + release-pr-run-id: ${{ steps.release-pr-run-id.outputs.release-pr-run-id }} + permissions: + contents: read + steps: + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get run kind + id: run-kind + env: + RUN_KIND: >- + ${{ + false + || (inputs.github-event-name == 'push' && github.ref_name == 'main') && 'push-main' + || (inputs.github-event-name == 'push' && github.ref_name == 'release') && 'storage-release' + || (inputs.github-event-name == 'push' && github.ref_name == 'release-compute') && 'compute-release' + || (inputs.github-event-name == 'push' && github.ref_name == 'release-proxy') && 'proxy-release' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release') && 'storage-rc-pr' + || 
(inputs.github-event-name == 'pull_request' && github.base_ref == 'release-compute') && 'compute-rc-pr' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr' + || (inputs.github-event-name == 'pull_request') && 'pr' + || (inputs.github-event-name == 'workflow_dispatch') && 'workflow-dispatch' + || 'unknown' + }} + run: | + echo "run-kind=$RUN_KIND" | tee -a $GITHUB_OUTPUT + + - name: Get build tag + id: build-tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} + CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + RUN_KIND: ${{ steps.run-kind.outputs.run-kind }} + run: | + case $RUN_KIND in + push-main) + echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + storage-release) + echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + proxy-release) + echo "tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + compute-release) + echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr) + BUILD_AND_TEST_RUN_ID=$(gh api --paginate \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=${CURRENT_SHA}&branch=${CURRENT_BRANCH}" \ + | jq '[.workflow_runs[] | select(.name == "Build and Test")][0].id // ("Error: No matching workflow run found." | halt_error(1))') + echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT + ;; + workflow-dispatch) + echo "tag=$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT + ;; + *) + echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!" 
+ exit 1 + esac + + - name: Get the previous release-tags + id: previous-releases + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh api --paginate \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/releases" \ + | jq -f .github/scripts/previous-releases.jq -r \ + | tee -a "${GITHUB_OUTPUT}" + + - name: Get the release PR run ID + id: release-pr-run-id + if: ${{ contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + run: | + RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" | halt_error(1))') + echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index c938f62ad5..2dab665f40 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -11,8 +11,12 @@ on: description: AWS region to log in to. Required when pushing to ECR. required: false type: string - aws-account-ids: - description: Comma separated AWS account IDs to log in to for pushing to ECR. Required when pushing to ECR. + aws-account-id: + description: AWS account ID to log in to for pushing to ECR. Required when pushing to ECR. + required: false + type: string + aws-role-to-assume: + description: AWS role to assume to for pushing to ECR. Required when pushing to ECR. required: false type: string azure-client-id: @@ -31,16 +35,6 @@ on: description: ACR registry name. Required when pushing to ACR. 
required: false type: string - secrets: - docker-hub-username: - description: Docker Hub username. Required when pushing to Docker Hub. - required: false - docker-hub-password: - description: Docker Hub password. Required when pushing to Docker Hub. - required: false - aws-role-to-assume: - description: AWS role to assume. Required when pushing to ECR. - required: false permissions: {} @@ -53,10 +47,11 @@ jobs: runs-on: ubuntu-22.04 permissions: id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR steps: - uses: actions/checkout@v4 with: - sparse-checkout: scripts/push_with_image_map.py + sparse-checkout: .github/scripts/push_with_image_map.py sparse-checkout-cone-mode: false - name: Print image-map @@ -67,14 +62,14 @@ jobs: uses: aws-actions/configure-aws-credentials@v4 with: aws-region: "${{ inputs.aws-region }}" - role-to-assume: "${{ secrets.aws-role-to-assume }}" + role-to-assume: "arn:aws:iam::${{ inputs.aws-account-id }}:role/${{ inputs.aws-role-to-assume }}" role-duration-seconds: 3600 - name: Login to ECR if: contains(inputs.image-map, 'amazonaws.com/') uses: aws-actions/amazon-ecr-login@v2 with: - registries: "${{ inputs.aws-account-ids }}" + registries: "${{ inputs.aws-account-id }}" - name: Configure Azure credentials if: contains(inputs.image-map, 'azurecr.io/') @@ -89,13 +84,21 @@ jobs: run: | az acr login --name=${{ inputs.acr-registry-name }} + - name: Login to GHCR + if: contains(inputs.image-map, 'ghcr.io/') + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Log in to Docker Hub uses: docker/login-action@v3 with: - username: ${{ secrets.docker-hub-username }} - password: ${{ secrets.docker-hub-password }} + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Copy docker images to target registries - run: python scripts/push_with_image_map.py + run: 
python3 .github/scripts/push_with_image_map.py env: IMAGE_MAP: ${{ inputs.image-map }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index b36ac46f35..ff7db02e42 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -140,6 +140,9 @@ jobs: --ignore test_runner/performance/test_logical_replication.py --ignore test_runner/performance/test_physical_replication.py --ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py + --ignore test_runner/performance/test_cumulative_statistics_persistence.py + --ignore test_runner/performance/test_perf_many_relations.py + --ignore test_runner/performance/test_perf_oltp_large_tenant.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -171,6 +174,61 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + cumstats-test: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 17 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + + - name: Download Neon artifact + uses: ./.github/actions/download + 
with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Verify that cumulative statistics are preserved + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_cumulative_statistics_persistence.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 3600 + pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + replication-tests: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: @@ -398,6 +456,9 @@ jobs: runs-on: ${{ matrix.runner }} container: image: ${{ matrix.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init # Increase timeout to 8h, default timeout is 6h diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8f3392ceea..409ad6be3d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -65,38 +65,11 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} filters: .github/file-filters.yaml - tag: + meta: needs: [ check-permissions ] - runs-on: [ self-hosted, small ] - container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned - outputs: - build-tag: ${{steps.build-tag.outputs.tag}} - - steps: - # Need `fetch-depth: 0` to count the number of commits in the branch - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Get build tag - run: | - echo run:$GITHUB_RUN_ID - echo 
ref:$GITHUB_REF_NAME - echo rev:$(git rev-list --count HEAD) - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" - echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT - fi - shell: bash - id: build-tag + uses: ./.github/workflows/_meta.yml + with: + github-event-name: ${{ github.event_name }} build-build-tools-image: needs: [ check-permissions ] @@ -199,7 +172,7 @@ jobs: secrets: inherit build-and-test-locally: - needs: [ tag, build-build-tools-image ] + needs: [ meta, build-build-tools-image ] strategy: fail-fast: false matrix: @@ -213,7 +186,7 @@ jobs: with: arch: ${{ matrix.arch }} build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - build-tag: ${{ needs.tag.outputs.build-tag }} + build-tag: ${{ needs.meta.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds. # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. 
@@ -497,13 +470,24 @@ jobs: }) trigger-e2e-tests: - if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} - needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ] + # Depends on jobs that can get skipped + if: >- + ${{ + ( + !github.event.pull_request.draft + || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') + || needs.meta.outputs.run-kind == 'push-main' + ) && !failure() && !cancelled() + }} + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ] uses: ./.github/workflows/trigger-e2e-tests.yml + with: + github-event-name: ${{ github.event_name }} secrets: inherit neon-image-arch: - needs: [ check-permissions, build-build-tools-image, tag ] + needs: [ check-permissions, build-build-tools-image, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} strategy: matrix: arch: [ x64, arm64 ] @@ -539,7 +523,7 @@ jobs: build-args: | ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm DEBIAN_VERSION=bookworm provenance: false @@ -549,10 +533,11 @@ jobs: cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }} + neondatabase/neon:${{ needs.meta.outputs.build-tag 
}}-bookworm-${{ matrix.arch }} neon-image: - needs: [ neon-image-arch, tag ] + needs: [ neon-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials @@ -567,13 +552,14 @@ jobs: - name: Create multi-arch image run: | - docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 + docker buildx imagetools create -t neondatabase/neon:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm \ + neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-x64 \ + neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64 compute-node-image-arch: - needs: [ check-permissions, build-build-tools-image, tag ] + needs: [ check-permissions, build-build-tools-image, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -631,7 +617,7 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false @@ -641,7 +627,7 @@ jobs: cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && 
format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg >= 'v16' @@ -651,7 +637,7 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false @@ -661,10 +647,11 @@ jobs: target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.meta.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: - needs: [ compute-node-image-arch, tag ] + needs: [ compute-node-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -692,27 +679,28 @@ jobs: - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - 
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image if: matrix.version.pg >= 'v16' run: | - docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - vm-compute-node-image: - needs: [ check-permissions, tag, compute-node-image ] - runs-on: [ self-hosted, large 
] + vm-compute-node-image-arch: + needs: [ check-permissions, meta, compute-node-image ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} strategy: fail-fast: false matrix: + arch: [ amd64, arm64 ] version: - # see the comment for `compute-node-image-arch` job - pg: v14 debian: bullseye - pg: v15 @@ -722,14 +710,14 @@ jobs: - pg: v17 debian: bookworm env: - VM_BUILDER_VERSION: v0.37.1 + VM_BUILDER_VERSION: v0.42.2 steps: - uses: actions/checkout@v4 - name: Downloading vm-builder run: | - curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder + curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder chmod +x vm-builder - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 @@ -742,22 +730,55 @@ jobs: # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ - -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \ + -target-arch=linux/${{ matrix.arch }} - name: Pushing vm-compute-node image run: | - docker push 
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} + + vm-compute-node-image: + needs: [ vm-compute-node-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + runs-on: ubuntu-22.04 + strategy: + matrix: + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + - pg: v15 + - pg: v16 + - pg: v17 + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch compute-node image + run: | + docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \ + neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64 + test-images: - needs: [ check-permissions, tag, neon-image, compute-node-image ] + needs: [ check-permissions, meta, neon-image, compute-node-image ] + # Depends on jobs that can get skipped + if: >- + ${{ + !failure() + && !cancelled() + && contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + }} strategy: fail-fast: false matrix: @@ -775,17 +796,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Get the last compute release tag - id: get-last-compute-release-tag - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - run: | - tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ - -H "Accept: application/vnd.github+json" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - "/repos/${{ github.repository }}/releases") - echo tag=${tag} >> ${GITHUB_OUTPUT} - 
# `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like @@ -795,8 +805,9 @@ jobs: # Ensure that we don't have bad versions. - name: Verify image versions shell: bash # ensure no set -e for better error messages + if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} run: | - pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" @@ -813,7 +824,24 @@ jobs: - name: Verify docker-compose example and test extensions timeout-minutes: 20 env: - TAG: ${{needs.tag.outputs.build-tag}} + TAG: >- + ${{ + needs.meta.outputs.run-kind == 'compute-rc-pr' + && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.build-tag + }} + COMPUTE_TAG: >- + ${{ + contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-compute-release + || needs.meta.outputs.build-tag + }} + TEST_EXTENSIONS_TAG: >- + ${{ + contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && 'latest' + || needs.meta.outputs.build-tag + }} TEST_VERSION_ONLY: ${{ matrix.pg_version }} run: ./docker-compose/docker_compose_test.sh @@ -825,10 +853,17 @@ jobs: - name: Test extension upgrade timeout-minutes: 20 - if: ${{ needs.tag.outputs.build-tag == github.run_id }} + if: ${{ contains(fromJSON('["pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} env: - NEWTAG: ${{ needs.tag.outputs.build-tag }} - OLDTAG: ${{ 
steps.get-last-compute-release-tag.outputs.tag }} + TAG: >- + ${{ + false + || needs.meta.outputs.run-kind == 'pr' && needs.meta.outputs.build-tag + || needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release + }} + TEST_EXTENSIONS_TAG: ${{ needs.meta.outputs.previous-compute-release }} + NEW_COMPUTE_TAG: ${{ needs.meta.outputs.build-tag }} + OLD_COMPUTE_TAG: ${{ needs.meta.outputs.previous-compute-release }} run: ./docker-compose/test_extensions_upgrade.sh - name: Print logs and clean up @@ -838,7 +873,7 @@ jobs: docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down generate-image-maps: - needs: [ tag ] + needs: [ meta ] runs-on: ubuntu-22.04 outputs: neon-dev: ${{ steps.generate.outputs.neon-dev }} @@ -848,14 +883,20 @@ jobs: steps: - uses: actions/checkout@v4 with: - sparse-checkout: scripts/generate_image_maps.py + sparse-checkout: .github/scripts/generate_image_maps.py sparse-checkout-cone-mode: false - name: Generate Image Maps id: generate - run: python scripts/generate_image_maps.py + run: python3 .github/scripts/generate_image_maps.py env: - BUILD_TAG: "${{ needs.tag.outputs.build-tag }}" + SOURCE_TAG: >- + ${{ + contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.release-pr-run-id + || needs.meta.outputs.build-tag + }} + TARGET_TAG: ${{ needs.meta.outputs.build-tag }} BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" @@ -864,88 +905,134 @@ jobs: AWS_REGION: "${{ vars.AWS_ECR_REGION }}" push-neon-image-dev: - needs: [ generate-image-maps, neon-image ] + needs: [ meta, generate-image-maps, neon-image ] + if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: 
./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit push-compute-image-dev: - needs: [ generate-image-maps, vm-compute-node-image ] + needs: [ meta, generate-image-maps, vm-compute-node-image ] + if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: 
inherit push-neon-image-prod: - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ generate-image-maps, neon-image, test-images ] + needs: [ meta, generate-image-maps, neon-image, test-images ] + # Depends on jobs that can get skipped + if: ${{ !failure() && !cancelled() && contains(fromJSON('["storage-release", "proxy-release"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit push-compute-image-prod: - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ generate-image-maps, vm-compute-node-image, test-images ] + needs: [ meta, generate-image-maps, vm-compute-node-image, test-images ] + # Depends on jobs that can get skipped + if: ${{ !failure() && !cancelled() && needs.meta.outputs.run-kind == 'compute-release' }} uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' aws-region: ${{ vars.AWS_ECR_REGION }} 
- aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit - # This is a bit of a special case so we're not using a generated image map. - add-latest-tag-to-neon-extensions-test-image: - if: github.ref_name == 'main' - needs: [ tag, compute-node-image ] + push-neon-test-extensions-image-ghcr: + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + needs: [ meta, compute-node-image ] uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: | { - "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], - "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ + "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}" + ], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [ + "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}" + ] } - secrets: - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit + + add-latest-tag-to-neon-test-extensions-image: + if: ${{ needs.meta.outputs.run-kind == 'push-main' }} + needs: [ meta, compute-node-image ] + 
uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ + "docker.io/neondatabase/neon-test-extensions-v16:latest", + "ghcr.io/neondatabase/neon-test-extensions-v16:latest" + ], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [ + "docker.io/neondatabase/neon-test-extensions-v17:latest", + "ghcr.io/neondatabase/neon-test-extensions-v17:latest" + ] + } + secrets: inherit + + add-release-tag-to-neon-test-extensions-image: + if: ${{ needs.meta.outputs.run-kind == 'compute-release' }} + needs: [ meta, compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.release-pr-run-id }}": [ + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}", + "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}" + ], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.release-pr-run-id }}": [ + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}", + "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}" + ] + } + secrets: inherit trigger-custom-extensions-build-and-wait: - needs: [ check-permissions, tag ] + needs: [ check-permissions, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials @@ -980,7 +1067,7 @@ jobs: \"ci_job_name\": \"build-and-upload-extensions\", \"commit_hash\": \"$COMMIT_SHA\", \"remote_repo\": \"${{ github.repository }}\", - \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", + \"compute_image_tag\": \"${{ needs.meta.outputs.build-tag }}\", \"remote_branch_name\": \"${{ 
github.ref_name }}\" } }" @@ -1024,9 +1111,9 @@ jobs: exit 1 deploy: - needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] - # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` - if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ] + # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` + if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -1037,125 +1124,121 @@ jobs: - uses: actions/checkout@v4 - name: Create git tag and GitHub release - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + if: ${{ contains(fromJSON('["storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) }} uses: actions/github-script@v7 + env: + TAG: "${{ needs.meta.outputs.build-tag }}" + BRANCH: "${{ github.ref_name }}" + PREVIOUS_RELEASE: >- + ${{ + false + || needs.meta.outputs.run-kind == 'storage-release' && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.run-kind == 'proxy-release' && needs.meta.outputs.previous-proxy-release + || needs.meta.outputs.run-kind == 'compute-release' && needs.meta.outputs.previous-compute-release + || 'unknown' + }} with: retries: 5 script: | - const tag = "${{ 
needs.tag.outputs.build-tag }}"; - const branch = "${{ github.ref_name }}"; + const { TAG, BRANCH, PREVIOUS_RELEASE } = process.env try { const existingRef = await github.rest.git.getRef({ owner: context.repo.owner, repo: context.repo.repo, - ref: `tags/${tag}`, + ref: `tags/${TAG}`, }); if (existingRef.data.object.sha !== context.sha) { - throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`); + throw new Error(`Tag ${TAG} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`); } - console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`); + console.log(`Tag ${TAG} already exists and points to ${context.sha} as expected.`); } catch (error) { if (error.status !== 404) { throw error; } - console.log(`Tag ${tag} does not exist. Creating it...`); + console.log(`Tag ${TAG} does not exist. Creating it...`); await github.rest.git.createRef({ owner: context.repo.owner, repo: context.repo.repo, - ref: `refs/tags/${tag}`, + ref: `refs/tags/${TAG}`, sha: context.sha, }); - console.log(`Tag ${tag} created successfully.`); + console.log(`Tag ${TAG} created successfully.`); } try { const existingRelease = await github.rest.repos.getReleaseByTag({ owner: context.repo.owner, repo: context.repo.repo, - tag: tag, + tag: TAG, }); - console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`); + console.log(`Release for tag ${TAG} already exists (ID: ${existingRelease.data.id}).`); } catch (error) { if (error.status !== 404) { throw error; } - console.log(`Release for tag ${tag} does not exist. Creating it...`); + console.log(`Release for tag ${TAG} does not exist. 
Creating it...`); // Find the PR number using the commit SHA const pullRequests = await github.rest.pulls.list({ owner: context.repo.owner, repo: context.repo.repo, state: 'closed', - base: branch, + base: BRANCH, }); const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha); const prNumber = pr ? pr.number : null; - // Find the previous release on the branch - const releases = await github.rest.repos.listReleases({ - owner: context.repo.owner, - repo: context.repo.repo, - per_page: 100, - }); - - const branchReleases = releases.data - .filter((release) => { - const regex = new RegExp(`^${branch}-\\d+$`); - return regex.test(release.tag_name) && !release.draft && !release.prerelease; - }) - .sort((a, b) => new Date(b.created_at) - new Date(a.created_at)); - - const previousTag = branchReleases.length > 0 ? branchReleases[0].tag_name : null; - const releaseNotes = [ prNumber ? `Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.` : 'Release PR not found.', - previousTag - ? 
`Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${previousTag}...${tag}.` - : `No previous release found on branch ${branch}.`, + `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${PREVIOUS_RELEASE}...${TAG}.` ].join('\n\n'); await github.rest.repos.createRelease({ owner: context.repo.owner, repo: context.repo.repo, - tag_name: tag, + tag_name: TAG, body: releaseNotes, }); - console.log(`Release for tag ${tag} created successfully.`); + console.log(`Release for tag ${TAG} created successfully.`); } - name: Trigger deploy workflow env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + RUN_KIND: ${{ needs.meta.outputs.run-kind }} run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + case ${RUN_KIND} in + push-main) + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.meta.outputs.build-tag}} -f deployPreprodRegion=false + ;; + storage-release) gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ -f deployProxy=false \ -f deployStorage=true \ - -f deployStorageBroker=true \ + -f deployStorageBroker=false \ -f deployStorageController=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f dockerTag=${{needs.meta.outputs.build-tag}} \ -f deployPreprodRegion=true gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \ -f deployStorage=true \ - -f deployStorageBroker=true \ + -f deployStorageBroker=false \ -f deployStorageController=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + proxy-release) 
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ @@ -1163,7 +1246,7 @@ jobs: -f deployStorageBroker=false \ -f deployStorageController=false \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f dockerTag=${{needs.meta.outputs.build-tag}} \ -f deployPreprodRegion=true gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \ @@ -1173,13 +1256,16 @@ jobs: -f deployProxyScram=true \ -f deployProxyAuthBroker=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}} - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'" + -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + compute-release) + gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + *) + echo "RUN_KIND (value '${RUN_KIND}') is not set to either 'push-main', 'storage-release', 'proxy-release' or 'compute-release'" exit 1 - fi + ;; + esac notify-storage-release-deploy-failure: needs: [ deploy ] @@ -1195,51 +1281,20 @@ jobs: payload: | channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} text: | - 🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. + 🔴 : deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. 
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: - needs: [ deploy ] + needs: [ meta, deploy ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read - # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` + # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` if: github.ref_name == 'release' && !failure() && !cancelled() runs-on: ubuntu-22.04 steps: - - name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR - id: fetch-last-release-pr-info - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - branch_name_and_pr_number=$(gh pr list \ - --repo "${GITHUB_REPOSITORY}" \ - --base release \ - --state merged \ - --limit 10 \ - --json mergeCommit,headRefName,number \ - --jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }") - branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name') - pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number') - - run_id=$(gh run list \ - --repo "${GITHUB_REPOSITORY}" \ - --workflow build_and_test.yml \ - --branch "${branch_name}" \ - --json databaseId \ - --limit 1 \ - --jq '.[].databaseId') - - last_commit_sha=$(gh pr view "${pr_number}" \ - --repo "${GITHUB_REPOSITORY}" \ - --json commits \ - --jq '.commits[-1].oid') - - echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT} - echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT} - - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 @@ -1250,8 +1305,8 @@ jobs: env: BUCKET: neon-github-public-dev AWS_REGION: eu-central-1 - COMMIT_SHA: ${{ 
steps.fetch-last-release-pr-info.outputs.commit-sha }} - RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }} + COMMIT_SHA: ${{ github.sha }} + RUN_ID: ${{ needs.meta.outputs.release-pr-run-id }} run: | old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}" new_prefix="artifacts/latest" @@ -1294,7 +1349,8 @@ jobs: pin-build-tools-image: needs: [ build-build-tools-image, test-images, build-and-test-locally ] - if: github.ref_name == 'main' + # `!failure() && !cancelled()` is required because the job (transitively) depends on jobs that can be skipped + if: github.ref_name == 'main' && !failure() && !cancelled() uses: ./.github/workflows/pin-build-tools-image.yml with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} @@ -1313,6 +1369,7 @@ jobs: # Format `needs` differently to make the list more readable. # Usually we do `needs: [...]` needs: + - meta - build-and-test-locally - check-codestyle-python - check-codestyle-rust @@ -1336,7 +1393,7 @@ jobs: || needs.check-codestyle-python.result == 'skipped' || needs.check-codestyle-rust.result == 'skipped' || needs.files-changed.result == 'skipped' - || needs.push-compute-image-dev.result == 'skipped' - || needs.push-neon-image-dev.result == 'skipped' - || needs.test-images.result == 'skipped' - || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' + || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.test-images.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr", 
"compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) diff --git a/.github/workflows/cargo-deny.yml b/.github/workflows/cargo-deny.yml index 433b377c32..222f7e9787 100644 --- a/.github/workflows/cargo-deny.yml +++ b/.github/workflows/cargo-deny.yml @@ -7,7 +7,7 @@ on: required: false type: string schedule: - - cron: '0 0 * * *' + - cron: '0 10 * * *' jobs: cargo-deny: @@ -50,8 +50,9 @@ jobs: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} payload: | - channel: ${{ vars.SLACK_CICD_CHANNEL_ID }} + channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }} text: | Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> - Pinging @oncall-devprod. + Fixing the problem should be fairly straight forward from the logs. If not, <#${{ vars.SLACK_RUST_CHANNEL_ID }}> is there to help. + Pinging . diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 09d6acd325..606e1c0862 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -38,6 +38,9 @@ jobs: runs-on: us-east-2 container: image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: diff --git a/.github/workflows/fast-forward.yml b/.github/workflows/fast-forward.yml new file mode 100644 index 0000000000..bc63ff120d --- /dev/null +++ b/.github/workflows/fast-forward.yml @@ -0,0 +1,36 @@ +name: Fast forward merge +on: + pull_request: + types: [labeled] + branches: + - release + - release-proxy + - release-compute + +jobs: + fast-forward: + if: ${{ github.event.label.name == 'fast-forward' }} + runs-on: ubuntu-22.04 + + steps: + - name: Remove fast-forward label to PR + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + gh pr edit ${{ github.event.pull_request.number }} --repo 
"${GITHUB_REPOSITORY}" --remove-label "fast-forward" + + - name: Fast forwarding + uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979 + # See https://docs.github.com/en/graphql/reference/enums#mergestatestatus + if: ${{ github.event.pull_request.mergeable_state == 'clean' }} + with: + merge: true + comment: on-error + github_token: ${{ secrets.CI_ACCESS_TOKEN }} + + - name: Comment if mergeable_state is not clean + if: ${{ github.event.pull_request.mergeable_state != 'clean' }} + run: | + gh pr comment ${{ github.event.pull_request.number }} \ + --repo "${GITHUB_REPOSITORY}" \ + --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`." diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml index 71c5158ef6..f2376306dc 100644 --- a/.github/workflows/force-test-extensions-upgrade.yml +++ b/.github/workflows/force-test-extensions-upgrade.yml @@ -52,8 +52,9 @@ jobs: - name: Test extension upgrade timeout-minutes: 20 env: - NEWTAG: latest - OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + NEW_COMPUTE_TAG: latest + OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} PG_VERSION: ${{ matrix.pg-version }} FORCE_ALL_UPGRADE_TESTS: true run: ./docker-compose/test_extensions_upgrade.sh diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml new file mode 100644 index 0000000000..a7c3118e34 --- /dev/null +++ b/.github/workflows/large_oltp_benchmark.yml @@ -0,0 +1,186 @@ +name: large oltp benchmark + +on: + # uncomment to run on push for debugging your PR + #push: + # branches: [ bodobolero/synthetic_oltp_workload ] + + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── 
hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 15 * * 0,2,4' # run on Sunday, Tuesday, Thursday at 3 PM UTC + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow globally because we need dedicated resources which only exist once + group: large-oltp-bench-workflow + cancel-in-progress: false + +jobs: + oltp: + strategy: + fail-fast: false # allow other variants to continue even if one fails + matrix: + include: + - target: new_branch + custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 + - target: reuse_branch + custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 + max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h + TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }} + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + PG_VERSION: 16 # pre-determined by pre-determined project + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + PLATFORM: ${{ matrix.target }} + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + # Increase timeout to 2 days, default timeout is 6h - database maintenance can take a long time + # (normally 1h pgbench, 3h vacuum analyze 3.5h re-index) x 2 = 15h, leave some buffer 
for regressions + # in one run vacuum didn't finish within 12 hours + timeout-minutes: 2880 + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials # necessary to download artefacts + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Create Neon Branch for large tenant + if: ${{ matrix.target == 'new_branch' }} + id: create-neon-branch-oltp-target + uses: ./.github/actions/neon-branch-create + with: + project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${{ matrix.target }}" in + new_branch) + CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }} + ;; + reuse_branch) + CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }} + ;; + *) + echo >&2 "Unknown target=${{ matrix.target }}" + exit 1 + ;; + esac + + CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}" + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT + + - name: Delete rows from prior runs in reuse branch + if: ${{ matrix.target == 'reuse_branch' }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }} + PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config + PSQL: /tmp/neon/pg_install/v16/bin/psql + PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib + run: | + echo "$(date '+%Y-%m-%d %H:%M:%S') - Deleting rows in table webhook.incoming_webhooks from prior runs" + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + ${PSQL} "${BENCHMARK_CONNSTR}" -c "SET 
statement_timeout = 0; DELETE FROM webhook.incoming_webhooks WHERE created_at > '2025-02-27 23:59:59+00';" + echo "$(date '+%Y-%m-%d %H:%M:%S') - Finished deleting rows in table webhook.incoming_webhooks from prior runs" + + - name: Benchmark pgbench with custom-scripts + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_pgbench + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Benchmark database maintenance + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + extra_params: -m remote_cluster --timeout 172800 -k test_perf_oltp_large_tenant_maintenance + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Delete Neon Branch for large tenant + if: ${{ always() && matrix.target == 'new_branch' }} + uses: ./.github/actions/neon-branch-delete + with: + project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} + branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Configure AWS credentials # again because prior steps could have exceeded 5 hours + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ 
vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + + - name: Create Allure report + id: create-allure-report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + with: + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Periodic large oltp perf testing: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/lint-release-pr.yml b/.github/workflows/lint-release-pr.yml new file mode 100644 index 0000000000..b7d010f66d --- /dev/null +++ b/.github/workflows/lint-release-pr.yml @@ -0,0 +1,24 @@ +name: Lint Release PR + +on: + pull_request: + branches: + - release + - release-proxy + - release-compute + +jobs: + lint-release-pr: + runs-on: ubuntu-22.04 + steps: + - name: Checkout PR branch + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch full history for git operations + ref: ${{ github.event.pull_request.head.ref }} + + - name: Run lint script + env: + RELEASE_BRANCH: ${{ github.base_ref }} + run: | + ./.github/scripts/lint-release-pr.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index f077e04d1c..90318747b3 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -71,7 +71,7 @@ jobs: uses: ./.github/workflows/build-macos.yml with: pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} - rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }} + rebuild_rust_code: ${{ fromJson(needs.files-changed.outputs.rebuild_rust_code) }} rebuild_everything: ${{ 
fromJson(needs.files-changed.outputs.rebuild_everything) }} gather-rust-build-stats: diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index af877029e4..f854bf3212 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central on: schedule: # * is a special character in YAML so you have to quote this string - # ┌───────────── minute (0 - 59) - # │ ┌───────────── hour (0 - 23) - # │ │ ┌───────────── day of the month (1 - 31) - # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) - # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 18 * * *' # Runs at 6 PM UTC every day + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 */3 * * *' # Runs every 3 hours workflow_dispatch: # Allows manual triggering of the workflow inputs: commit_hash: @@ -78,8 +78,10 @@ jobs: run: | if [ -z "$INPUT_COMMIT_HASH" ]; then echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV else echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV fi - name: Start Bench with run_id @@ -89,7 +91,7 @@ jobs: -H 'accept: application/json' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer $API_KEY" \ - -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}" + -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}" - name: Poll Test Status id: poll_step diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index b305b662ee..d2588ba0bf 100644 --- 
a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -65,6 +65,7 @@ jobs: permissions: id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR uses: ./.github/workflows/_push-to-container-registry.yml with: @@ -72,12 +73,15 @@ jobs: { "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ "docker.io/neondatabase/build-tools:pinned-bullseye", + "ghcr.io/neondatabase/build-tools:pinned-bullseye", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" ], "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ "docker.io/neondatabase/build-tools:pinned-bookworm", "docker.io/neondatabase/build-tools:pinned", + "ghcr.io/neondatabase/build-tools:pinned-bookworm", + "ghcr.io/neondatabase/build-tools:pinned", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm", @@ -85,12 +89,10 @@ jobs: ] } aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit diff --git a/.github/workflows/pre-merge-checks.yml 
b/.github/workflows/pre-merge-checks.yml index c47b3fe0de..3bd81f6538 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -8,8 +8,6 @@ on: - .github/workflows/build-build-tools-image.yml - .github/workflows/pre-merge-checks.yml merge_group: - branches: - - main defaults: run: @@ -19,15 +17,17 @@ defaults: permissions: {} jobs: - get-changed-files: + meta: runs-on: ubuntu-22.04 outputs: python-changed: ${{ steps.python-src.outputs.any_changed }} rust-changed: ${{ steps.rust-src.outputs.any_changed }} + branch: ${{ steps.group-metadata.outputs.branch }} + pr-number: ${{ steps.group-metadata.outputs.pr-number }} steps: - uses: actions/checkout@v4 - - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 id: python-src with: files: | @@ -38,7 +38,7 @@ jobs: poetry.lock pyproject.toml - - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 id: rust-src with: files: | @@ -58,12 +58,20 @@ jobs: echo "${PYTHON_CHANGED_FILES}" echo "${RUST_CHANGED_FILES}" + - name: Merge group metadata + if: ${{ github.event_name == 'merge_group' }} + id: group-metadata + env: + MERGE_QUEUE_REF: ${{ github.event.merge_group.head_ref }} + run: | + echo $MERGE_QUEUE_REF | jq -Rr 'capture("refs/heads/gh-readonly-queue/(?<branch>.*)/pr-(?<pr_number>[0-9]+)-[0-9a-f]{40}") | ["branch=" + .branch, "pr-number=" + .pr_number] | .[]' | tee -a "${GITHUB_OUTPUT}" + build-build-tools-image: if: | false - || needs.get-changed-files.outputs.python-changed == 'true' - || needs.get-changed-files.outputs.rust-changed == 'true' - needs: [ get-changed-files ] + || needs.meta.outputs.python-changed == 'true' + || needs.meta.outputs.rust-changed == 'true' + needs: [ meta ] uses: ./.github/workflows/build-build-tools-image.yml with: # Build only one 
combination to save time @@ -72,8 +80,8 @@ jobs: secrets: inherit check-codestyle-python: - if: needs.get-changed-files.outputs.python-changed == 'true' - needs: [ get-changed-files, build-build-tools-image ] + if: needs.meta.outputs.python-changed == 'true' + needs: [ meta, build-build-tools-image ] uses: ./.github/workflows/_check-codestyle-python.yml with: # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` @@ -81,8 +89,8 @@ jobs: secrets: inherit check-codestyle-rust: - if: needs.get-changed-files.outputs.rust-changed == 'true' - needs: [ get-changed-files, build-build-tools-image ] + if: needs.meta.outputs.rust-changed == 'true' + needs: [ meta, build-build-tools-image ] uses: ./.github/workflows/_check-codestyle-rust.yml with: # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` @@ -101,7 +109,7 @@ jobs: statuses: write # for `github.repos.createCommitStatus(...)` contents: write needs: - - get-changed-files + - meta - check-codestyle-python - check-codestyle-rust runs-on: ubuntu-22.04 @@ -129,7 +137,20 @@ jobs: run: exit 1 if: | false - || (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') - || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') + || (github.event_name == 'merge_group' && needs.meta.outputs.branch != 'main') + || (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.python-changed == 'true') + || (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.rust-changed == 'true') || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') + + - name: Add fast-forward label to PR to trigger fast-forward merge + if: >- + ${{ + always() + && github.event_name == 'merge_group' + && contains(fromJson('["release", "release-proxy", "release-compute"]'), needs.meta.outputs.branch) + }} + env: + GH_TOKEN: ${{ 
secrets.CI_ACCESS_TOKEN }} + run: >- + gh pr edit ${{ needs.meta.outputs.pr-number }} --repo "${GITHUB_REPOSITORY}" --add-label "fast-forward" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 919846ce44..a88ddecd0a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -38,7 +38,7 @@ jobs: uses: ./.github/workflows/_create-release-pr.yml with: component-name: 'Storage' - release-branch: 'release' + source-branch: ${{ github.ref_name }} secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} @@ -51,7 +51,7 @@ jobs: uses: ./.github/workflows/_create-release-pr.yml with: component-name: 'Proxy' - release-branch: 'release-proxy' + source-branch: ${{ github.ref_name }} secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} @@ -64,6 +64,6 @@ jobs: uses: ./.github/workflows/_create-release-pr.yml with: component-name: 'Compute' - release-branch: 'release-compute' + source-branch: ${{ github.ref_name }} secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index be6a7a7901..a30da35502 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -5,6 +5,10 @@ on: types: - ready_for_review workflow_call: + inputs: + github-event-name: + type: string + required: true defaults: run: @@ -19,7 +23,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: - github-event-name: ${{ github.event_name }} + github-event-name: ${{ inputs.github-event-name || github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] @@ -35,46 +39,29 @@ jobs: run cancel-previous-in-concurrency-group.yml \ --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" - tag: - needs: [ check-permissions ] - runs-on: ubuntu-22.04 - outputs: - build-tag: ${{ steps.build-tag.outputs.tag }} - - steps: - # Need 
`fetch-depth: 0` to count the number of commits in the branch - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Get build tag - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} - CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') - echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT - fi - id: build-tag + meta: + uses: ./.github/workflows/_meta.yml + with: + github-event-name: ${{ inputs.github-event-name || github.event_name }} trigger-e2e-tests: - needs: [ tag ] + needs: [ meta ] runs-on: ubuntu-22.04 env: EVENT_ACTION: ${{ github.event.action }} GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - TAG: ${{ needs.tag.outputs.build-tag }} + TAG: >- + ${{ + contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.build-tag + }} + COMPUTE_TAG: >- + ${{ + contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-compute-release + || needs.meta.outputs.build-tag + }} steps: - name: Wait for `push-{neon,compute}-image-dev` job to finish # It's 
important to have a timeout here, the script in the step can run infinitely @@ -157,6 +144,6 @@ jobs: --raw-field "commit_hash=$COMMIT_SHA" \ --raw-field "remote_repo=${GITHUB_REPOSITORY}" \ --raw-field "storage_image_tag=${TAG}" \ - --raw-field "compute_image_tag=${TAG}" \ + --raw-field "compute_image_tag=${COMPUTE_TAG}" \ --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \ --raw-field "e2e-platforms=${E2E_PLATFORMS}" diff --git a/CODEOWNERS b/CODEOWNERS index 71b5e65f94..2a112d9728 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,8 +1,8 @@ # Autoscaling /libs/vm_monitor/ @neondatabase/autoscaling -# DevProd -/.github/ @neondatabase/developer-productivity +# DevProd & PerfCorr +/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness # Compute /pgxn/ @neondatabase/compute diff --git a/Cargo.lock b/Cargo.lock index 47552174d2..39ce785a4e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -191,7 +191,7 @@ checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "synstructure", ] @@ -203,7 +203,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -272,7 +272,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -283,7 +283,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -783,6 +783,28 @@ dependencies = [ "tracing", ] +[[package]] +name = "axum-extra" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b" +dependencies = [ + "axum", + "axum-core", + "bytes", + "futures-util", + 
"headers", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "mime", + "pin-project-lite", + "serde", + "tower 0.5.2", + "tower-layer", + "tower-service", +] + [[package]] name = "azure_core" version = "0.21.0" @@ -925,9 +947,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" [[package]] name = "base64" -version = "0.21.1" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "base64" @@ -984,9 +1006,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.70.1" +version = "0.71.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" dependencies = [ "bitflags 2.8.0", "cexpr", @@ -997,9 +1019,9 @@ dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash", + "rustc-hash 2.1.1", "shlex", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1105,9 +1127,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.30" +version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" +checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" dependencies = [ "jobserver", "libc", @@ -1226,7 +1248,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1287,6 +1309,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", + "indexmap 2.0.1", "jsonwebtoken", "regex", "remote_storage", @@ -1305,6 +1328,7 @@ dependencies = [ "aws-sdk-s3", "aws-smithy-types", "axum", + "axum-extra", "base64 0.13.1", "bytes", "camino", @@ 
-1316,6 +1340,8 @@ dependencies = [ "flate2", "futures", "http 1.1.0", + "indexmap 2.0.1", + "jsonwebtoken", "metrics", "nix 0.27.1", "notify", @@ -1323,17 +1349,20 @@ dependencies = [ "once_cell", "opentelemetry", "opentelemetry_sdk", + "p256 0.13.2", "postgres", "postgres_initdb", "regex", "remote_storage", "reqwest", + "ring", "rlimit", "rust-ini", "serde", "serde_json", "serde_with", "signal-hook", + "spki 0.7.3", "tar", "thiserror 1.0.69", "tokio", @@ -1342,7 +1371,9 @@ dependencies = [ "tokio-util", "tower 0.5.2", "tower-http", + "tower-otel", "tracing", + "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", @@ -1351,6 +1382,7 @@ dependencies = [ "vm_monitor", "walkdir", "workspace_hack", + "x509-cert", "zstd", ] @@ -1677,7 +1709,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1701,7 +1733,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.10.0", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1712,7 +1744,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1775,6 +1807,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" dependencies = [ "const-oid", + "der_derive", + "flagset", "pem-rfc7468", "zeroize", ] @@ -1793,6 +1827,17 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "der_derive" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "deranged" version = "0.3.11" @@ -1862,7 +1907,7 @@ dependencies = [ "dsl_auto_type", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] 
[[package]] @@ -1882,7 +1927,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1911,7 +1956,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1934,7 +1979,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2079,7 +2124,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2089,28 +2134,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" dependencies = [ "log", -] - -[[package]] -name = "env_logger" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" -dependencies = [ - "humantime", - "is-terminal", - "log", "regex", - "termcolor", ] [[package]] name = "env_logger" -version = "0.11.2" +version = "0.11.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" +checksum = "c3716d7a920fb4fac5d84e9d4bce8ceb321e9414b4409da61b07b75c1e3d0697" dependencies = [ + "anstream", + "anstyle", "env_filter", + "jiff", "log", ] @@ -2131,7 +2167,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2265,6 +2301,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "flagset" +version = "0.4.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ea1ec5f8307826a5b71094dd91fc04d4ae75d5709b20ad351c7fb4815c86ec" + [[package]] name = "flate2" version = "1.0.26" @@ -2295,7 +2337,7 @@ name = "framed-websockets" version = "0.1.0" source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "bytemuck", "bytes", "futures-core", @@ -2391,7 +2433,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2408,9 +2450,9 @@ checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" -version = "3.0.2" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" @@ -2504,7 +2546,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2513,6 +2555,27 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "governor" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0" +dependencies = [ + "cfg-if", + "dashmap 6.1.0", + "futures-sink", + "futures-timer", + "futures-util", + "no-std-compat", + "nonzero_ext", + "parking_lot 0.12.1", + "portable-atomic", + "quanta", + "rand 0.8.5", + "smallvec", + "spinning_top", +] + [[package]] name = "group" version = "0.12.1" @@ -2630,7 +2693,7 @@ version = "7.5.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "byteorder", "crossbeam-channel", "flate2", @@ -2638,6 +2701,30 @@ dependencies = [ "num-traits", ] +[[package]] +name = "headers" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9" +dependencies = [ + "base64 0.21.7", + "bytes", + "headers-core", + "http 1.1.0", + "httpdate", + "mime", + "sha1", +] + +[[package]] +name = "headers-core" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4" +dependencies = [ + "http 1.1.0", +] + [[package]] name = "heck" version = "0.5.0" @@ -2775,12 +2862,11 @@ name = "http-utils" version = "0.1.0" dependencies = [ "anyhow", - "backtrace", "bytes", + "camino", "fail", - "flate2", + "futures", "hyper 0.14.30", - "inferno 0.12.0", "itertools 0.10.5", "jemalloc_pprof", "metrics", @@ -2788,11 +2874,13 @@ dependencies = [ "pprof", "regex", "routerify", + "rustls-pemfile 2.1.1", "serde", "serde_json", "serde_path_to_error", "thiserror 1.0.69", "tokio", + "tokio-rustls 0.26.0", "tokio-stream", "tokio-util", "tracing", @@ -2816,9 +2904,9 @@ checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" [[package]] name = "humantime" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" [[package]] name = "humantime-serde" @@ -3078,7 +3166,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -3171,7 
+3259,7 @@ dependencies = [ "crossbeam-channel", "crossbeam-utils", "dashmap 6.1.0", - "env_logger 0.11.2", + "env_logger", "indexmap 2.0.1", "itoa", "log", @@ -3184,11 +3272,11 @@ dependencies = [ [[package]] name = "inotify" -version = "0.11.0" +version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" dependencies = [ - "bitflags 2.8.0", + "bitflags 1.3.2", "inotify-sys", "libc", ] @@ -3279,9 +3367,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jemalloc_pprof" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb" +checksum = "5622af6d21ff86ed7797ef98e11b8f302da25ec69a7db9f6cde8e2e1c8df9992" dependencies = [ "anyhow", "libc", @@ -3294,6 +3382,30 @@ dependencies = [ "tracing", ] +[[package]] +name = "jiff" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d699bc6dfc879fb1bf9bdff0d4c56f0884fc6f0d0eb0fba397a6d00cd9a6b85e" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde", +] + +[[package]] +name = "jiff-static" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d16e75759ee0aa64c57a56acbf43916987b20c77373cb7e808979e02b93c9f9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -3365,7 +3477,7 @@ version = "9.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "js-sys", "pem", "ring", @@ -3465,9 +3577,9 @@ dependencies = [ [[package]] name = "log" 
-version = "0.4.20" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" [[package]] name = "lru" @@ -3480,9 +3592,9 @@ dependencies = [ [[package]] name = "mappings" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e" +checksum = "e434981a332777c2b3062652d16a55f8e74fa78e6b1882633f0d77399c84fc2a" dependencies = [ "anyhow", "libc", @@ -3535,7 +3647,7 @@ dependencies = [ "measured-derive", "memchr", "parking_lot 0.12.1", - "rustc-hash", + "rustc-hash 1.1.0", "ryu", ] @@ -3548,7 +3660,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -3662,6 +3774,18 @@ dependencies = [ "adler2", ] +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "log", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.48.0", +] + [[package]] name = "mio" version = "1.0.3" @@ -3669,7 +3793,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", - "log", "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] @@ -3723,6 +3846,12 @@ dependencies = [ "memoffset 0.9.0", ] +[[package]] +name = "no-std-compat" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c" + [[package]] name = "nom" version = "7.1.3" @@ -3734,30 +3863,30 @@ dependencies = [ ] [[package]] -name = "notify" -version = "8.0.0" +name = "nonzero_ext" +version = 
"0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fee8403b3d66ac7b26aee6e40a897d85dc5ce26f44da36b8b73e987cc52e943" +checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" + +[[package]] +name = "notify" +version = "6.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" dependencies = [ "bitflags 2.8.0", + "crossbeam-channel", "filetime", "fsevent-sys", "inotify", "kqueue", "libc", "log", - "mio", - "notify-types", + "mio 0.8.11", "walkdir", - "windows-sys 0.59.0", + "windows-sys 0.48.0", ] -[[package]] -name = "notify-types" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d" - [[package]] name = "ntapi" version = "0.4.1" @@ -3982,7 +4111,7 @@ dependencies = [ "opentelemetry-http", "opentelemetry-proto", "opentelemetry_sdk", - "prost", + "prost 0.13.3", "reqwest", "thiserror 1.0.69", ] @@ -3995,7 +4124,7 @@ checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6" dependencies = [ "opentelemetry", "opentelemetry_sdk", - "prost", + "prost 0.13.3", "tonic", ] @@ -4109,6 +4238,7 @@ dependencies = [ "pageserver_api", "pageserver_client", "rand 0.8.5", + "reqwest", "serde", "serde_json", "tokio", @@ -4198,6 +4328,7 @@ dependencies = [ "remote_storage", "reqwest", "rpds", + "rustls 0.23.18", "scopeguard", "send-future", "serde", @@ -4216,13 +4347,16 @@ dependencies = [ "tokio-epoll-uring", "tokio-io-timeout", "tokio-postgres", + "tokio-rustls 0.26.0", "tokio-stream", "tokio-tar", "tokio-util", "toml_edit", "tracing", + "tracing-utils", "url", "utils", + "uuid", "wal_decoder", "walkdir", "workspace_hack", @@ -4305,9 +4439,9 @@ dependencies = [ [[package]] name = "papaya" -version = "0.1.8" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c" +checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd" dependencies = [ "equivalent", "seize", @@ -4397,7 +4531,7 @@ dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -4435,7 +4569,7 @@ version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "serde", ] @@ -4484,22 +4618,22 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.0" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead" +checksum = "dfe2e71e1471fe07709406bf725f710b02927c9c54b2b5b2ec0e8087d97c327d" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.0" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" +checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -4589,6 +4723,21 @@ dependencies = [ "never-say-never", ] +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "postgres" version = "0.19.7" @@ -4696,7 +4845,7 @@ dependencies = [ "bytes", "crc32c", "criterion", - "env_logger 0.10.2", + 
"env_logger", "log", "memoffset 0.9.0", "once_cell", @@ -4743,8 +4892,10 @@ dependencies = [ "nix 0.26.4", "once_cell", "parking_lot 0.12.1", - "protobuf", - "protobuf-codegen-pure", + "prost 0.12.6", + "prost-build 0.12.6", + "prost-derive 0.12.6", + "sha2", "smallvec", "symbolic-demangle", "tempfile", @@ -4753,15 +4904,17 @@ dependencies = [ [[package]] name = "pprof_util" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781" +checksum = "9fa015c78eed2130951e22c58d2095849391e73817ab2e74f71b0b9f63dd8416" dependencies = [ "anyhow", + "backtrace", "flate2", + "inferno 0.12.0", "num", "paste", - "prost", + "prost 0.13.3", ] [[package]] @@ -4791,7 +4944,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -4805,9 +4958,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.92" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" dependencies = [ "unicode-ident", ] @@ -4854,6 +5007,16 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "prost" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +dependencies = [ + "bytes", + "prost-derive 0.12.6", +] + [[package]] name = "prost" version = "0.13.3" @@ -4861,7 +5024,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.13.3", +] + +[[package]] 
+name = "prost-build" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" +dependencies = [ + "bytes", + "heck", + "itertools 0.12.1", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost 0.12.6", + "prost-types 0.12.6", + "regex", + "syn 2.0.100", + "tempfile", ] [[package]] @@ -4878,13 +5062,26 @@ dependencies = [ "once_cell", "petgraph", "prettyplease", - "prost", - "prost-types", + "prost 0.13.3", + "prost-types 0.13.3", "regex", - "syn 2.0.90", + "syn 2.0.100", "tempfile", ] +[[package]] +name = "prost-derive" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +dependencies = [ + "anyhow", + "itertools 0.12.1", + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "prost-derive" version = "0.13.3" @@ -4895,7 +5092,16 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", +] + +[[package]] +name = "prost-types" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +dependencies = [ + "prost 0.12.6", ] [[package]] @@ -4904,32 +5110,7 @@ version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" dependencies = [ - "prost", -] - -[[package]] -name = "protobuf" -version = "2.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" - -[[package]] -name = "protobuf-codegen" -version = "2.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6" -dependencies = [ - 
"protobuf", -] - -[[package]] -name = "protobuf-codegen-pure" -version = "2.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a29399fc94bcd3eeaa951c715f7bea69409b2445356b00519740bcd6ddd865" -dependencies = [ - "protobuf", - "protobuf-codegen", + "prost 0.13.3", ] [[package]] @@ -4958,7 +5139,7 @@ dependencies = [ "consumption_metrics", "ecdsa 0.16.9", "ed25519-dalek", - "env_logger 0.10.2", + "env_logger", "fallible-iterator", "flate2", "framed-websockets", @@ -5010,7 +5191,7 @@ dependencies = [ "reqwest-tracing", "rsa", "rstest", - "rustc-hash", + "rustc-hash 1.1.0", "rustls 0.23.18", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", @@ -5050,6 +5231,21 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "quanta" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi 0.11.0+wasi-snapshot-preview1", + "web-sys", + "winapi", +] + [[package]] name = "quick-xml" version = "0.26.0" @@ -5080,9 +5276,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.37" +version = "1.0.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" dependencies = [ "proc-macro2", ] @@ -5180,6 +5376,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "raw-cpuid" +version = "11.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "rayon" version = "1.7.0" @@ -5514,16 +5719,16 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.6" +version = "0.17.13" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "684d5e6e18f669ccebf64a92236bb7db9a34f07be010e3627368182027180866" +checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee" dependencies = [ "cc", + "cfg-if", "getrandom 0.2.11", "libc", - "spin", "untrusted", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -5602,7 +5807,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.90", + "syn 2.0.100", "unicode-ident", ] @@ -5628,6 +5833,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustc_version" version = "0.4.0" @@ -5744,7 +5955,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", ] [[package]] @@ -5753,15 +5964,15 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "rustls-pki-types", ] [[package]] name = "rustls-pki-types" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" [[package]] name = "rustls-webpki" @@ -5811,7 +6022,7 @@ dependencies = [ "crc32c", "criterion", "desim", - "env_logger 0.10.2", + "env_logger", "fail", "futures", "hex", @@ -5833,6 +6044,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", + 
"rustls 0.23.18", "safekeeper_api", "safekeeper_client", "scopeguard", @@ -5849,6 +6061,7 @@ dependencies = [ "tokio", "tokio-io-timeout", "tokio-postgres", + "tokio-rustls 0.26.0", "tokio-stream", "tokio-tar", "tokio-util", @@ -5992,9 +6205,9 @@ dependencies = [ [[package]] name = "seize" -version = "0.4.9" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93" +checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7" dependencies = [ "libc", "windows-sys 0.52.0", @@ -6142,7 +6355,7 @@ checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6224,7 +6437,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6239,9 +6452,9 @@ dependencies = [ [[package]] name = "sha1" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", "cpufeatures", @@ -6387,6 +6600,15 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "spinning_top" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300" +dependencies = [ + "lock_api", +] + [[package]] name = "spki" version = "0.6.0" @@ -6438,7 +6660,7 @@ dependencies = [ "metrics", "once_cell", "parking_lot 0.12.1", - "prost", + "prost 0.13.3", "rustls 0.23.18", "tokio", "tonic", @@ -6456,6 +6678,7 @@ dependencies = [ "bytes", "chrono", "clap", + "clashmap", "control_plane", "cron", 
"diesel", @@ -6463,6 +6686,7 @@ dependencies = [ "diesel_migrations", "fail", "futures", + "governor", "hex", "http-utils", "humantime", @@ -6615,7 +6839,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6666,9 +6890,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.90" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", @@ -6698,7 +6922,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6749,15 +6973,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "termcolor" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" -dependencies = [ - "winapi-util", -] - [[package]] name = "test-context" version = "0.3.0" @@ -6776,7 +6991,7 @@ checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6805,7 +7020,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6816,7 +7031,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6947,6 +7162,27 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tls_codec" +version = "0.4.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de2e01245e2bb89d6f05801c564fa27624dbd7b1846859876c7dad82e90bf6b" +dependencies = [ + "tls_codec_derive", + "zeroize", +] + +[[package]] +name = "tls_codec_derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2e76690929402faae40aebdda620a2c0e25dd6d3b9afe48867dfd95991f4bd" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "tokio" version = "1.43.0" @@ -6956,7 +7192,7 @@ dependencies = [ "backtrace", "bytes", "libc", - "mio", + "mio 1.0.3", "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", @@ -6999,7 +7235,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -7209,7 +7445,7 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "prost", + "prost 0.13.3", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "tokio", @@ -7229,10 +7465,10 @@ checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" dependencies = [ "prettyplease", "proc-macro2", - "prost-build", - "prost-types", + "prost-build 0.13.3", + "prost-types 0.13.3", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -7277,10 +7513,12 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" dependencies = [ + "base64 0.22.1", "bitflags 2.8.0", "bytes", "http 1.1.0", "http-body 1.0.0", + "mime", "pin-project-lite", "tower-layer", "tower-service", @@ -7294,6 +7532,20 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" +[[package]] +name = "tower-otel" +version = "0.2.0" +source = 
"git+https://github.com/mattiapenati/tower-otel?rev=56a7321053bcb72443888257b622ba0d43a11fcd#56a7321053bcb72443888257b622ba0d43a11fcd" +dependencies = [ + "http 1.1.0", + "opentelemetry", + "pin-project", + "tower-layer", + "tower-service", + "tracing", + "tracing-opentelemetry", +] + [[package]] name = "tower-service" version = "0.3.3" @@ -7331,7 +7583,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -7620,7 +7872,6 @@ dependencies = [ "anyhow", "arc-swap", "async-compression", - "backtrace", "bincode", "byteorder", "bytes", @@ -7663,6 +7914,7 @@ dependencies = [ "tracing", "tracing-error", "tracing-subscriber", + "tracing-utils", "walkdir", ] @@ -7726,7 +7978,7 @@ dependencies = [ "anyhow", "camino-tempfile", "clap", - "env_logger 0.10.2", + "env_logger", "log", "postgres", "postgres_ffi", @@ -7748,7 +8000,7 @@ dependencies = [ "pageserver_api", "postgres_ffi", "pprof", - "prost", + "prost 0.13.3", "remote_storage", "serde", "serde_json", @@ -7831,7 +8083,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "wasm-bindgen-shared", ] @@ -7865,7 +8117,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -8172,9 +8424,10 @@ name = "workspace_hack" version = "0.1.0" dependencies = [ "ahash", + "anstream", "anyhow", "base64 0.13.1", - "base64 0.21.1", + "base64 0.21.7", "base64ct", "bytes", "camino", @@ -8182,12 +8435,17 @@ dependencies = [ "chrono", "clap", "clap_builder", + "const-oid", "crypto-bigint 0.5.5", "der 0.7.8", "deranged", "digest", "displaydoc", + "ecdsa 0.16.9", "either", + "elliptic-curve 0.13.8", + "env_filter", + "env_logger", "fail", "form_urlencoded", "futures-channel", @@ -8220,10 +8478,11 @@ dependencies = [ "num-rational", 
"num-traits", "once_cell", + "p256 0.13.2", "parquet", "prettyplease", "proc-macro2", - "prost", + "prost 0.13.3", "quote", "rand 0.8.5", "regex", @@ -8232,6 +8491,7 @@ dependencies = [ "reqwest", "rustls 0.23.18", "scopeguard", + "sec1 0.7.3", "serde", "serde_json", "sha2", @@ -8240,7 +8500,7 @@ dependencies = [ "spki 0.7.3", "stable_deref_trait", "subtle", - "syn 2.0.90", + "syn 2.0.100", "sync_wrapper 0.1.2", "tikv-jemalloc-ctl", "tikv-jemalloc-sys", @@ -8277,6 +8537,18 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" +[[package]] +name = "x509-cert" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1301e935010a701ae5f8655edc0ad17c44bad3ac5ce8c39185f75453b720ae94" +dependencies = [ + "const-oid", + "der 0.7.8", + "spki 0.7.3", + "tls_codec", +] + [[package]] name = "x509-certificate" version = "0.23.1" @@ -8357,7 +8629,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "synstructure", ] @@ -8379,7 +8651,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -8399,15 +8671,15 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "synstructure", ] [[package]] name = "zeroize" -version = "1.7.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" dependencies = [ "serde", "zeroize_derive", @@ -8421,7 +8693,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" 
dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -8443,7 +8715,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index e6ca3c982c..f2a94d2371 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,7 +43,7 @@ members = [ ] [workspace.package] -edition = "2021" +edition = "2024" license = "Apache-2.0" ## All dependency versions, used in the project @@ -53,7 +53,6 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" -backtrace = "0.3.74" flate2 = "1.0.26" assert-json-diff = "2" async-stream = "0.3" @@ -68,9 +67,10 @@ aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" axum = { version = "0.8.1", features = ["ws"] } +axum-extra = { version = "0.10.0", features = ["typed-header"] } base64 = "0.13.0" bincode = "1.3" -bindgen = "0.70" +bindgen = "0.71" bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" @@ -95,6 +95,7 @@ futures = "0.3" futures-core = "0.3" futures-util = "0.3" git-version = "0.3" +governor = "0.8" hashbrown = "0.14" hashlink = "0.9.1" hdrhistogram = "7.5.2" @@ -105,19 +106,18 @@ hostname = "0.4" http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } http-body-util = "0.1.2" -humantime = "2.1" +humantime = "2.2" humantime-serde = "1.1.1" hyper0 = { package = "hyper", version = "0.14" } hyper = "1.4" hyper-util = "0.1" tokio-tungstenite = "0.21.0" -indexmap = "2" +indexmap = { version = "2", features = ["serde"] } indoc = "2" -inferno = "0.12.0" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" -jemalloc_pprof = "0.6" +jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] } jsonwebtoken = "9" lasso = "0.7" libc = "0.2" 
@@ -126,7 +126,9 @@ measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } -notify = "8.0.0" +# Do not update to >= 7.0.0, at least. The update will have a significant impact +# on compute startup metrics (start_postgres_ms), >= 25% degradation. +notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" @@ -139,7 +141,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" -pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "protobuf", "protobuf-codec"] } +pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13" @@ -155,6 +157,7 @@ rpds = "0.13" rustc-hash = "1.1.0" rustls = { version = "0.23.16", default-features = false } rustls-pemfile = "2" +rustls-pki-types = "1.11" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" @@ -192,7 +195,11 @@ toml = "0.8" toml_edit = "0.22" tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} tower = { version = "0.5.2", default-features = false } -tower-http = { version = "0.6.2", features = ["request-id", "trace"] } +tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] } + +# This revision uses opentelemetry 0.27. There's no tag for it. 
+tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" } + tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" @@ -214,7 +221,7 @@ zerocopy = { version = "0.7", features = ["derive"] } json-structural-diff = { version = "0.2.0" } ## TODO replace this with tracing -env_logger = "0.10" +env_logger = "0.11" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed diff --git a/Makefile b/Makefile index 42ee643bb5..0911465fb8 100644 --- a/Makefile +++ b/Makefile @@ -11,15 +11,16 @@ ICU_PREFIX_DIR := /usr/local/icu # BUILD_TYPE ?= debug WITH_SANITIZERS ?= no +PG_CFLAGS = -fsigned-char ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug --with-openssl - PG_CFLAGS = -O2 -g3 $(CFLAGS) + PG_CFLAGS += -O2 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend - PG_CFLAGS = -O0 -g3 $(CFLAGS) + PG_CFLAGS += -O0 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) @@ -159,6 +160,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install +@echo "Compiling pageinspect $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install + +@echo "Compiling pg_trgm $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install +@echo "Compiling amcheck $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install +@echo "Compiling test_decoding $*" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index ef4c22612d..d5483018b4 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -162,7 +162,7 @@ FROM build-deps AS pg-build ARG PG_VERSION COPY vendor/postgres-${PG_VERSION:?} postgres RUN cd 
postgres && \ - export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \ + export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \ --with-icu --with-libxml --with-libxslt --with-lz4" && \ if [ "${PG_VERSION:?}" != "v14" ]; then \ # zstd is available only from PG15 @@ -1458,9 +1458,11 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pg_mooncake-src ARG PG_VERSION WORKDIR /ext-src +COPY compute/patches/duckdb_v113.patch . RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \ echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \ + cd third_party/duckdb && patch -p1 < /ext-src/duckdb_v113.patch && cd ../.. && \ echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ chmod a+x neon-test.sh @@ -1480,22 +1482,25 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pg_duckdb-src WORKDIR /ext-src COPY compute/patches/pg_duckdb_v031.patch . +COPY compute/patches/duckdb_v120.patch . 
# pg_duckdb build requires source dir to be a git repo to get submodules -# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: +# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: # - extension management function duckdb.install_extension() # - access to duckdb.extensions table and its sequence RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ cd pg_duckdb-src && \ git submodule update --init --recursive && \ - patch -p1 < /ext-src/pg_duckdb_v031.patch + patch -p1 < /ext-src/pg_duckdb_v031.patch && \ + cd third_party/duckdb && \ + patch -p1 < /ext-src/duckdb_v120.patch FROM pg-build AS pg_duckdb-build ARG PG_VERSION COPY --from=pg_duckdb-src /ext-src/ /ext-src/ WORKDIR /ext-src/pg_duckdb-src RUN make install -j $(getconf _NPROCESSORS_ONLN) && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control - + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control + ######################################################################################### # # Layer "pg_repack" @@ -1676,11 +1681,7 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ - -# Disabled temporarily, because it clashed with pg_mooncake. pg_mooncake -# also depends on libduckdb, but a different version. 
-#COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ - +COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1734,6 +1735,8 @@ RUN set -e \ libevent-dev \ libtool \ pkg-config \ + libcurl4-openssl-dev \ + libssl-dev \ && apt clean && rm -rf /var/lib/apt/lists/* # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) @@ -1742,7 +1745,7 @@ RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ && ./autogen.sh \ - && ./configure --prefix=/usr/local/pgbouncer --without-openssl \ + && ./configure --prefix=/usr/local/pgbouncer \ && make -j $(nproc) dist_man_MANS= \ && make install dist_man_MANS= @@ -1757,15 +1760,15 @@ ARG TARGETARCH # test_runner/regress/test_compute_metrics.py # See comment on the top of the file regading `echo`, `-e` and `\n` RUN if [ "$TARGETARCH" = "amd64" ]; then\ - postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ + postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\ pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ else\ - postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\ + postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\ pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\ sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\ fi\ - && curl -sL 
https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\ + && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\ | tar xzf - --strip-components=1 -C.\ && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\ | tar xzf - --strip-components=1 -C.\ @@ -1817,7 +1820,7 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute FROM pg-build AS extension-tests ARG PG_VERSION -RUN mkdir /ext-src +COPY docker-compose/ext-src/ /ext-src/ COPY --from=pg-build /postgres /postgres #COPY --from=postgis-src /ext-src/ /ext-src/ @@ -1932,6 +1935,7 @@ RUN apt update && \ locales \ procps \ ca-certificates \ + rsyslog \ $VERSION_INSTALLS && \ apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 @@ -1977,6 +1981,13 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo # Make the libraries we built available RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig +# rsyslog config permissions +# directory for rsyslogd pid file +RUN mkdir /var/run/rsyslogd && \ + chown -R postgres:postgres /var/run/rsyslogd && \ + chown -R postgres:postgres /etc/rsyslog.d/ + + ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index f8f4cab63b..da2b86d542 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -29,6 +29,7 @@ import 'sql_exporter/lfc_approximate_working_set_size.libsonnet', import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet', import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_chunk_size.libsonnet', import 
'sql_exporter/lfc_hits.libsonnet', import 'sql_exporter/lfc_misses.libsonnet', import 'sql_exporter/lfc_used.libsonnet', diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql index 9cbbdfd8a3..fe0360ab5c 100644 --- a/compute/etc/sql_exporter/db_total_size.sql +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -1 +1,5 @@ -SELECT sum(pg_database_size(datname)) AS total FROM pg_database; +SELECT sum(pg_database_size(datname)) AS total +FROM pg_database +-- Ignore invalid databases, as we will likely have problems with +-- getting their size from the Pageserver. +WHERE datconnlimit != -2; diff --git a/compute/etc/sql_exporter/lfc_chunk_size.libsonnet b/compute/etc/sql_exporter/lfc_chunk_size.libsonnet new file mode 100644 index 0000000000..bbe56f869f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_chunk_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_chunk_size', + type: 'gauge', + help: 'LFC chunk size, measured in 8KiB pages', + key_labels: null, + values: [ + 'lfc_chunk_size_pages', + ], + query: importstr 'sql_exporter/lfc_chunk_size.sql', +} diff --git a/compute/etc/sql_exporter/lfc_chunk_size.sql b/compute/etc/sql_exporter/lfc_chunk_size.sql new file mode 100644 index 0000000000..0905870064 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_chunk_size.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages'; diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql index 00ada87370..12e6c4ae59 100644 --- a/compute/etc/sql_exporter/pg_stats_userdb.sql +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -1,10 +1,20 @@ -- We export stats for 10 non-system databases. Without this limit it is too -- easy to abuse the system by creating lots of databases. 
-SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, - tup_updated AS updated, tup_deleted AS deleted, datname +SELECT pg_database_size(datname) AS db_size, + deadlocks, + tup_inserted AS inserted, + tup_updated AS updated, + tup_deleted AS deleted, + datname FROM pg_stat_database WHERE datname IN ( SELECT datname FROM pg_database - WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 + -- Ignore invalid databases, as we will likely have problems with + -- getting their size from the Pageserver. + WHERE datconnlimit != -2 + AND datname <> 'postgres' + AND NOT datistemplate + ORDER BY oid + LIMIT 10 ); diff --git a/compute/patches/duckdb_v113.patch b/compute/patches/duckdb_v113.patch new file mode 100644 index 0000000000..b7b43b88bf --- /dev/null +++ b/compute/patches/duckdb_v113.patch @@ -0,0 +1,25 @@ +diff --git a/libduckdb.map b/libduckdb.map +new file mode 100644 +index 0000000000..3b56f00cd7 +--- /dev/null ++++ b/libduckdb.map +@@ -0,0 +1,6 @@ ++DUCKDB_1.1.3 { ++ global: ++ *duckdb*; ++ local: ++ *; ++}; +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 3e757a4bcc..88ab4005b9 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -135,6 +135,8 @@ else() + target_link_libraries(duckdb ${DUCKDB_LINK_LIBS}) + link_threads(duckdb) + link_extension_libraries(duckdb) ++ target_link_options(duckdb PRIVATE ++ -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb.map) + + add_library(duckdb_static STATIC ${ALL_OBJECT_FILES}) + target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS}) diff --git a/compute/patches/duckdb_v120.patch b/compute/patches/duckdb_v120.patch new file mode 100644 index 0000000000..cf317736a5 --- /dev/null +++ b/compute/patches/duckdb_v120.patch @@ -0,0 +1,67 @@ +diff --git a/libduckdb_pg_duckdb.map b/libduckdb_pg_duckdb.map +new file mode 100644 +index 0000000000..0872978b48 +--- /dev/null ++++ b/libduckdb_pg_duckdb.map +@@ -0,0 +1,6 @@ ++DUCKDB_1.2.0 { ++ global: ++ *duckdb*; ++ 
local: ++ *; ++}; +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 58adef3fc0..2c522f91be 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -59,7 +59,7 @@ endfunction() + + if(AMALGAMATION_BUILD) + +- add_library(duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp") ++ add_library(duckdb_pg_duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp") + target_link_libraries(duckdb ${DUCKDB_SYSTEM_LIBS}) + link_threads(duckdb) + link_extension_libraries(duckdb) +@@ -109,7 +109,7 @@ else() + duckdb_yyjson + duckdb_zstd) + +- add_library(duckdb SHARED ${ALL_OBJECT_FILES}) ++ add_library(duckdb_pg_duckdb SHARED ${ALL_OBJECT_FILES}) + + if(WIN32 AND NOT MINGW) + ensure_variable_is_number(DUCKDB_MAJOR_VERSION RC_MAJOR_VERSION) +@@ -131,9 +131,11 @@ else() + target_sources(duckdb PRIVATE version.rc) + endif() + +- target_link_libraries(duckdb ${DUCKDB_LINK_LIBS}) +- link_threads(duckdb) +- link_extension_libraries(duckdb) ++ target_link_libraries(duckdb_pg_duckdb ${DUCKDB_LINK_LIBS}) ++ link_threads(duckdb_pg_duckdb) ++ link_extension_libraries(duckdb_pg_duckdb) ++ target_link_options(duckdb_pg_duckdb PRIVATE ++ -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb_pg_duckdb.map) + + add_library(duckdb_static STATIC ${ALL_OBJECT_FILES}) + target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS}) +@@ -141,7 +143,7 @@ else() + link_extension_libraries(duckdb_static) + + target_include_directories( +- duckdb PUBLIC $ ++ duckdb_pg_duckdb PUBLIC $ + $) + + target_include_directories( +@@ -161,7 +163,7 @@ else() + endif() + + install( +- TARGETS duckdb duckdb_static ++ TARGETS duckdb_pg_duckdb duckdb_static + EXPORT "${DUCKDB_EXPORT_SET}" + LIBRARY DESTINATION "${INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" diff --git a/compute/patches/pg_duckdb_v031.patch b/compute/patches/pg_duckdb_v031.patch index a7e188d69e..edc7fbf69d 100644 --- a/compute/patches/pg_duckdb_v031.patch +++ b/compute/patches/pg_duckdb_v031.patch @@ -1,3 
+1,25 @@ +diff --git a/Makefile b/Makefile +index 3235cc8..6b892bc 100644 +--- a/Makefile ++++ b/Makefile +@@ -32,7 +32,7 @@ else + DUCKDB_BUILD_TYPE = release + endif + +-DUCKDB_LIB = libduckdb$(DLSUFFIX) ++DUCKDB_LIB = libduckdb_pg_duckdb$(DLSUFFIX) + FULL_DUCKDB_LIB = third_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src/$(DUCKDB_LIB) + + ERROR_ON_WARNING ?= +@@ -54,7 +54,7 @@ override PG_CXXFLAGS += -std=c++17 ${DUCKDB_BUILD_CXX_FLAGS} ${COMPILER_FLAGS} - + # changes to the vendored code in one place. + override PG_CFLAGS += -Wno-declaration-after-statement + +-SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb -lstdc++ -llz4 ++SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb_pg_duckdb -lstdc++ -llz4 + + include Makefile.global + diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql index d777d76..af60106 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 568f0b0444..f63aa88da2 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -39,17 +39,33 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + # Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also. + # We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to + # use a different path for the socket. The symlink actually points to our custom path. 
+ - name: rsyslogd-socket-symlink + user: root + sysvInitAction: sysinit + shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log" + - name: rsyslogd + user: postgres + sysvInitAction: respawn + shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: - filename: compute_ctl-sudoers content: | + # Reverse hostname lookup doesn't currently work, and isn't needed anyway when all + # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to + # resolve host" log messages that they generate. + Defaults !fqdn + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) # # Also allow it to shut down the VM. The fast_import job does that when it's finished. - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -64,6 +80,15 @@ files: } memory {} } +# Create dummy rsyslog config, because it refuses to start without at least one action configured. +# compute_ctl will rewrite this file with the actual configuration, if needed. + - filename: compute_rsyslog.conf + content: | + # Syslock.Name specifies a non-default pipe location that is writeable for the postgres user. 
+ module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging + + *.* /dev/null + $IncludeConfig /etc/rsyslog.d/*.conf build: | # Build cgroup-tools # @@ -127,6 +152,12 @@ merge: | RUN set -e \ && chmod 0644 /etc/cgconfig.conf + + COPY compute_rsyslog.conf /etc/compute_rsyslog.conf + RUN chmod 0666 /etc/compute_rsyslog.conf + RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 6617c98599..8b3c681228 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -39,17 +39,33 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + # Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also. + # We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to + # use a different path for the socket. The symlink actually points to our custom path. + - name: rsyslogd-socket-symlink + user: root + sysvInitAction: sysinit + shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log" + - name: rsyslogd + user: postgres + sysvInitAction: respawn + shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: - filename: compute_ctl-sudoers content: | + # Reverse hostname lookup doesn't currently work, and isn't needed anyway when all + # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to + # resolve host" log messages that they generate. 
+ Defaults !fqdn + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) # # Also allow it to shut down the VM. The fast_import job does that when it's finished. - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -64,6 +80,15 @@ files: } memory {} } +# Create dummy rsyslog config, because it refuses to start without at least one action configured. +# compute_ctl will rewrite this file with the actual configuration, if needed. + - filename: compute_rsyslog.conf + content: | + # Syslock.Name specifies a non-default pipe location that is writeable for the postgres user. + module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging + + *.* /dev/null + $IncludeConfig /etc/rsyslog.d/*.conf build: | # Build cgroup-tools # @@ -123,6 +148,11 @@ merge: | RUN set -e \ && chmod 0644 /etc/cgconfig.conf + COPY compute_rsyslog.conf /etc/compute_rsyslog.conf + RUN chmod 0666 /etc/compute_rsyslog.conf + RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index c276996df5..90951e7ddb 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "compute_tools" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] @@ -17,6 +17,7 @@ aws-sdk-kms.workspace = 
true aws-smithy-types.workspace = true anyhow.workspace = true axum = { workspace = true, features = [] } +axum-extra.workspace = true camino.workspace = true chrono.workspace = true cfg-if.workspace = true @@ -25,6 +26,8 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +indexmap.workspace = true +jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true @@ -32,27 +35,33 @@ num_cpus.workspace = true once_cell.workspace = true opentelemetry.workspace = true opentelemetry_sdk.workspace = true +p256 = { version = "0.13", features = ["pem"] } postgres.workspace = true regex.workspace = true +reqwest = { workspace = true, features = ["json"] } +ring = "0.17" serde.workspace = true serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true +spki = { version = "0.7.3", features = ["std"] } tar.workspace = true tower.workspace = true tower-http.workspace = true -reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true +tower-otel.workspace = true tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true uuid.workspace = true walkdir.workspace = true +x509-cert = { version = "0.2.5" } postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 1cdae718fe..fc7a3e2827 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -33,41 +33,28 @@ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! 
``` -use std::collections::HashMap; use std::ffi::OsString; use std::fs::File; use std::path::Path; use std::process::exit; -use std::str::FromStr; -use std::sync::atomic::Ordering; -use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; -use std::{thread, time::Duration}; +use std::sync::mpsc; +use std::thread; +use std::time::Duration; use anyhow::{Context, Result}; -use chrono::Utc; use clap::Parser; -use compute_tools::disk_quota::set_disk_quota; -use compute_tools::http::server::Server; -use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; -use signal_hook::consts::{SIGQUIT, SIGTERM}; -use signal_hook::{consts::SIGINT, iterator::Signals}; -use tracing::{error, info, warn}; -use url::Url; - -use compute_api::responses::{ComputeCtlConfig, ComputeStatus}; +use compute_api::responses::ComputeCtlConfig; use compute_api::spec::ComputeSpec; - -use compute_tools::compute::{ - forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, -}; -use compute_tools::configurator::launch_configurator; +use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal}; use compute_tools::extension_server::get_pg_version_string; use compute_tools::logger::*; -use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; -use compute_tools::swap::resize_swap; -use rlimit::{setrlimit, Resource}; +use rlimit::{Resource, setrlimit}; +use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM}; +use signal_hook::iterator::Signals; +use tracing::{error, info}; +use url::Url; use utils::failpoint_support; // this is an arbitrary build tag. 
Fine as a default / for testing purposes @@ -149,6 +136,8 @@ struct Cli { fn main() -> Result<()> { let cli = Cli::parse(); + let scenario = failpoint_support::init(); + // For historical reasons, the main thread that processes the spec and launches postgres // is synchronous, but we always have this tokio runtime available and we "enter" it so // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...) @@ -160,34 +149,44 @@ fn main() -> Result<()> { let build_tag = runtime.block_on(init())?; - let scenario = failpoint_support::init(); - // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; - let (pg_handle, start_pg_result) = { - // Enter startup tracing context - let _startup_context_guard = startup_context_from_env(); + let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; - let cli_spec = try_spec_from_cli(&cli)?; + let cli_spec = try_spec_from_cli(&cli)?; - let compute = wait_spec(build_tag, &cli, cli_spec)?; + let compute_node = ComputeNode::new( + ComputeNodeParams { + compute_id: cli.compute_id, + connstr, + pgdata: cli.pgdata.clone(), + pgbin: cli.pgbin.clone(), + pgversion: get_pg_version_string(&cli.pgbin), + external_http_port: cli.external_http_port, + internal_http_port: cli.internal_http_port, + ext_remote_storage: cli.remote_ext_config.clone(), + resize_swap_on_bind: cli.resize_swap_on_bind, + set_disk_quota_for_fs: cli.set_disk_quota_for_fs, + #[cfg(target_os = "linux")] + filecache_connstr: cli.filecache_connstr, + #[cfg(target_os = "linux")] + cgroup: cli.cgroup, + #[cfg(target_os = "linux")] + vm_monitor_addr: cli.vm_monitor_addr, + build_tag, - start_postgres(&cli, compute)? + live_config_allowed: cli_spec.live_config_allowed, + }, + cli_spec.spec, + cli_spec.compute_ctl_config, + )?; - // Startup is finished, exit the startup tracing span - }; - - // PostgreSQL is now running, if startup was successful. Wait until it exits. 
- let wait_pg_result = wait_postgres(pg_handle)?; - - let delay_exit = cleanup_after_postgres_exit(start_pg_result)?; - - maybe_delay_exit(delay_exit); + let exit_code = compute_node.run()?; scenario.teardown(); - deinit_and_exit(wait_pg_result); + deinit_and_exit(exit_code); } async fn init() -> Result { @@ -208,56 +207,6 @@ async fn init() -> Result { Ok(build_tag) } -fn startup_context_from_env() -> Option { - // Extract OpenTelemetry context for the startup actions from the - // TRACEPARENT and TRACESTATE env variables, and attach it to the current - // tracing context. - // - // This is used to propagate the context for the 'start_compute' operation - // from the neon control plane. This allows linking together the wider - // 'start_compute' operation that creates the compute container, with the - // startup actions here within the container. - // - // There is no standard for passing context in env variables, but a lot of - // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See - // https://github.com/open-telemetry/opentelemetry-specification/issues/740 - // - // Switch to the startup context here, and exit it once the startup has - // completed and Postgres is up and running. - // - // If this pod is pre-created without binding it to any particular endpoint - // yet, this isn't the right place to enter the startup context. In that - // case, the control plane should pass the tracing context as part of the - // /configure API call. - // - // NOTE: This is supposed to only cover the *startup* actions. Once - // postgres is configured and up-and-running, we exit this span. Any other - // actions that are performed on incoming HTTP requests, for example, are - // performed in separate spans. - // - // XXX: If the pod is restarted, we perform the startup actions in the same - // context as the original startup actions, which probably doesn't make - // sense. 
- let mut startup_tracing_carrier: HashMap = HashMap::new(); - if let Ok(val) = std::env::var("TRACEPARENT") { - startup_tracing_carrier.insert("traceparent".to_string(), val); - } - if let Ok(val) = std::env::var("TRACESTATE") { - startup_tracing_carrier.insert("tracestate".to_string(), val); - } - if !startup_tracing_carrier.is_empty() { - use opentelemetry::propagation::TextMapPropagator; - use opentelemetry_sdk::propagation::TraceContextPropagator; - let guard = TraceContextPropagator::new() - .extract(&startup_tracing_carrier) - .attach(); - info!("startup tracing context attached"); - Some(guard) - } else { - None - } -} - fn try_spec_from_cli(cli: &Cli) -> Result { // First, try to get cluster spec from the cli argument if let Some(ref spec_json) = cli.spec_json { @@ -308,342 +257,7 @@ struct CliSpecParams { live_config_allowed: bool, } -fn wait_spec( - build_tag: String, - cli: &Cli, - CliSpecParams { - spec, - live_config_allowed, - compute_ctl_config: _, - }: CliSpecParams, -) -> Result> { - let mut new_state = ComputeState::new(); - let spec_set; - - if let Some(spec) = spec { - let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; - info!("new pspec.spec: {:?}", pspec.spec); - new_state.pspec = Some(pspec); - spec_set = true; - } else { - spec_set = false; - } - let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; - let conn_conf = postgres::config::Config::from_str(connstr.as_str()) - .context("cannot build postgres config from connstr")?; - let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) - .context("cannot build tokio postgres config from connstr")?; - let compute_node = ComputeNode { - compute_id: cli.compute_id.clone(), - connstr, - conn_conf, - tokio_conn_conf, - pgdata: cli.pgdata.clone(), - pgbin: cli.pgbin.clone(), - pgversion: get_pg_version_string(&cli.pgbin), - external_http_port: cli.external_http_port, - internal_http_port: cli.internal_http_port, - 
live_config_allowed, - state: Mutex::new(new_state), - state_changed: Condvar::new(), - ext_remote_storage: cli.remote_ext_config.clone(), - ext_download_progress: RwLock::new(HashMap::new()), - build_tag, - }; - let compute = Arc::new(compute_node); - - // If this is a pooled VM, prewarm before starting HTTP server and becoming - // available for binding. Prewarming helps Postgres start quicker later, - // because QEMU will already have its memory allocated from the host, and - // the necessary binaries will already be cached. - if !spec_set { - compute.prewarm_postgres()?; - } - - // Launch the external HTTP server first, so that we can serve control plane - // requests while configuration is still in progress. - Server::External(cli.external_http_port).launch(&compute); - - // The internal HTTP server could be launched later, but there isn't much - // sense in waiting. - Server::Internal(cli.internal_http_port).launch(&compute); - - if !spec_set { - // No spec provided, hang waiting for it. - info!("no compute spec provided, waiting"); - - let mut state = compute.state.lock().unwrap(); - while state.status != ComputeStatus::ConfigurationPending { - state = compute.state_changed.wait(state).unwrap(); - - if state.status == ComputeStatus::ConfigurationPending { - info!("got spec, continue configuration"); - // Spec is already set by the http server handler. - break; - } - } - - // Record for how long we slept waiting for the spec. - let now = Utc::now(); - state.metrics.wait_for_spec_ms = now - .signed_duration_since(state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - - // Reset start time, so that the total startup time that is calculated later will - // not include the time that we waited for the spec. - state.start_time = now; - } - - launch_lsn_lease_bg_task_for_static(&compute); - - Ok(compute) -} - -fn start_postgres( - cli: &Cli, - compute: Arc, -) -> Result<(Option, StartPostgresResult)> { - // We got all we need, update the state. 
- let mut state = compute.state.lock().unwrap(); - state.set_status(ComputeStatus::Init, &compute.state_changed); - - info!( - "running compute with features: {:?}", - state.pspec.as_ref().unwrap().spec.features - ); - // before we release the mutex, fetch some parameters for later. - let &ComputeSpec { - swap_size_bytes, - disk_quota_bytes, - #[cfg(target_os = "linux")] - disable_lfc_resizing, - .. - } = &state.pspec.as_ref().unwrap().spec; - drop(state); - - // Launch remaining service threads - let _monitor_handle = launch_monitor(&compute); - let _configurator_handle = launch_configurator(&compute); - - let mut prestartup_failed = false; - let mut delay_exit = false; - - // Resize swap to the desired size if the compute spec says so - if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) { - // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion - // *before* starting postgres. - // - // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this - // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets - // OOM-killed during startup because swap wasn't available yet. - match resize_swap(size_bytes) { - Ok(()) => { - let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. - info!(%size_bytes, %size_mib, "resized swap"); - } - Err(err) => { - let err = err.context("failed to resize swap"); - error!("{err:#}"); - - // Mark compute startup as failed; don't try to start postgres, and report this - // error to the control plane when it next asks. 
- prestartup_failed = true; - compute.set_failed_status(err); - delay_exit = true; - } - } - } - - // Set disk quota if the compute spec says so - if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = - (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref()) - { - match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) { - Ok(()) => { - let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. - info!(%disk_quota_bytes, %size_mib, "set disk quota"); - } - Err(err) => { - let err = err.context("failed to set disk quota"); - error!("{err:#}"); - - // Mark compute startup as failed; don't try to start postgres, and report this - // error to the control plane when it next asks. - prestartup_failed = true; - compute.set_failed_status(err); - delay_exit = true; - } - } - } - - // Start Postgres - let mut pg = None; - if !prestartup_failed { - pg = match compute.start_compute() { - Ok(pg) => { - info!(postmaster_pid = %pg.0.id(), "Postgres was started"); - Some(pg) - } - Err(err) => { - error!("could not start the compute node: {:#}", err); - compute.set_failed_status(err); - delay_exit = true; - None - } - }; - } else { - warn!("skipping postgres startup because pre-startup step failed"); - } - - // Start the vm-monitor if directed to. The vm-monitor only runs on linux - // because it requires cgroups. - cfg_if::cfg_if! 
{ - if #[cfg(target_os = "linux")] { - use std::env; - use tokio_util::sync::CancellationToken; - - // This token is used internally by the monitor to clean up all threads - let token = CancellationToken::new(); - - // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC - let pgconnstr = if disable_lfc_resizing.unwrap_or(false) { - None - } else { - Some(cli.filecache_connstr.clone()) - }; - - let vm_monitor = if env::var_os("AUTOSCALING").is_some() { - let vm_monitor = tokio::spawn(vm_monitor::start( - Box::leak(Box::new(vm_monitor::Args { - cgroup: Some(cli.cgroup.clone()), - pgconnstr, - addr: cli.vm_monitor_addr.clone(), - })), - token.clone(), - )); - Some(vm_monitor) - } else { - None - }; - } - } - - Ok(( - pg, - StartPostgresResult { - delay_exit, - compute, - #[cfg(target_os = "linux")] - token, - #[cfg(target_os = "linux")] - vm_monitor, - }, - )) -} - -type PostgresHandle = (std::process::Child, tokio::task::JoinHandle>); - -struct StartPostgresResult { - delay_exit: bool, - // passed through from WaitSpecResult - compute: Arc, - - #[cfg(target_os = "linux")] - token: tokio_util::sync::CancellationToken, - #[cfg(target_os = "linux")] - vm_monitor: Option>>, -} - -fn wait_postgres(pg: Option) -> Result { - // Wait for the child Postgres process forever. In this state Ctrl+C will - // propagate to Postgres and it will be shut down as well. - let mut exit_code = None; - if let Some((mut pg, logs_handle)) = pg { - info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit"); - - let ecode = pg - .wait() - .expect("failed to start waiting on Postgres process"); - PG_PID.store(0, Ordering::SeqCst); - - // Process has exited. Wait for the log collecting task to finish. 
- let _ = tokio::runtime::Handle::current() - .block_on(logs_handle) - .map_err(|e| tracing::error!("log task panicked: {:?}", e)); - - info!("Postgres exited with code {}, shutting down", ecode); - exit_code = ecode.code() - } - - Ok(WaitPostgresResult { exit_code }) -} - -struct WaitPostgresResult { - exit_code: Option, -} - -fn cleanup_after_postgres_exit( - StartPostgresResult { - mut delay_exit, - compute, - #[cfg(target_os = "linux")] - vm_monitor, - #[cfg(target_os = "linux")] - token, - }: StartPostgresResult, -) -> Result { - // Terminate the vm_monitor so it releases the file watcher on - // /sys/fs/cgroup/neon-postgres. - // Note: the vm-monitor only runs on linux because it requires cgroups. - cfg_if::cfg_if! { - if #[cfg(target_os = "linux")] { - if let Some(handle) = vm_monitor { - // Kills all threads spawned by the monitor - token.cancel(); - // Kills the actual task running the monitor - handle.abort(); - } - } - } - - // Maybe sync safekeepers again, to speed up next startup - let compute_state = compute.state.lock().unwrap().clone(); - let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { - info!("syncing safekeepers on shutdown"); - let storage_auth_token = pspec.storage_auth_token.clone(); - let lsn = compute.sync_safekeepers(storage_auth_token)?; - info!("synced safekeepers at lsn {lsn}"); - } - - let mut state = compute.state.lock().unwrap(); - if state.status == ComputeStatus::TerminationPending { - state.status = ComputeStatus::Terminated; - compute.state_changed.notify_all(); - // we were asked to terminate gracefully, don't exit to avoid restart - delay_exit = true - } - drop(state); - - if let Err(err) = compute.check_for_core_dumps() { - error!("error while checking for core dumps: {err:?}"); - } - - Ok(delay_exit) -} - -fn maybe_delay_exit(delay_exit: bool) { - // If launch failed, keep serving HTTP requests for a while, so the cloud - // control 
plane can get the actual error. - if delay_exit { - info!("giving control plane 30s to collect the error before shutdown"); - thread::sleep(Duration::from_secs(30)); - } -} - -fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! { +fn deinit_and_exit(exit_code: Option) -> ! { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. Shutting down OTEL tracing provider may // hang for quite some time, see, for example: diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 585f3e4e1d..47558be7a0 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -25,13 +25,13 @@ //! docker push localhost:3030/localregistry/compute-node-v14:latest //! ``` -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; -use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; +use compute_tools::extension_server::{PostgresMajorVersion, get_pg_version}; use nix::unistd::Pid; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; use utils::fs_ext::is_directory_empty; #[path = "fast_import/aws_s3_sync.rs"] @@ -558,7 +558,9 @@ async fn cmd_dumprestore( decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) .await? 
} else { - bail!("destination connection string must be provided in spec for dump_restore command"); + bail!( + "destination connection string must be provided in spec for dump_restore command" + ); }; (source, dest) diff --git a/compute_tools/src/bin/fast_import/aws_s3_sync.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs index 1be10b36d6..d8d007da71 100644 --- a/compute_tools/src/bin/fast_import/aws_s3_sync.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -1,11 +1,10 @@ use camino::{Utf8Path, Utf8PathBuf}; use tokio::task::JoinSet; +use tracing::{info, warn}; use walkdir::WalkDir; use super::s3_uri::S3Uri; -use tracing::{info, warn}; - const MAX_PARALLEL_UPLOADS: usize = 10; /// Upload all files from 'local' to 'remote' diff --git a/compute_tools/src/bin/fast_import/s3_uri.rs b/compute_tools/src/bin/fast_import/s3_uri.rs index 52bbef420f..cf4dab7c02 100644 --- a/compute_tools/src/bin/fast_import/s3_uri.rs +++ b/compute_tools/src/bin/fast_import/s3_uri.rs @@ -1,6 +1,7 @@ -use anyhow::Result; use std::str::FromStr; +use anyhow::Result; + /// Struct to hold parsed S3 components #[derive(Debug, Clone, PartialEq, Eq)] pub struct S3Uri { diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 28b10ce21c..db3e07e086 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -1,18 +1,20 @@ +use std::path::Path; +use std::process::Stdio; +use std::result::Result; +use std::sync::Arc; + +use compute_api::responses::CatalogObjects; use futures::Stream; use postgres::NoTls; -use std::{path::Path, process::Stdio, result::Result, sync::Arc}; -use tokio::{ - io::{AsyncBufReadExt, BufReader}, - process::Command, - spawn, -}; +use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::process::Command; +use tokio::spawn; use tokio_stream::{self as stream, StreamExt}; use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; use crate::compute::ComputeNode; use crate::pg_helpers::{get_existing_dbs_async, 
get_existing_roles_async, postgres_conf_for_db}; -use compute_api::responses::CatalogObjects; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles")); @@ -55,15 +57,15 @@ pub enum SchemaDumpError { pub async fn get_database_schema( compute: &Arc, dbname: &str, -) -> Result>, SchemaDumpError> { - let pgbin = &compute.pgbin; +) -> Result> + use<>, SchemaDumpError> { + let pgbin = &compute.params.pgbin; let basepath = Path::new(pgbin).parent().unwrap(); let pgdump = basepath.join("pg_dump"); // Replace the DB in the connection string and disable it to parts. // This is the only option to handle DBs with special characters. - let conf = - postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?; + let conf = postgres_conf_for_db(&compute.params.connstr, dbname) + .map_err(|_| SchemaDumpError::Unexpected)?; let host = conf .get_hosts() .first() diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index 62d61a8bc9..e4207876ac 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, Ok, Result}; +use anyhow::{Ok, Result, anyhow}; use tokio_postgres::NoTls; use tracing::{error, instrument, warn}; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a3ea3a147f..d31472b0c1 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,74 +1,87 @@ -use std::collections::{HashMap, HashSet}; -use std::env; -use std::fs; -use std::iter::once; -use std::os::unix::fs::{symlink, PermissionsExt}; +use std::collections::HashMap; +use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; -use std::sync::atomic::AtomicU32; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::time::Duration; 
-use std::time::Instant; +use std::time::{Duration, Instant}; +use std::{env, fs}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use compute_api::spec::{Database, PgIdent, Role}; +use compute_api::privilege::Privilege; +use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus}; +use compute_api::spec::{ + ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, +}; +use futures::StreamExt; use futures::future::join_all; use futures::stream::FuturesUnordered; -use futures::StreamExt; +use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; use postgres; -use postgres::error::SqlState; use postgres::NoTls; -use tracing::{debug, error, info, instrument, warn}; -use utils::id::{TenantId, TimelineId}; -use utils::lsn::Lsn; - -use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion}; -use utils::measured_stream::MeasuredReader; - -use nix::sys::signal::{kill, Signal}; +use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; use tokio::spawn; +use tracing::{Instrument, debug, error, info, instrument, warn}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::measured_stream::MeasuredReader; +use crate::configurator::launch_configurator; +use crate::disk_quota::set_disk_quota; use crate::installed_extensions::get_installed_extensions; -use crate::local_proxy; +use crate::logger::startup_context_from_env; +use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; +use crate::monitor::launch_monitor; use crate::pg_helpers::*; +use crate::rsyslog::{ + PostgresLogsRsyslogConfig, configure_audit_rsyslog, configure_postgres_logs_export, + launch_pgaudit_gc, +}; use crate::spec::*; -use crate::spec_apply::ApplySpecPhase::{ - CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon, - CreateSuperUser, DropInvalidDatabases, 
DropRoles, FinalizeDropLogicalSubscriptions, - HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, - RunInEachDatabase, -}; -use crate::spec_apply::PerDatabasePhase; -use crate::spec_apply::PerDatabasePhase::{ - ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, -}; -use crate::spec_apply::{apply_operations, MutableApplyContext, DB}; +use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; -use crate::{config, extension_server}; +use crate::tls::watch_cert_for_changes; +use crate::{config, extension_server, local_proxy}; pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); pub static PG_PID: AtomicU32 = AtomicU32::new(0); -/// Compute node info shared across several `compute_ctl` threads. -pub struct ComputeNode { +/// Static configuration params that don't change after startup. These mostly +/// come from the CLI args, or are derived from them. +pub struct ComputeNodeParams { /// The ID of the compute pub compute_id: String, // Url type maintains proper escaping pub connstr: url::Url, - // We connect to Postgres from many different places, so build configs once - // and reuse them where needed. 
- pub conn_conf: postgres::config::Config, - pub tokio_conn_conf: tokio_postgres::config::Config, + + pub resize_swap_on_bind: bool, + pub set_disk_quota_for_fs: Option, + + // VM monitor parameters + #[cfg(target_os = "linux")] + pub filecache_connstr: String, + #[cfg(target_os = "linux")] + pub cgroup: String, + #[cfg(target_os = "linux")] + pub vm_monitor_addr: String, + pub pgdata: String, pub pgbin: String, pub pgversion: String, + pub build_tag: String, + + /// The port that the compute's external HTTP server listens on + pub external_http_port: u16, + /// The port that the compute's internal HTTP server listens on + pub internal_http_port: u16, + + /// the address of extension storage proxy gateway + pub ext_remote_storage: Option, + /// We should only allow live re- / configuration of the compute node if /// it uses 'pull model', i.e. it can go to control-plane and fetch /// the latest configuration. Otherwise, there could be a case: @@ -82,10 +95,17 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, - /// The port that the compute's external HTTP server listens on - pub external_http_port: u16, - /// The port that the compute's internal HTTP server listens on - pub internal_http_port: u16, +} + +/// Compute node info shared across several `compute_ctl` threads. +pub struct ComputeNode { + pub params: ComputeNodeParams, + + // We connect to Postgres from many different places, so build configs once + // and reuse them where needed. These are derived from 'params.connstr' + pub conn_conf: postgres::config::Config, + pub tokio_conn_conf: tokio_postgres::config::Config, + /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. 
/// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -93,11 +113,10 @@ pub struct ComputeNode { pub state: Mutex, /// `Condvar` to allow notifying waiters about state changes. pub state_changed: Condvar, - /// the address of extension storage proxy gateway - pub ext_remote_storage: Option, + // key: ext_archive_name, value: started download time, download_completed? pub ext_download_progress: RwLock, bool)>>, - pub build_tag: String, + pub compute_ctl_config: ComputeCtlConfig, } // store some metrics about download size that might impact startup time @@ -116,7 +135,23 @@ pub struct ComputeState { /// compute wasn't used since start. pub last_active: Option>, pub error: Option, + + /// Compute spec. This can be received from the CLI or - more likely - + /// passed by the control plane with a /configure HTTP request. pub pspec: Option, + + /// If the spec is passed by a /configure request, 'startup_span' is the + /// /configure request's tracing span. The main thread enters it when it + /// processes the compute startup, so that the compute startup is considered + /// to be part of the /configure request for tracing purposes. + /// + /// If the request handling thread/task called startup_compute() directly, + /// it would automatically be a child of the request handling span, and we + /// wouldn't need this. But because we use the main thread to perform the + /// startup, and the /configure task just waits for it to finish, we need to + /// set up the span relationship ourselves. 
+ pub startup_span: Option, + pub metrics: ComputeMetrics, } @@ -128,6 +163,7 @@ impl ComputeState { last_active: None, error: None, pspec: None, + startup_span: None, metrics: ComputeMetrics::default(), } } @@ -244,80 +280,539 @@ fn maybe_cgexec(cmd: &str) -> Command { } } -pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String { - let roles = spec - .cluster - .roles - .iter() - .map(|r| escape_literal(&r.name)) - .collect::>(); +struct PostgresHandle { + postgres: std::process::Child, + log_collector: tokio::task::JoinHandle>, +} - let dbs = spec - .cluster - .databases - .iter() - .map(|db| escape_literal(&db.name)) - .collect::>(); +impl PostgresHandle { + /// Return PID of the postgres (postmaster) process + fn pid(&self) -> Pid { + Pid::from_raw(self.postgres.id() as i32) + } +} - let roles_decl = if roles.is_empty() { - String::from("roles text[] := NULL;") - } else { - format!( - r#" - roles text[] := ARRAY(SELECT rolname - FROM pg_catalog.pg_roles - WHERE rolname IN ({}));"#, - roles.join(", ") - ) - }; - - let database_decl = if dbs.is_empty() { - String::from("dbs text[] := NULL;") - } else { - format!( - r#" - dbs text[] := ARRAY(SELECT datname - FROM pg_catalog.pg_database - WHERE datname IN ({}));"#, - dbs.join(", ") - ) - }; - - // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases - // (see https://www.postgresql.org/docs/current/ddl-priv.html) - let query = format!( - r#" - DO $$ - DECLARE - r text; - {} - {} - BEGIN - IF NOT EXISTS ( - SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser') - THEN - CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data; - IF array_length(roles, 1) IS NOT NULL THEN - EXECUTE format('GRANT neon_superuser TO %s', - array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', ')); - FOREACH r IN ARRAY roles LOOP - EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r)); - END LOOP; - 
END IF; - IF array_length(dbs, 1) IS NOT NULL THEN - EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser', - array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', ')); - END IF; - END IF; - END - $$;"#, - roles_decl, database_decl, - ); - - query +struct StartVmMonitorResult { + #[cfg(target_os = "linux")] + token: tokio_util::sync::CancellationToken, + #[cfg(target_os = "linux")] + vm_monitor: Option>>, } impl ComputeNode { + pub fn new( + params: ComputeNodeParams, + cli_spec: Option, + compute_ctl_config: ComputeCtlConfig, + ) -> Result { + let connstr = params.connstr.as_str(); + let conn_conf = postgres::config::Config::from_str(connstr) + .context("cannot build postgres config from connstr")?; + let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) + .context("cannot build tokio postgres config from connstr")?; + + let mut new_state = ComputeState::new(); + if let Some(cli_spec) = cli_spec { + let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?; + new_state.pspec = Some(pspec); + } + + Ok(ComputeNode { + params, + conn_conf, + tokio_conn_conf, + state: Mutex::new(new_state), + state_changed: Condvar::new(), + ext_download_progress: RwLock::new(HashMap::new()), + compute_ctl_config, + }) + } + + /// Top-level control flow of compute_ctl. Returns a process exit code we should + /// exit with. + pub fn run(self) -> Result> { + let this = Arc::new(self); + + let cli_spec = this.state.lock().unwrap().pspec.clone(); + + // If this is a pooled VM, prewarm before starting HTTP server and becoming + // available for binding. Prewarming helps Postgres start quicker later, + // because QEMU will already have its memory allocated from the host, and + // the necessary binaries will already be cached. + if cli_spec.is_none() { + this.prewarm_postgres()?; + } + + // Launch the external HTTP server first, so that we can serve control plane + // requests while configuration is still in progress. 
+ crate::http::server::Server::External { + port: this.params.external_http_port, + config: this.compute_ctl_config.clone(), + compute_id: this.params.compute_id.clone(), + } + .launch(&this); + + // The internal HTTP server could be launched later, but there isn't much + // sense in waiting. + crate::http::server::Server::Internal { + port: this.params.internal_http_port, + } + .launch(&this); + + // If we got a spec from the CLI already, use that. Otherwise wait for the + // control plane to pass it to us with a /configure HTTP request + let pspec = if let Some(cli_spec) = cli_spec { + cli_spec + } else { + this.wait_spec()? + }; + + launch_lsn_lease_bg_task_for_static(&this); + + // We have a spec, start the compute + let mut delay_exit = false; + let mut vm_monitor = None; + let mut pg_process: Option = None; + + match this.start_compute(&mut pg_process) { + Ok(()) => { + // Success! Launch remaining services (just vm-monitor currently) + vm_monitor = + Some(this.start_vm_monitor(pspec.spec.disable_lfc_resizing.unwrap_or(false))); + } + Err(err) => { + // Something went wrong with the startup. Log it and expose the error to + // HTTP status requests. + error!("could not start the compute node: {:#}", err); + this.set_failed_status(err); + delay_exit = true; + + // If the error happened after starting PostgreSQL, kill it + if let Some(ref pg_process) = pg_process { + kill(pg_process.pid(), Signal::SIGQUIT).ok(); + } + } + } + + // If startup was successful, or it failed in the late stages, + // PostgreSQL is now running. Wait until it exits. + let exit_code = if let Some(pg_handle) = pg_process { + let exit_status = this.wait_postgres(pg_handle); + info!("Postgres exited with code {}, shutting down", exit_status); + exit_status.code() + } else { + None + }; + + // Terminate the vm_monitor so it releases the file watcher on + // /sys/fs/cgroup/neon-postgres. + // Note: the vm-monitor only runs on linux because it requires cgroups. 
+ if let Some(vm_monitor) = vm_monitor { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + // Kills all threads spawned by the monitor + vm_monitor.token.cancel(); + if let Some(handle) = vm_monitor.vm_monitor { + // Kills the actual task running the monitor + handle.abort(); + } + } else { + _ = vm_monitor; // appease unused lint on macOS + } + } + } + + // Reap the postgres process + delay_exit |= this.cleanup_after_postgres_exit()?; + + // If launch failed, keep serving HTTP requests for a while, so the cloud + // control plane can get the actual error. + if delay_exit { + info!("giving control plane 30s to collect the error before shutdown"); + std::thread::sleep(Duration::from_secs(30)); + } + Ok(exit_code) + } + + pub fn wait_spec(&self) -> Result { + info!("no compute spec provided, waiting"); + let mut state = self.state.lock().unwrap(); + while state.status != ComputeStatus::ConfigurationPending { + state = self.state_changed.wait(state).unwrap(); + } + + info!("got spec, continue configuration"); + let spec = state.pspec.as_ref().unwrap().clone(); + + // Record for how long we slept waiting for the spec. + let now = Utc::now(); + state.metrics.wait_for_spec_ms = now + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + + // Reset start time, so that the total startup time that is calculated later will + // not include the time that we waited for the spec. + state.start_time = now; + + Ok(spec) + } + + /// Start compute. + /// + /// Prerequisites: + /// - the compute spec has been placed in self.state.pspec + /// + /// On success: + /// - status is set to ComputeStatus::Running + /// - self.running_postgres is set + /// + /// On error: + /// - status is left in ComputeStatus::Init. The caller is responsible for setting it to Failed + /// - if Postgres was started before the fatal error happened, self.running_postgres is + /// set. The caller is responsible for killing it. 
+ /// + /// Note that this is in the critical path of a compute cold start. Keep this fast. + /// Try to do things concurrently, to hide the latencies. + fn start_compute(self: &Arc, pg_handle: &mut Option) -> Result<()> { + let compute_state: ComputeState; + + let start_compute_span; + let _this_entered; + { + let mut state_guard = self.state.lock().unwrap(); + + // Create a tracing span for the startup operation. + // + // We could otherwise just annotate the function with #[instrument], but if + // we're being configured from a /configure HTTP request, we want the + // startup to be considered part of the /configure request. + // + // Similarly, if a trace ID was passed in env variables, attach it to the span. + start_compute_span = { + // Temporarily enter the parent span, so that the new span becomes its child. + if let Some(p) = state_guard.startup_span.take() { + let _parent_entered = p.entered(); + tracing::info_span!("start_compute") + } else if let Some(otel_context) = startup_context_from_env() { + use tracing_opentelemetry::OpenTelemetrySpanExt; + let span = tracing::info_span!("start_compute"); + span.set_parent(otel_context); + span + } else { + tracing::info_span!("start_compute") + } + }; + _this_entered = start_compute_span.enter(); + + state_guard.set_status(ComputeStatus::Init, &self.state_changed); + compute_state = state_guard.clone() + } + + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + info!( + "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}", + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), + pspec.spec.operation_uuid.as_deref().unwrap_or("None"), + pspec.tenant_id, + pspec.timeline_id, + pspec.spec.features, + pspec.spec.remote_extensions, + ); + + ////// PRE-STARTUP PHASE: things that need to be finished before we start the Postgres process + + // Collect all the tasks that must finish here + let mut pre_tasks = 
tokio::task::JoinSet::new(); + + // Make sure TLS certificates are properly loaded and in the right place. + if self.compute_ctl_config.tls.is_some() { + let this = self.clone(); + pre_tasks.spawn(async move { + this.watch_cert_for_changes().await; + + Ok::<(), anyhow::Error>(()) + }); + } + + // If there are any remote extensions in shared_preload_libraries, start downloading them + if pspec.spec.remote_extensions.is_some() { + let (this, spec) = (self.clone(), pspec.spec.clone()); + pre_tasks.spawn(async move { + this.download_preload_extensions(&spec) + .in_current_span() + .await + }); + } + + // Prepare pgdata directory. This downloads the basebackup, among other things. + { + let (this, cs) = (self.clone(), compute_state.clone()); + pre_tasks.spawn_blocking_child(move || this.prepare_pgdata(&cs)); + } + + // Resize swap to the desired size if the compute spec says so + if let (Some(size_bytes), true) = + (pspec.spec.swap_size_bytes, self.params.resize_swap_on_bind) + { + pre_tasks.spawn_blocking_child(move || { + // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion + // *before* starting postgres. + // + // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this + // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets + // OOM-killed during startup because swap wasn't available yet. + resize_swap(size_bytes).context("failed to resize swap")?; + let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. 
+ info!(%size_bytes, %size_mib, "resized swap"); + + Ok::<(), anyhow::Error>(()) + }); + } + + // Set disk quota if the compute spec says so + if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = ( + pspec.spec.disk_quota_bytes, + self.params.set_disk_quota_for_fs.as_ref(), + ) { + let disk_quota_fs_mountpoint = disk_quota_fs_mountpoint.clone(); + pre_tasks.spawn_blocking_child(move || { + set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) + .context("failed to set disk quota")?; + let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%disk_quota_bytes, %size_mib, "set disk quota"); + + Ok::<(), anyhow::Error>(()) + }); + } + + // tune pgbouncer + if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { + info!("tuning pgbouncer"); + + let pgbouncer_settings = pgbouncer_settings.clone(); + let tls_config = self.compute_ctl_config.tls.clone(); + + // Spawn a background task to do the tuning, + // so that we don't block the main thread that starts Postgres. + let _handle = tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings, tls_config).await; + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + // Continue with the startup anyway + } + }); + } + + // configure local_proxy + if let Some(local_proxy) = &pspec.spec.local_proxy_config { + info!("configuring local_proxy"); + + // Spawn a background task to do the configuration, + // so that we don't block the main thread that starts Postgres. 
+ let local_proxy = local_proxy.clone(); + let _handle = tokio::spawn(async move { + if let Err(err) = local_proxy::configure(&local_proxy) { + error!("error while configuring local_proxy: {err:?}"); + // Continue with the startup anyway + } + }); + } + + // Configure and start rsyslog for HIPAA if necessary + if let ComputeAudit::Hipaa = pspec.spec.audit_log_level { + let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); + if remote_endpoint.is_empty() { + anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + } + + let log_directory_path = Path::new(&self.params.pgdata).join("log"); + let log_directory_path = log_directory_path.to_string_lossy().to_string(); + configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; + + // Launch a background task to clean up the audit logs + launch_pgaudit_gc(log_directory_path); + } + + // Configure and start rsyslog for Postgres logs export + if self.has_feature(ComputeFeature::PostgresLogsExport) { + if let Some(ref project_id) = pspec.spec.cluster.cluster_id { + let host = PostgresLogsRsyslogConfig::default_host(project_id); + let conf = PostgresLogsRsyslogConfig::new(Some(&host)); + configure_postgres_logs_export(conf)?; + } else { + warn!("not configuring rsyslog for Postgres logs export: project ID is missing") + } + } + + // Launch remaining service threads + let _monitor_handle = launch_monitor(self); + let _configurator_handle = launch_configurator(self); + + // Wait for all the pre-tasks to finish before starting postgres + let rt = tokio::runtime::Handle::current(); + while let Some(res) = rt.block_on(pre_tasks.join_next()) { + res??; + } + + ////// START POSTGRES + let start_time = Utc::now(); + let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; + let postmaster_pid = pg_process.pid(); + *pg_handle = Some(pg_process); + + // If this is a primary endpoint, perform some post-startup configuration before + // opening it up for the world. 
+ let config_time = Utc::now(); + if pspec.spec.mode == ComputeMode::Primary { + self.configure_as_primary(&compute_state)?; + + let conf = self.get_tokio_conn_conf(None); + tokio::task::spawn(async { + let res = get_installed_extensions(conf).await; + match res { + Ok(extensions) => { + info!( + "[NEON_EXT_STAT] {}", + serde_json::to_string(&extensions) + .expect("failed to serialize extensions list") + ); + } + Err(err) => error!("could not get installed extensions: {err:?}"), + } + }); + } + + // All done! + let startup_end_time = Utc::now(); + let metrics = { + let mut state = self.state.lock().unwrap(); + state.metrics.start_postgres_ms = config_time + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.config_ms = startup_end_time + .signed_duration_since(config_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.total_startup_ms = startup_end_time + .signed_duration_since(compute_state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.clone() + }; + self.set_status(ComputeStatus::Running); + + // Log metrics so that we can search for slow operations in logs + info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished"); + + Ok(()) + } + + #[instrument(skip_all)] + async fn download_preload_extensions(&self, spec: &ComputeSpec) -> Result<()> { + let remote_extensions = if let Some(remote_extensions) = &spec.remote_extensions { + remote_extensions + } else { + return Ok(()); + }; + + // First, create control files for all available extensions + extension_server::create_control_files(remote_extensions, &self.params.pgbin); + + let library_load_start_time = Utc::now(); + let remote_ext_metrics = self.prepare_preload_libraries(spec).await?; + + let library_load_time = Utc::now() + .signed_duration_since(library_load_start_time) + .to_std() + .unwrap() + .as_millis() as u64; + let mut state = self.state.lock().unwrap(); + state.metrics.load_ext_ms = 
library_load_time; + state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; + state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; + state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; + info!( + "Loading shared_preload_libraries took {:?}ms", + library_load_time + ); + info!("{:?}", remote_ext_metrics); + + Ok(()) + } + + /// Start the vm-monitor if directed to. The vm-monitor only runs on linux + /// because it requires cgroups. + fn start_vm_monitor(&self, disable_lfc_resizing: bool) -> StartVmMonitorResult { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + use std::env; + use tokio_util::sync::CancellationToken; + + // This token is used internally by the monitor to clean up all threads + let token = CancellationToken::new(); + + // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC + let pgconnstr = if disable_lfc_resizing { + None + } else { + Some(self.params.filecache_connstr.clone()) + }; + + let vm_monitor = if env::var_os("AUTOSCALING").is_some() { + let vm_monitor = tokio::spawn(vm_monitor::start( + Box::leak(Box::new(vm_monitor::Args { + cgroup: Some(self.params.cgroup.clone()), + pgconnstr, + addr: self.params.vm_monitor_addr.clone(), + })), + token.clone(), + )); + Some(vm_monitor) + } else { + None + }; + StartVmMonitorResult { token, vm_monitor } + } else { + _ = disable_lfc_resizing; // appease unused lint on macOS + StartVmMonitorResult { } + } + } + } + + fn cleanup_after_postgres_exit(&self) -> Result { + // Maybe sync safekeepers again, to speed up next startup + let compute_state = self.state.lock().unwrap().clone(); + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { + info!("syncing safekeepers on shutdown"); + let storage_auth_token = pspec.storage_auth_token.clone(); + let lsn = self.sync_safekeepers(storage_auth_token)?; + info!("synced 
safekeepers at lsn {lsn}"); + } + + let mut delay_exit = false; + let mut state = self.state.lock().unwrap(); + if state.status == ComputeStatus::TerminationPending { + state.status = ComputeStatus::Terminated; + self.state_changed.notify_all(); + // we were asked to terminate gracefully, don't exit to avoid restart + delay_exit = true + } + drop(state); + + if let Err(err) = self.check_for_core_dumps() { + error!("error while checking for core dumps: {err:?}"); + } + + Ok(delay_exit) + } + /// Check that compute node has corresponding feature enabled. pub fn has_feature(&self, feature: ComputeFeature) -> bool { let state = self.state.lock().unwrap(); @@ -356,9 +851,10 @@ impl ComputeNode { fn create_pgdata(&self) -> Result<()> { // Ignore removal error, likely it is a 'No such file or directory (os error 2)'. // If it is something different then create_dir() will error out anyway. - let _ok = fs::remove_dir_all(&self.pgdata); - fs::create_dir(&self.pgdata)?; - fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?; + let pgdata = &self.params.pgdata; + let _ok = fs::remove_dir_all(pgdata); + fs::create_dir(pgdata)?; + fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?; Ok(()) } @@ -423,7 +919,7 @@ impl ComputeNode { // sends an Error after finishing the tarball, we will not notice it. let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata)?; + ar.unpack(&self.params.pgdata)?; // Report metrics let mut state = self.state.lock().unwrap(); @@ -546,6 +1042,7 @@ impl ComputeNode { // Fast path for sync_safekeepers. If they're already synced we get the lsn // in one roundtrip. If not, we should do a full sync_safekeepers. 
+ #[instrument(skip_all)] pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result> { let start_time = Utc::now(); @@ -567,9 +1064,9 @@ impl ComputeNode { pub fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); - let mut sync_handle = maybe_cgexec(&self.pgbin) + let mut sync_handle = maybe_cgexec(&self.params.pgbin) .args(["--sync-safekeepers"]) - .env("PGDATA", &self.pgdata) // we cannot use -D in this mode + .env("PGDATA", &self.params.pgdata) // we cannot use -D in this mode .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { @@ -626,14 +1123,15 @@ impl ComputeNode { pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; - let pgdata_path = Path::new(&self.pgdata); + let pgdata_path = Path::new(&self.params.pgdata); // Remove/create an empty pgdata directory and put configuration there. 
self.create_pgdata()?; config::write_postgres_conf( - &pgdata_path.join("postgresql.conf"), + pgdata_path, &pspec.spec, - self.internal_http_port, + self.params.internal_http_port, + &self.compute_ctl_config.tls, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -733,12 +1231,15 @@ impl ComputeNode { info!("prewarming"); // Create pgdata - let pgdata = &format!("{}.warmup", self.pgdata); + let pgdata = &format!("{}.warmup", self.params.pgdata); create_pgdata(pgdata)?; // Run initdb to completion info!("running initdb"); - let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb"); + let initdb_bin = Path::new(&self.params.pgbin) + .parent() + .unwrap() + .join("initdb"); Command::new(initdb_bin) .args(["--pgdata", pgdata]) .output() @@ -754,7 +1255,7 @@ impl ComputeNode { // Start postgres info!("starting postgres"); - let mut pg = maybe_cgexec(&self.pgbin) + let mut pg = maybe_cgexec(&self.params.pgbin) .args(["-D", pgdata]) .spawn() .expect("cannot start postgres process"); @@ -776,19 +1277,17 @@ impl ComputeNode { Ok(()) } - /// Start Postgres as a child process and manage DBs/roles. - /// After that this will hang waiting on the postmaster process to exit. + /// Start Postgres as a child process and wait for it to start accepting + /// connections. + /// /// Returns a handle to the child process and a handle to the logs thread. #[instrument(skip_all)] - pub fn start_postgres( - &self, - storage_auth_token: Option, - ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { - let pgdata_path = Path::new(&self.pgdata); + pub fn start_postgres(&self, storage_auth_token: Option) -> Result { + let pgdata_path = Path::new(&self.params.pgdata); // Run postgres as a child process. 
- let mut pg = maybe_cgexec(&self.pgbin) - .args(["-D", &self.pgdata]) + let mut pg = maybe_cgexec(&self.params.pgbin) + .args(["-D", &self.params.pgdata]) .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { @@ -805,7 +1304,29 @@ impl ComputeNode { wait_for_postgres(&mut pg, pgdata_path)?; - Ok((pg, logs_handle)) + Ok(PostgresHandle { + postgres: pg, + log_collector: logs_handle, + }) + } + + /// Wait for the child Postgres process forever. In this state Ctrl+C will + /// propagate to Postgres and it will be shut down as well. + fn wait_postgres(&self, mut pg_handle: PostgresHandle) -> std::process::ExitStatus { + info!(postmaster_pid = %pg_handle.postgres.id(), "Waiting for Postgres to exit"); + + let ecode = pg_handle + .postgres + .wait() + .expect("failed to start waiting on Postgres process"); + PG_PID.store(0, Ordering::SeqCst); + + // Process has exited. Wait for the log collecting task to finish. + let _ = tokio::runtime::Handle::current() + .block_on(pg_handle.log_collector) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); + + ecode } /// Do post configuration of the already started Postgres. This function spawns a background task to @@ -915,387 +1436,6 @@ impl ComputeNode { Ok(client) } - /// Apply the spec to the running PostgreSQL instance. - /// The caller can decide to run with multiple clients in parallel, or - /// single mode. Either way, the commands executed will be the same, and - /// only commands run in different databases are parallelized. - #[instrument(skip_all)] - pub fn apply_spec_sql( - &self, - spec: Arc, - conf: Arc, - concurrency: usize, - ) -> Result<()> { - info!("Applying config with max {} concurrency", concurrency); - debug!("Config: {:?}", spec); - - let rt = tokio::runtime::Handle::current(); - rt.block_on(async { - // Proceed with post-startup configuration. Note, that order of operations is important. 
- let client = Self::get_maintenance_client(&conf).await?; - let spec = spec.clone(); - - let databases = get_existing_dbs_async(&client).await?; - let roles = get_existing_roles_async(&client) - .await? - .into_iter() - .map(|role| (role.name.clone(), role)) - .collect::>(); - - // Check if we need to drop subscriptions before starting the endpoint. - // - // It is important to do this operation exactly once when endpoint starts on a new branch. - // Otherwise, we may drop not inherited, but newly created subscriptions. - // - // We cannot rely only on spec.drop_subscriptions_before_start flag, - // because if for some reason compute restarts inside VM, - // it will start again with the same spec and flag value. - // - // To handle this, we save the fact of the operation in the database - // in the neon.drop_subscriptions_done table. - // If the table does not exist, we assume that the operation was never performed, so we must do it. - // If table exists, we check if the operation was performed on the current timelilne. 
- // - let mut drop_subscriptions_done = false; - - if spec.drop_subscriptions_before_start { - let timeline_id = self.get_timeline_id().context("timeline_id must be set")?; - let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id); - - info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); - - drop_subscriptions_done = match - client.simple_query(&query).await { - Ok(result) => { - matches!(&result[0], postgres::SimpleQueryMessage::Row(_)) - }, - Err(e) => - { - match e.code() { - Some(&SqlState::UNDEFINED_TABLE) => false, - _ => { - // We don't expect any other error here, except for the schema/table not existing - error!("Error checking if drop subscription operation was already performed: {}", e); - return Err(e.into()); - } - } - } - } - }; - - - let jwks_roles = Arc::new( - spec.as_ref() - .local_proxy_config - .iter() - .flat_map(|it| &it.jwks) - .flatten() - .flat_map(|setting| &setting.role_names) - .cloned() - .collect::>(), - ); - - let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext { - roles, - dbs: databases, - })); - - // Apply special pre drop database phase. - // NOTE: we use the code of RunInEachDatabase phase for parallelism - // and connection management, but we don't really run it in *each* database, - // only in databases, we're about to drop. - info!("Applying PerDatabase (pre-dropdb) phase"); - let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); - - // Run the phase for each database that we're about to drop. 
- let db_processes = spec - .delta_operations - .iter() - .flatten() - .filter_map(move |op| { - if op.action.as_str() == "delete_db" { - Some(op.name.clone()) - } else { - None - } - }) - .map(|dbname| { - let spec = spec.clone(); - let ctx = ctx.clone(); - let jwks_roles = jwks_roles.clone(); - let mut conf = conf.as_ref().clone(); - let concurrency_token = concurrency_token.clone(); - // We only need dbname field for this phase, so set other fields to dummy values - let db = DB::UserDB(Database { - name: dbname.clone(), - owner: "cloud_admin".to_string(), - options: None, - restrict_conn: false, - invalid: false, - }); - - debug!("Applying per-database phases for Database {:?}", &db); - - match &db { - DB::SystemDB => {} - DB::UserDB(db) => { - conf.dbname(db.name.as_str()); - } - } - - let conf = Arc::new(conf); - let fut = Self::apply_spec_sql_db( - spec.clone(), - conf, - ctx.clone(), - jwks_roles.clone(), - concurrency_token.clone(), - db, - [DropLogicalSubscriptions].to_vec(), - ); - - Ok(spawn(fut)) - }) - .collect::>>(); - - for process in db_processes.into_iter() { - let handle = process?; - if let Err(e) = handle.await? { - // Handle the error case where the database does not exist - // We do not check whether the DB exists or not in the deletion phase, - // so we shouldn't be strict about it in pre-deletion cleanup as well. 
- if e.to_string().contains("does not exist") { - warn!("Error dropping subscription: {}", e); - } else { - return Err(e); - } - }; - } - - for phase in [ - CreateSuperUser, - DropInvalidDatabases, - RenameRoles, - CreateAndAlterRoles, - RenameAndDeleteDatabases, - CreateAndAlterDatabases, - CreateSchemaNeon, - ] { - info!("Applying phase {:?}", &phase); - apply_operations( - spec.clone(), - ctx.clone(), - jwks_roles.clone(), - phase, - || async { Ok(&client) }, - ) - .await?; - } - - info!("Applying RunInEachDatabase2 phase"); - let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); - - let db_processes = spec - .cluster - .databases - .iter() - .map(|db| DB::new(db.clone())) - // include - .chain(once(DB::SystemDB)) - .map(|db| { - let spec = spec.clone(); - let ctx = ctx.clone(); - let jwks_roles = jwks_roles.clone(); - let mut conf = conf.as_ref().clone(); - let concurrency_token = concurrency_token.clone(); - let db = db.clone(); - - debug!("Applying per-database phases for Database {:?}", &db); - - match &db { - DB::SystemDB => {} - DB::UserDB(db) => { - conf.dbname(db.name.as_str()); - } - } - - let conf = Arc::new(conf); - let mut phases = vec![ - DeleteDBRoleReferences, - ChangeSchemaPerms, - ]; - - if spec.drop_subscriptions_before_start && !drop_subscriptions_done { - info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); - phases.push(DropLogicalSubscriptions); - } - - let fut = Self::apply_spec_sql_db( - spec.clone(), - conf, - ctx.clone(), - jwks_roles.clone(), - concurrency_token.clone(), - db, - phases, - ); - - Ok(spawn(fut)) - }) - .collect::>>(); - - for process in db_processes.into_iter() { - let handle = process?; - handle.await??; - } - - let mut phases = vec![ - HandleOtherExtensions, - HandleNeonExtension, // This step depends on CreateSchemaNeon - CreateAvailabilityCheck, - DropRoles, - ]; - - // This step depends on CreateSchemaNeon - if spec.drop_subscriptions_before_start && 
!drop_subscriptions_done { - info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); - phases.push(FinalizeDropLogicalSubscriptions); - } - - for phase in phases { - debug!("Applying phase {:?}", &phase); - apply_operations( - spec.clone(), - ctx.clone(), - jwks_roles.clone(), - phase, - || async { Ok(&client) }, - ) - .await?; - } - - Ok::<(), anyhow::Error>(()) - })?; - - Ok(()) - } - - /// Apply SQL migrations of the RunInEachDatabase phase. - /// - /// May opt to not connect to databases that don't have any scheduled - /// operations. The function is concurrency-controlled with the provided - /// semaphore. The caller has to make sure the semaphore isn't exhausted. - async fn apply_spec_sql_db( - spec: Arc, - conf: Arc, - ctx: Arc>, - jwks_roles: Arc>, - concurrency_token: Arc, - db: DB, - subphases: Vec, - ) -> Result<()> { - let _permit = concurrency_token.acquire().await?; - - let mut client_conn = None; - - for subphase in subphases { - apply_operations( - spec.clone(), - ctx.clone(), - jwks_roles.clone(), - RunInEachDatabase { - db: db.clone(), - subphase, - }, - // Only connect if apply_operation actually wants a connection. - // It's quite possible this database doesn't need any queries, - // so by not connecting we save time and effort connecting to - // that database. - || async { - if client_conn.is_none() { - let db_client = Self::get_maintenance_client(&conf).await?; - client_conn.replace(db_client); - } - let client = client_conn.as_ref().unwrap(); - Ok(client) - }, - ) - .await?; - } - - drop(client_conn); - - Ok::<(), anyhow::Error>(()) - } - - /// Choose how many concurrent connections to use for applying the spec changes. - pub fn max_service_connections( - &self, - compute_state: &ComputeState, - spec: &ComputeSpec, - ) -> usize { - // If the cluster is in Init state we don't have to deal with user connections, - // and can thus use all `max_connections` connection slots. 
However, that's generally not - // very efficient, so we generally still limit it to a smaller number. - if compute_state.status == ComputeStatus::Init { - // If the settings contain 'max_connections', use that as template - if let Some(config) = spec.cluster.settings.find("max_connections") { - config.parse::().ok() - } else { - // Otherwise, try to find the setting in the postgresql_conf string - spec.cluster - .postgresql_conf - .iter() - .flat_map(|conf| conf.split("\n")) - .filter_map(|line| { - if !line.contains("max_connections") { - return None; - } - - let (key, value) = line.split_once("=")?; - let key = key - .trim_start_matches(char::is_whitespace) - .trim_end_matches(char::is_whitespace); - - let value = value - .trim_start_matches(char::is_whitespace) - .trim_end_matches(char::is_whitespace); - - if key != "max_connections" { - return None; - } - - value.parse::().ok() - }) - .next() - } - // If max_connections is present, use at most 1/3rd of that. - // When max_connections is lower than 30, try to use at least 10 connections, but - // never more than max_connections. - .map(|limit| match limit { - 0..10 => limit, - 10..30 => 10, - 30.. => limit / 3, - }) - // If we didn't find max_connections, default to 10 concurrent connections. - .unwrap_or(10) - } else { - // state == Running - // Because the cluster is already in the Running state, we should assume users are - // already connected to the cluster, and high concurrency could negatively - // impact user connectivity. Therefore, we can limit concurrency to the number of - // reserved superuser connections, which users wouldn't be able to use anyway. - spec.cluster - .settings - .find("superuser_reserved_connections") - .iter() - .filter_map(|val| val.parse::().ok()) - .map(|val| if val > 1 { val - 1 } else { 1 }) - .last() - .unwrap_or(3) - } - } - /// Do initial configuration of the already started Postgres. 
#[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { @@ -1316,7 +1456,7 @@ impl ComputeNode { // Merge-apply spec & changes to PostgreSQL state. self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?; - if let Some(ref local_proxy) = &spec.clone().local_proxy_config { + if let Some(local_proxy) = &spec.clone().local_proxy_config { info!("configuring local_proxy"); local_proxy::configure(local_proxy).context("apply_config local_proxy")?; } @@ -1353,9 +1493,12 @@ impl ComputeNode { // `pg_ctl` for start / stop. #[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { - let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); + let pgctl_bin = Path::new(&self.params.pgbin) + .parent() + .unwrap() + .join("pg_ctl"); Command::new(pgctl_bin) - .args(["reload", "-D", &self.pgdata]) + .args(["reload", "-D", &self.params.pgdata]) .output() .expect("cannot run pg_ctl process"); Ok(()) @@ -1370,11 +1513,13 @@ impl ComputeNode { if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); + let pgbouncer_settings = pgbouncer_settings.clone(); + let tls_config = self.compute_ctl_config.tls.clone(); + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. - let pgbouncer_settings = pgbouncer_settings.clone(); tokio::spawn(async move { - let res = tune_pgbouncer(pgbouncer_settings).await; + let res = tune_pgbouncer(pgbouncer_settings, tls_config).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -1386,7 +1531,8 @@ impl ComputeNode { // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. 
- let local_proxy = local_proxy.clone(); + let mut local_proxy = local_proxy.clone(); + local_proxy.tls = self.compute_ctl_config.tls.clone(); tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); @@ -1395,9 +1541,13 @@ impl ComputeNode { } // Write new config - let pgdata_path = Path::new(&self.pgdata); - let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?; + let pgdata_path = Path::new(&self.params.pgdata); + config::write_postgres_conf( + pgdata_path, + &spec, + self.params.internal_http_port, + &self.compute_ctl_config.tls, + )?; if !spec.skip_pg_catalog_updates { let max_concurrent_connections = spec.reconfigure_concurrency; @@ -1408,7 +1558,8 @@ impl ComputeNode { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + let mut conf = + tokio_postgres::Config::from_str(self.params.connstr.as_str()).unwrap(); conf.application_name("apply_config"); let conf = Arc::new(conf); @@ -1434,164 +1585,87 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute( - &self, - ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { - let compute_state = self.state.lock().unwrap().clone(); + pub fn configure_as_primary(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - info!( - "starting compute for project {}, operation {}, tenant {}, timeline {}", - pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), - pspec.spec.operation_uuid.as_deref().unwrap_or("None"), - pspec.tenant_id, - pspec.timeline_id, - ); - // tune pgbouncer - if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { - info!("tuning pgbouncer"); - - // Spawn a background task to do the tuning, - // so that we don't block the main 
thread that starts Postgres. - let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = tokio::spawn(async move { - let res = tune_pgbouncer(pgbouncer_settings).await; - if let Err(err) = res { - error!("error while tuning pgbouncer: {err:?}"); - } - }); - } - - if let Some(local_proxy) = &pspec.spec.local_proxy_config { - info!("configuring local_proxy"); - - // Spawn a background task to do the configuration, - // so that we don't block the main thread that starts Postgres. - let local_proxy = local_proxy.clone(); - let _handle = tokio::spawn(async move { - if let Err(err) = local_proxy::configure(&local_proxy) { - error!("error while configuring local_proxy: {err:?}"); - } - }); - } - - info!( - "start_compute spec.remote_extensions {:?}", - pspec.spec.remote_extensions - ); - - // This part is sync, because we need to download - // remote shared_preload_libraries before postgres start (if any) - if let Some(remote_extensions) = &pspec.spec.remote_extensions { - // First, create control files for all availale extensions - extension_server::create_control_files(remote_extensions, &self.pgbin); - - let library_load_start_time = Utc::now(); - let rt = tokio::runtime::Handle::current(); - let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?; - - let library_load_time = Utc::now() - .signed_duration_since(library_load_start_time) - .to_std() - .unwrap() - .as_millis() as u64; - let mut state = self.state.lock().unwrap(); - state.metrics.load_ext_ms = library_load_time; - state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; - state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; - state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; - info!( - "Loading shared_preload_libraries took {:?}ms", - library_load_time - ); - info!("{:?}", remote_ext_metrics); - } - - self.prepare_pgdata(&compute_state)?; - - let start_time = Utc::now(); - let pg_process = 
self.start_postgres(pspec.storage_auth_token.clone())?; - - let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary { - if !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... - config::with_compute_ctl_tmp_override( - pgdata_path, - "neon.max_cluster_size=-1", - || { - self.pg_reload_conf()?; - - self.apply_config(&compute_state)?; - - Ok(()) - }, - )?; - - let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - if config::line_in_file( - &postgresql_conf_path, - "neon.disable_logical_replication_subscribers=false", - )? { - info!("updated postgresql.conf to set neon.disable_logical_replication_subscribers=false"); - } + assert!(pspec.spec.mode == ComputeMode::Primary); + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.params.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { self.pg_reload_conf()?; - } - self.post_apply_config()?; - let conf = self.get_conn_conf(None); - tokio::task::spawn_blocking(|| { - let res = get_installed_extensions(conf); - match res { - Ok(extensions) => { - info!( - "[NEON_EXT_STAT] {}", - serde_json::to_string(&extensions) - .expect("failed to serialize extensions list") - ); + self.apply_config(compute_state)?; + + Ok(()) + })?; + + let postgresql_conf_path = pgdata_path.join("postgresql.conf"); + if config::line_in_file( + &postgresql_conf_path, + "neon.disable_logical_replication_subscribers=false", + )? 
{ + info!( + "updated postgresql.conf to set neon.disable_logical_replication_subscribers=false" + ); + } + self.pg_reload_conf()?; + } + self.post_apply_config()?; + + Ok(()) + } + + pub async fn watch_cert_for_changes(self: Arc) { + // update status on cert renewal + if let Some(tls_config) = &self.compute_ctl_config.tls { + let tls_config = tls_config.clone(); + + // wait until the cert exists. + let mut cert_watch = watch_cert_for_changes(tls_config.cert_path.clone()).await; + + tokio::task::spawn_blocking(move || { + let handle = tokio::runtime::Handle::current(); + 'cert_update: loop { + // let postgres/pgbouncer/local_proxy know the new cert/key exists. + // we need to wait until it's configurable first. + + let mut state = self.state.lock().unwrap(); + 'status_update: loop { + match state.status { + // let's update the state to config pending + ComputeStatus::ConfigurationPending | ComputeStatus::Running => { + state.set_status( + ComputeStatus::ConfigurationPending, + &self.state_changed, + ); + break 'status_update; + } + + // exit loop + ComputeStatus::Failed + | ComputeStatus::TerminationPending + | ComputeStatus::Terminated => break 'cert_update, + + // wait + ComputeStatus::Init + | ComputeStatus::Configuration + | ComputeStatus::Empty => { + state = self.state_changed.wait(state).unwrap(); + } + } + } + drop(state); + + // wait for a new certificate update + if handle.block_on(cert_watch.changed()).is_err() { + break; } - Err(err) => error!("could not get installed extensions: {err:?}"), } }); } - - let startup_end_time = Utc::now(); - { - let mut state = self.state.lock().unwrap(); - state.metrics.start_postgres_ms = config_time - .signed_duration_since(start_time) - .to_std() - .unwrap() - .as_millis() as u64; - state.metrics.config_ms = startup_end_time - .signed_duration_since(config_time) - .to_std() - .unwrap() - .as_millis() as u64; - state.metrics.total_startup_ms = startup_end_time - .signed_duration_since(compute_state.start_time) - 
.to_std() - .unwrap() - .as_millis() as u64; - } - self.set_status(ComputeStatus::Running); - - info!( - "finished configuration of compute for project {}", - pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None") - ); - - // Log metrics so that we can search for slow operations in logs - let metrics = { - let state = self.state.lock().unwrap(); - state.metrics.clone() - }; - info!(?metrics, "compute start finished"); - - Ok(pg_process) } /// Update the `last_active` in the shared state, but ensure that it's a more recent one. @@ -1620,7 +1694,7 @@ impl ComputeNode { pub fn check_for_core_dumps(&self) -> Result<()> { let core_dump_dir = match std::env::consts::OS { "macos" => Path::new("/cores/"), - _ => Path::new(&self.pgdata), + _ => Path::new(&self.params.pgdata), }; // Collect core dump paths if any @@ -1650,7 +1724,7 @@ impl ComputeNode { // Try first with gdb let backtrace = Command::new("gdb") - .args(["--batch", "-q", "-ex", "bt", &self.pgbin]) + .args(["--batch", "-q", "-ex", "bt", &self.params.pgbin]) .arg(&core_path) .output(); @@ -1727,7 +1801,8 @@ LIMIT 100", ext_path: RemotePath, ) -> Result { let ext_remote_storage = - self.ext_remote_storage + self.params + .ext_remote_storage .as_ref() .ok_or(DownloadError::BadInput(anyhow::anyhow!( "Remote extensions storage is not configured", @@ -1763,7 +1838,9 @@ LIMIT 100", info!("extension already downloaded, skipping re-download"); return Ok(0); } else if start_time_delta < HANG_TIMEOUT && !first_try { - info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout"); + info!( + "download {ext_archive_name} already started by another process, hanging untill completion or timeout" + ); let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500)); loop { info!("waiting for download"); @@ -1788,7 +1865,7 @@ LIMIT 100", &real_ext_name, &ext_path, ext_remote_storage, - &self.pgbin, + &self.params.pgbin, ) .await .map_err(DownloadError::Other); 
@@ -1896,7 +1973,7 @@ LIMIT 100", &self, spec: &ComputeSpec, ) -> Result { - if self.ext_remote_storage.is_none() { + if self.params.ext_remote_storage.is_none() { return Ok(RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, @@ -1947,8 +2024,12 @@ LIMIT 100", let mut download_tasks = Vec::new(); for library in &libs_vec { - let (ext_name, ext_path) = - remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?; + let (ext_name, ext_path) = remote_extensions.get_ext( + library, + true, + &self.params.build_tag, + &self.params.pgversion, + )?; download_tasks.push(self.download_extension(ext_name, ext_path)); } let results = join_all(download_tasks).await; @@ -2025,3 +2106,26 @@ pub fn forward_termination_signal() { kill(pg_pid, Signal::SIGINT).ok(); } } + +// helper trait to call JoinSet::spawn_blocking(f), but propagates the current +// tracing span to the thread. +trait JoinSetExt { + fn spawn_blocking_child(&mut self, f: F) -> tokio::task::AbortHandle + where + F: FnOnce() -> T + Send + 'static, + T: Send; +} + +impl JoinSetExt for tokio::task::JoinSet { + fn spawn_blocking_child(&mut self, f: F) -> tokio::task::AbortHandle + where + F: FnOnce() -> T + Send + 'static, + T: Send, + { + let sp = tracing::Span::current(); + self.spawn_blocking(move || { + let _e = sp.enter(); + f() + }) + } +} diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index e1bdfffa54..290632e4cd 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -1,13 +1,18 @@ +use anyhow::Result; +use std::fmt::Write as FmtWrite; use std::fs::{File, OpenOptions}; use std::io; +use std::io::Write; use std::io::prelude::*; use std::path::Path; -use anyhow::Result; +use compute_api::responses::TlsConfig; +use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption}; -use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; -use 
compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; +use crate::pg_helpers::{ + GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value, +}; +use crate::tls::{self, SERVER_CRT, SERVER_KEY}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -35,10 +40,12 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { /// Create or completely rewrite configuration file specified by `path` pub fn write_postgres_conf( - path: &Path, + pgdata_path: &Path, spec: &ComputeSpec, extension_server_port: u16, + tls_config: &Option, ) -> Result<()> { + let path = pgdata_path.join("postgresql.conf"); // File::create() destroys the file content if it exists. let mut file = File::create(path)?; @@ -56,10 +63,20 @@ pub fn write_postgres_conf( writeln!(file, "neon.stripe_size={stripe_size}")?; } if !spec.safekeeper_connstrings.is_empty() { + let mut neon_safekeepers_value = String::new(); + tracing::info!( + "safekeepers_connstrings is not zero, gen: {:?}", + spec.safekeepers_generation + ); + // If generation is given, prepend sk list with g#number: + if let Some(generation) = spec.safekeepers_generation { + write!(neon_safekeepers_value, "g#{}:", generation)?; + } + neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(",")); writeln!( file, "neon.safekeepers={}", - escape_conf_value(&spec.safekeeper_connstrings.join(",")) + escape_conf_value(&neon_safekeepers_value) )?; } if let Some(s) = &spec.tenant_id { @@ -73,6 +90,20 @@ pub fn write_postgres_conf( )?; } + // tls + if let Some(tls_config) = tls_config { + writeln!(file, "ssl = on")?; + + // postgres requires the keyfile to be in a secure file, + // currently too complicated to ensure that at the VM level, + // so we just copy them to another file instead. :shrug: + tls::update_key_path_blocking(pgdata_path, tls_config); + + // these are the default, but good to be explicit. 
+ writeln!(file, "ssl_cert_file = '{}'", SERVER_CRT)?; + writeln!(file, "ssl_key_file = '{}'", SERVER_KEY)?; + } + // Locales if cfg!(target_os = "macos") { writeln!(file, "lc_messages='C'")?; @@ -127,6 +158,55 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl: end")?; } + // If audit logging is enabled, configure pgaudit. + // + // Note, that this is called after the settings from spec are written. + // This way we always override the settings from the spec + // and don't allow the user or the control plane admin to change them. + if let ComputeAudit::Hipaa = spec.audit_log_level { + writeln!(file, "# Managed by compute_ctl audit settings: begin")?; + // This log level is very verbose + // but this is necessary for HIPAA compliance. + // Exclude 'misc' category, because it doesn't contain anythig relevant. + writeln!(file, "pgaudit.log='all, -misc'")?; + writeln!(file, "pgaudit.log_parameter=on")?; + // Disable logging of catalog queries + // The catalog doesn't contain sensitive data, so we don't need to audit it. + writeln!(file, "pgaudit.log_catalog=off")?; + // Set log rotation to 5 minutes + // TODO: tune this after performance testing + writeln!(file, "pgaudit.log_rotation_age=5")?; + + // Add audit shared_preload_libraries, if they are not present. + // + // The caller who sets the flag is responsible for ensuring that the necessary + // shared_preload_libraries are present in the compute image, + // otherwise the compute start will fail. 
+ if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + let mut extra_shared_preload_libraries = String::new(); + if !libs.contains("pgaudit") { + extra_shared_preload_libraries.push_str(",pgaudit"); + } + if !libs.contains("pgauditlogtofile") { + extra_shared_preload_libraries.push_str(",pgauditlogtofile"); + } + writeln!( + file, + "shared_preload_libraries='{}{}'", + libs, extra_shared_preload_libraries + )?; + } else { + // Typically, this should be unreacheable, + // because we always set at least some shared_preload_libraries in the spec + // but let's handle it explicitly anyway. + writeln!( + file, + "shared_preload_libraries='neon,pgaudit,pgauditlogtofile'" + )?; + } + writeln!(file, "# Managed by compute_ctl audit settings: end")?; + } + writeln!(file, "neon.extension_server_port={}", extension_server_port)?; if spec.drop_subscriptions_before_start { @@ -136,6 +216,12 @@ pub fn write_postgres_conf( writeln!(file, "neon.disable_logical_replication_subscribers=false")?; } + // We need Postgres to send logs to rsyslog so that we can forward them + // further to customers' log aggregation systems. + if spec.features.contains(&ComputeFeature::PostgresLogsExport) { + writeln!(file, "log_destination='stderr,syslog'")?; + } + // This is essential to keep this line at the end of the file, // because it is intended to override any settings above. 
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?; diff --git a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf new file mode 100644 index 0000000000..9ca7e36738 --- /dev/null +++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf @@ -0,0 +1,11 @@ +# Load imfile module to read log files +module(load="imfile") + +# Input configuration for log files in the specified directory +# Replace {log_directory} with the directory containing the log files +input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0") +# the directory to store rsyslog state files +global(workDirectory="/var/log/rsyslog") + +# Forward logs to remote syslog server +*.* @@{remote_endpoint} diff --git a/compute_tools/src/config_template/compute_rsyslog_postgres_export_template.conf b/compute_tools/src/config_template/compute_rsyslog_postgres_export_template.conf new file mode 100644 index 0000000000..2580b61fea --- /dev/null +++ b/compute_tools/src/config_template/compute_rsyslog_postgres_export_template.conf @@ -0,0 +1,10 @@ +# Program name comes from postgres' syslog_facility configuration: https://www.postgresql.org/docs/current/runtime-config-logging.html#GUC-SYSLOG-IDENT +# Default value is 'postgres'. 
+if $programname == 'postgres' then {{ + # Forward Postgres logs to telemetry otel collector + action(type="omfwd" target="{logs_export_target}" port="{logs_export_port}" protocol="tcp" + template="RSYSLOG_SyslogProtocol23Format" + action.resumeRetryCount="3" + queue.type="linkedList" queue.size="1000") + stop +}} diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index d88f26ca20..d97bd37285 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -1,9 +1,8 @@ use std::sync::Arc; use std::thread; -use tracing::{error, info, instrument}; - use compute_api::responses::ComputeStatus; +use tracing::{error, info, instrument}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/disk_quota.rs b/compute_tools/src/disk_quota.rs index e838c5b9fd..1353ab938d 100644 --- a/compute_tools/src/disk_quota.rs +++ b/compute_tools/src/disk_quota.rs @@ -1,9 +1,11 @@ use anyhow::Context; +use tracing::instrument; pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota"; /// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes. /// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set. 
+#[instrument] pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> { let size_kb = size_bytes / 1024; // run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}` diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 00f46386e7..ee889e0c40 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -71,15 +71,15 @@ More specifically, here is an example ext_index.json } } */ -use anyhow::Result; -use anyhow::{bail, Context}; +use std::path::Path; +use std::str; + +use anyhow::{Context, Result, bail}; use bytes::Bytes; use compute_api::spec::RemoteExtSpec; use regex::Regex; use remote_storage::*; use reqwest::StatusCode; -use std::path::Path; -use std::str; use tar::Archive; use tracing::info; use tracing::log::warn; @@ -202,8 +202,24 @@ pub async fn download_extension( // move contents of the libdir / sharedir in unzipped archive to the correct local paths for paths in [sharedir_paths, libdir_paths] { let (zip_dir, real_dir) = paths; + + let dir = match std::fs::read_dir(&zip_dir) { + Ok(dir) => dir, + Err(e) => match e.kind() { + // In the event of a SQL-only extension, there would be nothing + // to move from the lib/ directory, so note that in the log and + // move on. + std::io::ErrorKind::NotFound => { + info!("nothing to move from {}", zip_dir); + continue; + } + _ => return Err(anyhow::anyhow!(e)), + }, + }; + info!("mv {zip_dir:?}/* {real_dir:?}"); - for file in std::fs::read_dir(zip_dir)? { + + for file in dir { let old_file = file?.path(); let new_file = Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?); @@ -244,33 +260,40 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { info!("writing file {:?}{:?}", control_path, control_content); std::fs::write(control_path, control_content).unwrap(); } else { - warn!("control file {:?} exists both locally and remotely. 
ignoring the remote version.", control_path); + warn!( + "control file {:?} exists both locally and remotely. ignoring the remote version.", + control_path + ); } } } } -// Do request to extension storage proxy, i.e. +// Do request to extension storage proxy, e.g., // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst -// using HHTP GET -// and return the response body as bytes -// +// using HTTP GET and return the response body as bytes. async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { let uri = format!("{}/{}", ext_remote_storage, ext_path); + let filename = Path::new(ext_path) + .file_name() + .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) + .to_str() + .unwrap_or("unknown") + .to_string(); - info!("Download extension {} from uri {}", ext_path, uri); + info!("Downloading extension file '{}' from uri {}", filename, uri); match do_extension_server_request(&uri).await { Ok(resp) => { info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL - .with_label_values(&[&StatusCode::OK.to_string()]) + .with_label_values(&[&StatusCode::OK.to_string(), &filename]) .inc(); Ok(resp) } Err((msg, status)) => { REMOTE_EXT_REQUESTS_TOTAL - .with_label_values(&[&status]) + .with_label_values(&[&status, &filename]) .inc(); bail!(msg); } diff --git a/compute_tools/src/http/extract/json.rs b/compute_tools/src/http/extract/json.rs index 104cc25d5f..1d32e4ff37 100644 --- a/compute_tools/src/http/extract/json.rs +++ b/compute_tools/src/http/extract/json.rs @@ -1,6 +1,7 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::JsonRejection, FromRequest, Request}; +use axum::extract::rejection::JsonRejection; +use axum::extract::{FromRequest, Request}; use compute_api::responses::GenericAPIError; use http::StatusCode; diff --git a/compute_tools/src/http/extract/mod.rs b/compute_tools/src/http/extract/mod.rs index 1b690e444d..589681cfe2 100644 --- a/compute_tools/src/http/extract/mod.rs 
+++ b/compute_tools/src/http/extract/mod.rs @@ -1,7 +1,9 @@ pub(crate) mod json; pub(crate) mod path; pub(crate) mod query; +pub(crate) mod request_id; pub(crate) use json::Json; pub(crate) use path::Path; pub(crate) use query::Query; +pub(crate) use request_id::RequestId; diff --git a/compute_tools/src/http/extract/path.rs b/compute_tools/src/http/extract/path.rs index 09637a96a4..45970cff3d 100644 --- a/compute_tools/src/http/extract/path.rs +++ b/compute_tools/src/http/extract/path.rs @@ -1,8 +1,10 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::PathRejection, FromRequestParts}; +use axum::extract::FromRequestParts; +use axum::extract::rejection::PathRejection; use compute_api::responses::GenericAPIError; -use http::{request::Parts, StatusCode}; +use http::StatusCode; +use http::request::Parts; /// Custom `Path` extractor, so that we can format errors into /// `JsonResponse`. diff --git a/compute_tools/src/http/extract/query.rs b/compute_tools/src/http/extract/query.rs index 9dec3642cf..b8079ea770 100644 --- a/compute_tools/src/http/extract/query.rs +++ b/compute_tools/src/http/extract/query.rs @@ -1,8 +1,10 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::QueryRejection, FromRequestParts}; +use axum::extract::FromRequestParts; +use axum::extract::rejection::QueryRejection; use compute_api::responses::GenericAPIError; -use http::{request::Parts, StatusCode}; +use http::StatusCode; +use http::request::Parts; /// Custom `Query` extractor, so that we can format errors into /// `JsonResponse`. 
diff --git a/compute_tools/src/http/extract/request_id.rs b/compute_tools/src/http/extract/request_id.rs new file mode 100644 index 0000000000..d911921a05 --- /dev/null +++ b/compute_tools/src/http/extract/request_id.rs @@ -0,0 +1,86 @@ +use std::{ + fmt::Display, + ops::{Deref, DerefMut}, +}; + +use axum::{extract::FromRequestParts, response::IntoResponse}; +use http::{StatusCode, request::Parts}; + +use crate::http::{JsonResponse, headers::X_REQUEST_ID}; + +/// Extract the request ID from the `X-Request-Id` header. +#[derive(Debug, Clone, Default)] +pub(crate) struct RequestId(pub String); + +#[derive(Debug)] +/// Rejection used for [`RequestId`]. +/// +/// Contains one variant for each way the [`RequestId`] extractor can +/// fail. +pub(crate) enum RequestIdRejection { + /// The request is missing the header. + MissingRequestId, + + /// The value of the header is invalid UTF-8. + InvalidUtf8, +} + +impl RequestIdRejection { + pub fn status(&self) -> StatusCode { + match self { + RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR, + RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST, + } + } + + pub fn message(&self) -> String { + match self { + RequestIdRejection::MissingRequestId => "request ID is missing", + RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8", + } + .to_string() + } +} + +impl IntoResponse for RequestIdRejection { + fn into_response(self) -> axum::response::Response { + JsonResponse::error(self.status(), self.message()) + } +} + +impl FromRequestParts for RequestId +where + S: Send + Sync, +{ + type Rejection = RequestIdRejection; + + async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result { + match parts.headers.get(X_REQUEST_ID) { + Some(value) => match value.to_str() { + Ok(request_id) => Ok(Self(request_id.to_string())), + Err(_) => Err(RequestIdRejection::InvalidUtf8), + }, + None => Err(RequestIdRejection::MissingRequestId), + } + } +} + +impl Deref for RequestId { + type Target = 
String; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for RequestId { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl Display for RequestId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(&self.0) + } +} diff --git a/compute_tools/src/http/headers.rs b/compute_tools/src/http/headers.rs new file mode 100644 index 0000000000..a11638e203 --- /dev/null +++ b/compute_tools/src/http/headers.rs @@ -0,0 +1,2 @@ +/// Constant for `X-Request-Id` header. +pub const X_REQUEST_ID: &str = "x-request-id"; diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs new file mode 100644 index 0000000000..798dd1179b --- /dev/null +++ b/compute_tools/src/http/middleware/authorize.rs @@ -0,0 +1,145 @@ +use std::{collections::HashSet, net::SocketAddr}; + +use anyhow::{Result, anyhow}; +use axum::{RequestExt, body::Body, extract::ConnectInfo}; +use axum_extra::{ + TypedHeader, + headers::{Authorization, authorization::Bearer}, +}; +use futures::future::BoxFuture; +use http::{Request, Response, StatusCode}; +use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; +use serde::Deserialize; +use tower_http::auth::AsyncAuthorizeRequest; +use tracing::warn; + +use crate::http::{JsonResponse, extract::RequestId}; + +#[derive(Clone, Debug, Deserialize)] +pub(in crate::http) struct Claims { + compute_id: String, +} + +#[derive(Clone, Debug)] +pub(in crate::http) struct Authorize { + compute_id: String, + jwks: JwkSet, + validation: Validation, +} + +impl Authorize { + pub fn new(compute_id: String, jwks: JwkSet) -> Self { + let mut validation = Validation::new(Algorithm::EdDSA); + // Nothing is currently required + validation.required_spec_claims = HashSet::new(); + validation.validate_exp = true; + // Unused by the control plane + validation.validate_aud = false; + // Unused by the control plane + validation.validate_nbf = 
false; + + Self { + compute_id, + jwks, + validation, + } + } +} + +impl AsyncAuthorizeRequest for Authorize { + type RequestBody = Body; + type ResponseBody = Body; + type Future = BoxFuture<'static, Result, Response>>; + + fn authorize(&mut self, mut request: Request) -> Self::Future { + let compute_id = self.compute_id.clone(); + let jwks = self.jwks.clone(); + let validation = self.validation.clone(); + + Box::pin(async move { + let request_id = request.extract_parts::().await.unwrap(); + + // TODO: Remove this check after a successful rollout + if jwks.keys.is_empty() { + warn!(%request_id, "Authorization has not been configured"); + + return Ok(request); + } + + let connect_info = request + .extract_parts::>() + .await + .unwrap(); + + // In the event the request is coming from the loopback interface, + // allow all requests + if connect_info.ip().is_loopback() { + warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface"); + + return Ok(request); + } + + let TypedHeader(Authorization(bearer)) = request + .extract_parts::>>() + .await + .map_err(|_| { + JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token") + })?; + + let data = match Self::verify(&jwks, bearer.token(), &validation) { + Ok(claims) => claims, + Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)), + }; + + if data.claims.compute_id != compute_id { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "invalid claims in authorization token", + )); + } + + // Make claims available to any subsequent middleware or request + // handlers + request.extensions_mut().insert(data.claims); + + Ok(request) + }) + } +} + +impl Authorize { + /// Verify the token using the JSON Web Key set and return the token data. 
+ fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result> { + debug_assert!(!jwks.keys.is_empty()); + + for jwk in jwks.keys.iter() { + let decoding_key = match DecodingKey::from_jwk(jwk) { + Ok(key) => key, + Err(e) => { + warn!( + "Failed to construct decoding key from {}: {}", + jwk.common.key_id.as_ref().unwrap(), + e + ); + + continue; + } + }; + + match jsonwebtoken::decode::(token, &decoding_key, validation) { + Ok(data) => return Ok(data), + Err(e) => { + warn!( + "Failed to decode authorization token using {}: {}", + jwk.common.key_id.as_ref().unwrap(), + e + ); + + continue; + } + } + } + + Err(anyhow!("Failed to verify authorization token")) + } +} diff --git a/compute_tools/src/http/middleware/mod.rs b/compute_tools/src/http/middleware/mod.rs new file mode 100644 index 0000000000..147d6d2c7d --- /dev/null +++ b/compute_tools/src/http/middleware/mod.rs @@ -0,0 +1,2 @@ +pub(in crate::http) mod authorize; +pub(in crate::http) mod request_id; diff --git a/compute_tools/src/http/middleware/request_id.rs b/compute_tools/src/http/middleware/request_id.rs new file mode 100644 index 0000000000..e685b27d91 --- /dev/null +++ b/compute_tools/src/http/middleware/request_id.rs @@ -0,0 +1,16 @@ +use axum::{extract::Request, middleware::Next, response::Response}; +use uuid::Uuid; + +use crate::http::headers::X_REQUEST_ID; + +/// This middleware function allows compute_ctl to generate its own request ID +/// if one isn't supplied. The control plane will always send one as a UUID. The +/// neon Postgres extension on the other hand does not send one. 
+pub async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { + let headers = request.headers_mut(); + if !headers.contains_key(X_REQUEST_ID) { + headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); + } + + next.run(request).await +} diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index 93eb6ef5b7..9ecc1b0093 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -1,10 +1,14 @@ -use axum::{body::Body, response::Response}; +use axum::body::Body; +use axum::response::Response; use compute_api::responses::{ComputeStatus, GenericAPIError}; -use http::{header::CONTENT_TYPE, StatusCode}; +use http::StatusCode; +use http::header::CONTENT_TYPE; use serde::Serialize; use tracing::error; mod extract; +mod headers; +mod middleware; mod routes; pub mod server; diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index bbdb7d0917..7c8f72440f 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -306,6 +306,36 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /configure_telemetry: + post: + tags: + - Configure + summary: Configure rsyslog + description: | + This API endpoint configures rsyslog to forward Postgres logs + to a specified otel collector. + operationId: configureTelemetry + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + logs_export_host: + type: string + description: | + Hostname and the port of the otel collector. Leave empty to disable logs forwarding. 
+ Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526 + responses: + 204: + description: "Telemetry configured successfully" + 500: + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + components: securitySchemes: JWT: diff --git a/compute_tools/src/http/routes/check_writability.rs b/compute_tools/src/http/routes/check_writability.rs index d7feb055e9..5a12686fa8 100644 --- a/compute_tools/src/http/routes/check_writability.rs +++ b/compute_tools/src/http/routes/check_writability.rs @@ -1,10 +1,13 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use compute_api::responses::ComputeStatus; use http::StatusCode; -use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse}; +use crate::checker::check_writability; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Check that the compute is currently running. 
pub(in crate::http) async fn is_writable(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index 2546cbc344..5c9dd22c3d 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -1,18 +1,19 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::ConfigurationRequest, - responses::{ComputeStatus, ComputeStatusResponse}, -}; +use axum::body::Body; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest}; +use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; +use compute_api::spec::ComputeFeature; use http::StatusCode; use tokio::task; use tracing::info; -use crate::{ - compute::{ComputeNode, ParsedSpec}, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::{ComputeNode, ParsedSpec}; +use crate::http::JsonResponse; +use crate::http::extract::Json; +use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export}; // Accept spec in JSON format and request compute configuration. If anything // goes wrong after we set the compute status to `ConfigurationPending` and @@ -24,7 +25,7 @@ pub(in crate::http) async fn configure( State(compute): State>, request: Json, ) -> Response { - if !compute.live_config_allowed { + if !compute.params.live_config_allowed { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "live configuration is not allowed for this compute node".to_string(), @@ -47,13 +48,18 @@ pub(in crate::http) async fn configure( return JsonResponse::invalid_status(state.status); } + // Pass the tracing span to the main thread that performs the startup, + // so that the start_compute operation is considered a child of this + // configure request for tracing purposes. 
+ state.startup_span = Some(tracing::Span::current()); + state.pspec = Some(pspec); state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed); drop(state); } // Spawn a blocking thread to wait for compute to become Running. This is - // needed to do not block the main pool of workers and be able to serve + // needed to not block the main pool of workers and to be able to serve // other requests while some particular request is waiting for compute to // finish configuration. let c = compute.clone(); @@ -89,3 +95,25 @@ pub(in crate::http) async fn configure( JsonResponse::success(StatusCode::OK, body) } + +pub(in crate::http) async fn configure_telemetry( + State(compute): State>, + request: Json, +) -> Response { + if !compute.has_feature(ComputeFeature::PostgresLogsExport) { + return JsonResponse::error( + StatusCode::PRECONDITION_FAILED, + "Postgres logs export feature is not enabled".to_string(), + ); + } + + let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref()); + if let Err(err) = configure_postgres_logs_export(conf) { + return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string()); + } + + Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::from("")) + .unwrap() +} diff --git a/compute_tools/src/http/routes/database_schema.rs b/compute_tools/src/http/routes/database_schema.rs index fd716272dc..1f6ca4b79d 100644 --- a/compute_tools/src/http/routes/database_schema.rs +++ b/compute_tools/src/http/routes/database_schema.rs @@ -1,14 +1,16 @@ use std::sync::Arc; -use axum::{body::Body, extract::State, response::Response}; -use http::{header::CONTENT_TYPE, StatusCode}; +use axum::body::Body; +use axum::extract::State; +use axum::response::Response; +use http::StatusCode; +use http::header::CONTENT_TYPE; use serde::Deserialize; -use crate::{ - catalog::{get_database_schema, SchemaDumpError}, - compute::ComputeNode, - http::{extract::Query, JsonResponse}, -}; +use 
crate::catalog::{SchemaDumpError, get_database_schema}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Query; #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct DatabaseSchemaParams { diff --git a/compute_tools/src/http/routes/dbs_and_roles.rs b/compute_tools/src/http/routes/dbs_and_roles.rs index 4843c3fab4..790fe0dfe3 100644 --- a/compute_tools/src/http/routes/dbs_and_roles.rs +++ b/compute_tools/src/http/routes/dbs_and_roles.rs @@ -1,9 +1,12 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use http::StatusCode; -use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse}; +use crate::catalog::get_dbs_and_roles; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Get the databases and roles from the compute. pub(in crate::http) async fn get_catalog_objects( diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 5cc9b6d277..563b73ae65 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -1,19 +1,13 @@ use std::sync::Arc; -use axum::{ - extract::State, - response::{IntoResponse, Response}, -}; +use axum::extract::State; +use axum::response::{IntoResponse, Response}; use http::StatusCode; use serde::Deserialize; -use crate::{ - compute::ComputeNode, - http::{ - extract::{Path, Query}, - JsonResponse, - }, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::{Path, Query}; #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct ExtensionServerParams { @@ -24,11 +18,11 @@ pub(in crate::http) struct ExtensionServerParams { /// Download a remote extension. 
pub(in crate::http) async fn download_extension( Path(filename): Path, - params: Query, + ext_server_params: Query, State(compute): State>, ) -> Response { // Don't even try to download extensions if no remote storage is configured - if compute.ext_remote_storage.is_none() { + if compute.params.ext_remote_storage.is_none() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "remote storage is not configured", @@ -52,9 +46,9 @@ pub(in crate::http) async fn download_extension( remote_extensions.get_ext( &filename, - params.is_library, - &compute.build_tag, - &compute.pgversion, + ext_server_params.is_library, + &compute.params.build_tag, + &compute.params.pgversion, ) }; diff --git a/compute_tools/src/http/routes/extensions.rs b/compute_tools/src/http/routes/extensions.rs index 1fc03b9109..910e1fa155 100644 --- a/compute_tools/src/http/routes/extensions.rs +++ b/compute_tools/src/http/routes/extensions.rs @@ -1,16 +1,14 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::ExtensionInstallRequest, - responses::{ComputeStatus, ExtensionInstallResponse}, -}; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::ExtensionInstallRequest; +use compute_api::responses::{ComputeStatus, ExtensionInstallResponse}; use http::StatusCode; -use crate::{ - compute::ComputeNode, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Install a extension. 
pub(in crate::http) async fn install_extension( diff --git a/compute_tools/src/http/routes/failpoints.rs b/compute_tools/src/http/routes/failpoints.rs index 836417d784..8f5da99963 100644 --- a/compute_tools/src/http/routes/failpoints.rs +++ b/compute_tools/src/http/routes/failpoints.rs @@ -17,7 +17,8 @@ pub struct FailpointConfig { pub actions: String, } -use crate::http::{extract::Json, JsonResponse}; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Configure failpoints for testing purposes. pub(in crate::http) async fn configure_failpoints( diff --git a/compute_tools/src/http/routes/grants.rs b/compute_tools/src/http/routes/grants.rs index 3f67f011e5..267dcbb27e 100644 --- a/compute_tools/src/http/routes/grants.rs +++ b/compute_tools/src/http/routes/grants.rs @@ -1,16 +1,14 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::SetRoleGrantsRequest, - responses::{ComputeStatus, SetRoleGrantsResponse}, -}; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::SetRoleGrantsRequest; +use compute_api::responses::{ComputeStatus, SetRoleGrantsResponse}; use http::StatusCode; -use crate::{ - compute::ComputeNode, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Add grants for a role. 
pub(in crate::http) async fn add_grant( diff --git a/compute_tools/src/http/routes/insights.rs b/compute_tools/src/http/routes/insights.rs index 6b03a461c3..b1ba67161e 100644 --- a/compute_tools/src/http/routes/insights.rs +++ b/compute_tools/src/http/routes/insights.rs @@ -1,10 +1,12 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use compute_api::responses::ComputeStatus; use http::StatusCode; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Collect current Postgres usage insights. pub(in crate::http) async fn get_insights(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/metrics.rs b/compute_tools/src/http/routes/metrics.rs index 13150a7588..da8d8b20a5 100644 --- a/compute_tools/src/http/routes/metrics.rs +++ b/compute_tools/src/http/routes/metrics.rs @@ -1,10 +1,12 @@ -use axum::{body::Body, response::Response}; -use http::header::CONTENT_TYPE; +use axum::body::Body; +use axum::response::Response; use http::StatusCode; +use http::header::CONTENT_TYPE; use metrics::proto::MetricFamily; use metrics::{Encoder, TextEncoder}; -use crate::{http::JsonResponse, metrics::collect}; +use crate::http::JsonResponse; +use crate::metrics::collect; /// Expose Prometheus metrics. pub(in crate::http) async fn get_metrics() -> Response { diff --git a/compute_tools/src/http/routes/metrics_json.rs b/compute_tools/src/http/routes/metrics_json.rs index 0709db5011..bc35ee2645 100644 --- a/compute_tools/src/http/routes/metrics_json.rs +++ b/compute_tools/src/http/routes/metrics_json.rs @@ -1,9 +1,11 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use http::StatusCode; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Get startup metrics. 
pub(in crate::http) async fn get_metrics(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/status.rs b/compute_tools/src/http/routes/status.rs index d64d53a58f..8ed1299d6b 100644 --- a/compute_tools/src/http/routes/status.rs +++ b/compute_tools/src/http/routes/status.rs @@ -1,9 +1,13 @@ -use std::{ops::Deref, sync::Arc}; +use std::ops::Deref; +use std::sync::Arc; -use axum::{extract::State, http::StatusCode, response::Response}; +use axum::extract::State; +use axum::http::StatusCode; +use axum::response::Response; use compute_api::responses::ComputeStatusResponse; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Retrieve the state of the comute. pub(in crate::http) async fn get_status(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs index 7acd84f236..2c24d4ad6b 100644 --- a/compute_tools/src/http/routes/terminate.rs +++ b/compute_tools/src/http/routes/terminate.rs @@ -1,18 +1,14 @@ use std::sync::Arc; -use axum::{ - extract::State, - response::{IntoResponse, Response}, -}; +use axum::extract::State; +use axum::response::{IntoResponse, Response}; use compute_api::responses::ComputeStatus; use http::StatusCode; use tokio::task; use tracing::info; -use crate::{ - compute::{forward_termination_signal, ComputeNode}, - http::JsonResponse, -}; +use crate::compute::{ComputeNode, forward_termination_signal}; +use crate::http::JsonResponse; /// Terminate the compute. 
pub(in crate::http) async fn terminate(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index a523ecd96f..179369e3ef 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,60 +1,66 @@ -use std::{ - fmt::Display, - net::{IpAddr, Ipv6Addr, SocketAddr}, - sync::Arc, - time::Duration, -}; +use std::fmt::Display; +use std::net::{IpAddr, Ipv6Addr, SocketAddr}; +use std::sync::Arc; +use std::time::Duration; use anyhow::Result; -use axum::{ - extract::Request, - middleware::{self, Next}, - response::{IntoResponse, Response}, - routing::{get, post}, - Router, -}; +use axum::Router; +use axum::middleware::{self}; +use axum::response::IntoResponse; +use axum::routing::{get, post}; +use compute_api::responses::ComputeCtlConfig; use http::StatusCode; use tokio::net::TcpListener; use tower::ServiceBuilder; -use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer}; -use tracing::{debug, error, info, Span}; -use uuid::Uuid; +use tower_http::{ + auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer, +}; +use tracing::{Span, error, info}; -use super::routes::{ - check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, insights, metrics, metrics_json, status, terminate, +use super::middleware::request_id::maybe_add_request_id_header; +use super::{ + headers::X_REQUEST_ID, + middleware::authorize::Authorize, + routes::{ + check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, + grants, insights, metrics, metrics_json, status, terminate, + }, }; use crate::compute::ComputeNode; -const X_REQUEST_ID: &str = "x-request-id"; - /// `compute_ctl` has two servers: internal and external. The internal server /// binds to the loopback interface and handles communication from clients on /// the compute. 
The external server is what receives communication from the /// control plane, the metrics scraper, etc. We make the distinction because /// certain routes in `compute_ctl` only need to be exposed to local processes /// like Postgres via the neon extension and local_proxy. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Debug)] pub enum Server { - Internal(u16), - External(u16), + Internal { + port: u16, + }, + External { + port: u16, + config: ComputeCtlConfig, + compute_id: String, + }, } impl Display for Server { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Server::Internal(_) => f.write_str("internal"), - Server::External(_) => f.write_str("external"), + Server::Internal { .. } => f.write_str("internal"), + Server::External { .. } => f.write_str("external"), } } } -impl From for Router> { - fn from(server: Server) -> Self { +impl From<&Server> for Router> { + fn from(server: &Server) -> Self { let mut router = Router::>::new(); router = match server { - Server::Internal(_) => { + Server::Internal { .. } => { router = router .route( "/extension_server/{*filename}", @@ -72,58 +78,72 @@ impl From for Router> { router } - Server::External(_) => router - .route("/check_writability", post(check_writability::is_writable)) - .route("/configure", post(configure::configure)) - .route("/database_schema", get(database_schema::get_schema_dump)) - .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) - .route("/insights", get(insights::get_insights)) - .route("/metrics", get(metrics::get_metrics)) - .route("/metrics.json", get(metrics_json::get_metrics)) - .route("/status", get(status::get_status)) - .route("/terminate", post(terminate::terminate)), + Server::External { + config, compute_id, .. 
+ } => { + let unauthenticated_router = + Router::>::new().route("/metrics", get(metrics::get_metrics)); + + let authenticated_router = Router::>::new() + .route("/check_writability", post(check_writability::is_writable)) + .route("/configure", post(configure::configure)) + .route("/configure_telemetry", post(configure::configure_telemetry)) + .route("/database_schema", get(database_schema::get_schema_dump)) + .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) + .route("/insights", get(insights::get_insights)) + .route("/metrics.json", get(metrics_json::get_metrics)) + .route("/status", get(status::get_status)) + .route("/terminate", post(terminate::terminate)) + .layer(AsyncRequireAuthorizationLayer::new(Authorize::new( + compute_id.clone(), + config.jwks.clone(), + ))); + + router + .merge(unauthenticated_router) + .merge(authenticated_router) + } }; - router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer( - ServiceBuilder::new() - // Add this middleware since we assume the request ID exists - .layer(middleware::from_fn(maybe_add_request_id_header)) - .layer( - TraceLayer::new_for_http() - .on_request(|request: &http::Request<_>, _span: &Span| { - let request_id = request - .headers() - .get(X_REQUEST_ID) - .unwrap() - .to_str() - .unwrap(); - - match request.uri().path() { - "/metrics" => { - debug!(%request_id, "{} {}", request.method(), request.uri()) - } - _ => info!(%request_id, "{} {}", request.method(), request.uri()), - }; - }) - .on_response( - |response: &http::Response<_>, latency: Duration, _span: &Span| { - let request_id = response + router + .fallback(Server::handle_404) + .method_not_allowed_fallback(Server::handle_405) + .layer( + ServiceBuilder::new() + .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO)) + // Add this middleware since we assume the request ID exists + .layer(middleware::from_fn(maybe_add_request_id_header)) + .layer( + TraceLayer::new_for_http() + 
.on_request(|request: &http::Request<_>, _span: &Span| { + let request_id = request .headers() .get(X_REQUEST_ID) .unwrap() .to_str() .unwrap(); - info!( - %request_id, - code = response.status().as_u16(), - latency = latency.as_millis() - ) - }, - ), - ) - .layer(PropagateRequestIdLayer::x_request_id()), - ) + info!(%request_id, "{} {}", request.method(), request.uri()); + }) + .on_response( + |response: &http::Response<_>, latency: Duration, _span: &Span| { + let request_id = response + .headers() + .get(X_REQUEST_ID) + .unwrap() + .to_str() + .unwrap(); + + info!( + %request_id, + code = response.status().as_u16(), + latency = latency.as_millis() + ); + }, + ), + ) + .layer(PropagateRequestIdLayer::x_request_id()), + ) } } @@ -147,15 +167,15 @@ impl Server { match self { // TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners // allow binding to localhost - Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), - Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED), } } - fn port(self) -> u16 { + fn port(&self) -> u16 { match self { - Server::Internal(port) => port, - Server::External(port) => port, + Server::Internal { port, .. } => *port, + Server::External { port, .. } => *port, } } @@ -182,7 +202,9 @@ impl Server { ); } - let router = Router::from(self).with_state(compute); + let router = Router::from(&self) + .with_state(compute) + .into_make_service_with_connect_info::(); if let Err(e) = axum::serve(listener, router).await { error!("compute_ctl {} HTTP server error: {}", self, e); @@ -197,15 +219,3 @@ impl Server { tokio::spawn(self.serve(state)); } } - -/// This middleware function allows compute_ctl to generate its own request ID -/// if one isn't supplied. The control plane will always send one as a UUID. The -/// neon Postgres extension on the other hand does not send one. 
-async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { - let headers = request.headers_mut(); - if headers.get(X_REQUEST_ID).is_none() { - headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); - } - - next.run(request).await -} diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 173dbf40b0..d95c168a99 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,8 +1,8 @@ -use compute_api::responses::{InstalledExtension, InstalledExtensions}; use std::collections::HashMap; use anyhow::Result; -use postgres::{Client, NoTls}; +use compute_api::responses::{InstalledExtension, InstalledExtensions}; +use tokio_postgres::{Client, Config, NoTls}; use crate::metrics::INSTALLED_EXTENSIONS; @@ -10,7 +10,7 @@ use crate::metrics::INSTALLED_EXTENSIONS; /// and to make database listing query here more explicit. /// /// Limit the number of databases to 500 to avoid excessive load. -fn list_dbs(client: &mut Client) -> Result> { +async fn list_dbs(client: &mut Client) -> Result> { // `pg_database.datconnlimit = -2` means that the database is in the // invalid state let databases = client @@ -20,7 +20,8 @@ fn list_dbs(client: &mut Client) -> Result> { AND datconnlimit <> - 2 LIMIT 500", &[], - )? + ) + .await? .iter() .map(|row| { let db: String = row.get("datname"); @@ -36,20 +37,36 @@ fn list_dbs(client: &mut Client) -> Result> { /// Same extension can be installed in multiple databases with different versions, /// so we report a separate metric (number of databases where it is installed) /// for each extension version. 
-pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result { +pub async fn get_installed_extensions(mut conf: Config) -> Result { conf.application_name("compute_ctl:get_installed_extensions"); - let mut client = conf.connect(NoTls)?; - let databases: Vec = list_dbs(&mut client)?; + let databases: Vec = { + let (mut client, connection) = conf.connect(NoTls).await?; + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + list_dbs(&mut client).await? + }; let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new(); for db in databases.iter() { conf.dbname(db); - let mut db_client = conf.connect(NoTls)?; - let extensions: Vec<(String, String, i32)> = db_client + + let (client, connection) = conf.connect(NoTls).await?; + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let extensions: Vec<(String, String, i32)> = client .query( "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension", &[], - )? + ) + .await? 
.iter() .map(|row| { ( diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index b08df22134..a681fad0b0 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -21,7 +21,9 @@ mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; +pub mod rsyslog; pub mod spec; mod spec_apply; pub mod swap; pub mod sync_sk; +pub mod tls; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 3749dfc844..c36f302f99 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; +use tracing::info; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; @@ -22,7 +24,8 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result .with_writer(std::io::stderr); // Initialize OpenTelemetry - let otlp_layer = tracing_utils::init_tracing("compute_ctl").await; + let otlp_layer = + tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default()).await; // Put it all together tracing_subscriber::registry() @@ -42,3 +45,50 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result pub fn inlinify(s: &str) -> String { s.replace('\n', "\u{200B}") } + +pub fn startup_context_from_env() -> Option { + // Extract OpenTelemetry context for the startup actions from the + // TRACEPARENT and TRACESTATE env variables, and attach it to the current + // tracing context. + // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // There is no standard for passing context in env variables, but a lot of + // tools use TRACEPARENT/TRACESTATE, so we use that convention too. 
See + // https://github.com/open-telemetry/opentelemetry-specification/issues/740 + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // If this pod is pre-created without binding it to any particular endpoint + // yet, this isn't the right place to enter the startup context. In that + // case, the control plane should pass the tracing context as part of the + // /configure API call. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + // + // XXX: If the pod is restarted, we perform the startup actions in the same + // context as the original startup actions, which probably doesn't make + // sense. + let mut startup_tracing_carrier: HashMap = HashMap::new(); + if let Ok(val) = std::env::var("TRACEPARENT") { + startup_tracing_carrier.insert("traceparent".to_string(), val); + } + if let Ok(val) = std::env::var("TRACESTATE") { + startup_tracing_carrier.insert("tracestate".to_string(), val); + } + if !startup_tracing_carrier.is_empty() { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry_sdk::propagation::TraceContextPropagator; + info!("got startup tracing context from env variables"); + Some(TraceContextPropagator::new().extract(&startup_tracing_carrier)) + } else { + None + } +} diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 3061d387a5..b4ec675ff4 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -1,17 +1,15 @@ -use anyhow::bail; -use anyhow::Result; -use postgres::{NoTls, SimpleQueryMessage}; -use std::time::SystemTime; -use std::{str::FromStr, sync::Arc, thread, time::Duration}; -use utils::id::TenantId; -use utils::id::TimelineId; +use std::str::FromStr; +use std::sync::Arc; +use 
std::thread; +use std::time::{Duration, SystemTime}; +use anyhow::{Result, bail}; use compute_api::spec::ComputeMode; +use postgres::{NoTls, SimpleQueryMessage}; use tracing::{info, warn}; -use utils::{ - lsn::Lsn, - shard::{ShardCount, ShardNumber, TenantShardId}, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 870b294d08..4caa48307e 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,6 +1,8 @@ -use metrics::core::Collector; +use metrics::core::{AtomicF64, Collector, GenericGauge}; use metrics::proto::MetricFamily; -use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec}; +use metrics::{ + IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec, +}; use once_cell::sync::Lazy; pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { @@ -54,9 +56,16 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| register_int_counter_vec!( "compute_ctl_remote_ext_requests_total", "Total number of requests made by compute_ctl to download extensions from S3 proxy by status", - // Do not use any labels like extension name yet. - // We can add them later if needed. 
- &["http_status"] + &["http_status", "filename"] + ) + .expect("failed to define a metric") +}); + +// Size of audit log directory in bytes +pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy> = Lazy::new(|| { + register_gauge!( + "compute_audit_log_dir_size", + "Size of audit log directory in bytes", ) .expect("failed to define a metric") }); @@ -66,5 +75,6 @@ pub fn collect() -> Vec { metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); metrics.extend(DB_MIGRATION_FAILED.collect()); + metrics.extend(AUDIT_LOG_DIR_SIZE.collect()); metrics } diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 184f380a8d..83318538cd 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -1,13 +1,14 @@ use std::sync::Arc; -use std::{thread, time::Duration}; +use std::thread; +use std::time::Duration; use chrono::{DateTime, Utc}; +use compute_api::responses::ComputeStatus; +use compute_api::spec::ComputeFeature; use postgres::{Client, NoTls}; use tracing::{debug, error, info, warn}; use crate::compute::ComputeNode; -use compute_api::responses::ComputeStatus; -use compute_api::spec::ComputeFeature; const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); @@ -17,7 +18,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); // should be handled gracefully. 
fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let connstr = compute.connstr.clone(); + let connstr = compute.params.connstr.clone(); let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor")); // During startup and configuration we connect to every Postgres database, diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 86fcf99085..10d8f2c878 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -9,8 +9,11 @@ use std::process::Child; use std::str::FromStr; use std::time::{Duration, Instant}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; +use compute_api::responses::TlsConfig; +use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; use futures::StreamExt; +use indexmap::IndexMap; use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::config::Config; @@ -21,8 +24,6 @@ use tokio_postgres; use tokio_postgres::NoTls; use tracing::{debug, error, info, instrument}; -use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; - const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds /// Escape a string for including it in a SQL literal. @@ -187,15 +188,40 @@ impl DatabaseExt for Database { /// Postgres SQL queries and DATABASE_URL. pub trait Escaping { fn pg_quote(&self) -> String; + fn pg_quote_dollar(&self) -> (String, String); } impl Escaping for PgIdent { /// This is intended to mimic Postgres quote_ident(), but for simplicity it /// always quotes provided string with `""` and escapes every `"`. /// **Not idempotent**, i.e. if string is already escaped it will be escaped again. + /// N.B. it's not useful for escaping identifiers that are used inside WHERE + /// clause, use `escape_literal()` instead. 
fn pg_quote(&self) -> String { - let result = format!("\"{}\"", self.replace('"', "\"\"")); - result + format!("\"{}\"", self.replace('"', "\"\"")) + } + + /// This helper is intended to be used for dollar-escaping strings for usage + /// inside PL/pgSQL procedures. In addition to dollar-escaping the string, + /// it also returns a tag that is intended to be used inside the outer + /// PL/pgSQL procedure. If you do not need an outer tag, just discard it. + /// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`, + /// + fn pg_quote_dollar(&self) -> (String, String) { + let mut tag: String = "x".to_string(); + let mut outer_tag = "xx".to_string(); + + // Find the first suitable tag that is not present in the string. + // Postgres' max role/DB name length is 63 bytes, so even in the + // worst case it won't take long. + while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) { + tag += "x"; + outer_tag = tag.clone() + "x"; + } + + let escaped = format!("${tag}${self}${tag}$"); + + (escaped, outer_tag) } } @@ -227,10 +253,13 @@ pub async fn get_existing_dbs_async( // invalid state. See: // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 let rowstream = client + // We use a subquery instead of a fancy `datdba::regrole::text AS owner`, + // because the latter automatically wraps the result in double quotes, + // if the role name contains special characters. 
.query_raw::( "SELECT datname AS name, - datdba::regrole::text AS owner, + (SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner, NOT datallowconn AS restrict_conn, datconnlimit = - 2 AS invalid FROM @@ -379,7 +408,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> { /// Update pgbouncer.ini with provided options fn update_pgbouncer_ini( - pgbouncer_config: HashMap, + pgbouncer_config: IndexMap, pgbouncer_ini_path: &str, ) -> Result<()> { let mut conf = Ini::load_from_file(pgbouncer_ini_path)?; @@ -400,7 +429,10 @@ fn update_pgbouncer_ini( /// Tune pgbouncer. /// 1. Apply new config using pgbouncer admin console /// 2. Add new values to pgbouncer.ini to preserve them after restart -pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result<()> { +pub async fn tune_pgbouncer( + mut pgbouncer_config: IndexMap, + tls_config: Option, +) -> Result<()> { let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() { // for VMs use pgbouncer specific way to connect to // pgbouncer admin console without password @@ -446,19 +478,21 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result } }; - // Apply new config - for (option_name, value) in pgbouncer_config.iter() { - let query = format!("SET {}={}", option_name, value); - // keep this log line for debugging purposes - info!("Applying pgbouncer setting change: {}", query); + if let Some(tls_config) = tls_config { + // pgbouncer starts in a half-ok state if it cannot find these files. + // It will default to client_tls_sslmode=deny, which causes proxy to error. + // There is a small window at startup where these files don't yet exist in the VM. + // Best to wait until it exists. 
+ loop { + if let Ok(true) = tokio::fs::try_exists(&tls_config.key_path).await { + break; + } + tokio::time::sleep(Duration::from_millis(500)).await + } - if let Err(err) = client.simple_query(&query).await { - // Don't fail on error, just print it into log - error!( - "Failed to apply pgbouncer setting change: {}, {}", - query, err - ); - }; + pgbouncer_config.insert("client_tls_cert_file".to_string(), tls_config.cert_path); + pgbouncer_config.insert("client_tls_key_file".to_string(), tls_config.key_path); + pgbouncer_config.insert("client_tls_sslmode".to_string(), "allow".to_string()); } // save values to pgbouncer.ini @@ -474,6 +508,13 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result }; update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?; + info!("Applying pgbouncer setting change"); + + if let Err(err) = client.simple_query("RELOAD").await { + // Don't fail on error, just print it into log + error!("Failed to apply pgbouncer setting change, {err}",); + }; + Ok(()) } diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs new file mode 100644 index 0000000000..80594db3f1 --- /dev/null +++ b/compute_tools/src/rsyslog.rs @@ -0,0 +1,276 @@ +use std::fs; +use std::io::ErrorKind; +use std::path::Path; +use std::process::Command; +use std::time::Duration; +use std::{fs::OpenOptions, io::Write}; + +use anyhow::{Context, Result, anyhow}; +use tracing::{error, info, instrument, warn}; + +const POSTGRES_LOGS_CONF_PATH: &str = "/etc/rsyslog.d/postgres_logs.conf"; + +fn get_rsyslog_pid() -> Option { + let output = Command::new("pgrep") + .arg("rsyslogd") + .output() + .expect("Failed to execute pgrep"); + + if !output.stdout.is_empty() { + let pid = std::str::from_utf8(&output.stdout) + .expect("Invalid UTF-8 in process output") + .trim() + .to_string(); + Some(pid) + } else { + None + } +} + +// Restart rsyslogd to apply the new configuration. 
+// This is necessary, because there is no other way to reload the rsyslog configuration. +// +// Rsyslogd shouldn't lose any messages, because of the restart, +// because it tracks the last read position in the log files +// and will continue reading from that position. +// TODO: test it properly +// +fn restart_rsyslog() -> Result<()> { + let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?; + info!("rsyslogd is running with pid: {}, restart it", old_pid); + + // kill it to restart + let _ = Command::new("pkill") + .arg("rsyslogd") + .output() + .context("Failed to stop rsyslogd")?; + + Ok(()) +} + +pub fn configure_audit_rsyslog( + log_directory: String, + tag: &str, + remote_endpoint: &str, +) -> Result<()> { + let config_content: String = format!( + include_str!("config_template/compute_audit_rsyslog_template.conf"), + log_directory = log_directory, + tag = tag, + remote_endpoint = remote_endpoint + ); + + info!("rsyslog config_content: {}", config_content); + + let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf"; + let mut file = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(rsyslog_conf_path)?; + + file.write_all(config_content.as_bytes())?; + + info!( + "rsyslog configuration file {} added successfully. 
Starting rsyslogd", + rsyslog_conf_path + ); + + // start the service, using the configuration + restart_rsyslog()?; + + Ok(()) +} + +/// Configuration for enabling Postgres logs forwarding from rsyslogd +pub struct PostgresLogsRsyslogConfig<'a> { + pub host: Option<&'a str>, +} + +impl<'a> PostgresLogsRsyslogConfig<'a> { + pub fn new(host: Option<&'a str>) -> Self { + Self { host } + } + + pub fn build(&self) -> Result { + match self.host { + Some(host) => { + if let Some((target, port)) = host.split_once(":") { + Ok(format!( + include_str!( + "config_template/compute_rsyslog_postgres_export_template.conf" + ), + logs_export_target = target, + logs_export_port = port, + )) + } else { + Err(anyhow!("Invalid host format for Postgres logs export")) + } + } + None => Ok("".to_string()), + } + } + + fn current_config() -> Result { + let config_content = match std::fs::read_to_string(POSTGRES_LOGS_CONF_PATH) { + Ok(c) => c, + Err(err) if err.kind() == ErrorKind::NotFound => String::new(), + Err(err) => return Err(err.into()), + }; + Ok(config_content) + } + + /// Returns the default host for otel collector that receives Postgres logs + pub fn default_host(project_id: &str) -> String { + format!( + "config-{}-collector.neon-telemetry.svc.cluster.local:10514", + project_id + ) + } +} + +pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> { + let new_config = conf.build()?; + let current_config = PostgresLogsRsyslogConfig::current_config()?; + + if new_config == current_config { + info!("postgres logs rsyslog configuration is up-to-date"); + return Ok(()); + } + + // When new config is empty we can simply remove the configuration file. 
+ if new_config.is_empty() { + info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH); + match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) { + Ok(_) => {} + Err(err) if err.kind() == ErrorKind::NotFound => {} + Err(err) => return Err(err.into()), + } + restart_rsyslog()?; + return Ok(()); + } + + info!( + "configuring rsyslog for postgres logs export to: {:?}", + conf.host + ); + + let mut file = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(POSTGRES_LOGS_CONF_PATH)?; + file.write_all(new_config.as_bytes())?; + + info!( + "rsyslog configuration file {} added successfully. Starting rsyslogd", + POSTGRES_LOGS_CONF_PATH + ); + + restart_rsyslog()?; + Ok(()) +} + +#[instrument(skip_all)] +async fn pgaudit_gc_main_loop(log_directory: String) -> Result<()> { + info!("running pgaudit GC main loop"); + loop { + // Check log_directory for old pgaudit logs and delete them. + // New log files are checked every 5 minutes, as set in pgaudit.log_rotation_age + // Find files that were not modified in the last 15 minutes and delete them. + // This should be enough time for rsyslog to process the logs and for us to catch the alerts. + // + // In case of a very high load, we might need to adjust this value and pgaudit.log_rotation_age. + // + // TODO: add some smarter logic to delete the files that are fully streamed according to rsyslog + // imfile-state files, but for now just do a simple GC to avoid filling up the disk. + let _ = Command::new("find") + .arg(&log_directory) + .arg("-name") + .arg("audit*.log") + .arg("-mmin") + .arg("+15") + .arg("-delete") + .output()?; + + // also collect the metric for the size of the log directory + async fn get_log_files_size(path: &Path) -> Result { + let mut total_size = 0; + + for entry in fs::read_dir(path)? 
{ + let entry = entry?; + let entry_path = entry.path(); + + if entry_path.is_file() && entry_path.to_string_lossy().ends_with("log") { + total_size += entry.metadata()?.len(); + } + } + + Ok(total_size) + } + + let log_directory_size = get_log_files_size(Path::new(&log_directory)) + .await + .unwrap_or_else(|e| { + warn!("Failed to get log directory size: {}", e); + 0 + }); + crate::metrics::AUDIT_LOG_DIR_SIZE.set(log_directory_size as f64); + tokio::time::sleep(Duration::from_secs(60)).await; + } +} + +// launch pgaudit GC thread to clean up the old pgaudit logs stored in the log_directory +pub fn launch_pgaudit_gc(log_directory: String) { + tokio::spawn(async move { + if let Err(e) = pgaudit_gc_main_loop(log_directory).await { + error!("pgaudit GC main loop failed: {}", e); + } + }); +} + +#[cfg(test)] +mod tests { + use crate::rsyslog::PostgresLogsRsyslogConfig; + + #[test] + fn test_postgres_logs_config() { + { + // Verify empty config + let conf = PostgresLogsRsyslogConfig::new(None); + let res = conf.build(); + assert!(res.is_ok()); + let conf_str = res.unwrap(); + assert_eq!(&conf_str, ""); + } + + { + // Verify config + let conf = PostgresLogsRsyslogConfig::new(Some("collector.cvc.local:514")); + let res = conf.build(); + assert!(res.is_ok()); + let conf_str = res.unwrap(); + assert!(conf_str.contains("omfwd")); + assert!(conf_str.contains(r#"target="collector.cvc.local""#)); + assert!(conf_str.contains(r#"port="514""#)); + } + + { + // Verify invalid config + let conf = PostgresLogsRsyslogConfig::new(Some("invalid")); + let res = conf.build(); + assert!(res.is_err()); + } + + { + // Verify config with default host + let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123"); + let conf = PostgresLogsRsyslogConfig::new(Some(&host)); + let res = conf.build(); + assert!(res.is_ok()); + let conf_str = res.unwrap(); + assert!(conf_str.contains(r#"shy-breeze-123"#)); + assert!(conf_str.contains(r#"port="10514""#)); + } + } +} diff --git 
a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 1145dcd932..6db50e5f09 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,19 +1,20 @@ -use anyhow::{anyhow, bail, Result}; -use reqwest::StatusCode; use std::fs::File; use std::path::Path; -use tokio_postgres::Client; -use tracing::{error, info, instrument}; - -use crate::config; -use crate::metrics::{CPlaneRequestRPC, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; -use crate::migration::MigrationRunner; -use crate::params::PG_HBA_ALL_MD5; +use anyhow::{Result, anyhow, bail}; use compute_api::responses::{ ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse, }; use compute_api::spec::ComputeSpec; +use reqwest::StatusCode; +use tokio_postgres::Client; +use tracing::{error, info, instrument, warn}; + +use crate::config; +use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS}; +use crate::migration::MigrationRunner; +use crate::params::PG_HBA_ALL_MD5; +use crate::pg_helpers::*; // Do control plane request and return response if any. In case of error it // returns a bool flag indicating whether it makes sense to retry the request @@ -140,7 +141,6 @@ pub fn get_spec_from_control_plane( /// Check `pg_hba.conf` and update if needed to allow external connections. pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { // XXX: consider making it a part of spec.json - info!("checking pg_hba.conf"); let pghba_path = pgdata_path.join("pg_hba.conf"); if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? 
{ @@ -155,12 +155,11 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Create a standby.signal file pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { // XXX: consider making it a part of spec.json - info!("adding standby.signal"); let signalfile = pgdata_path.join("standby.signal"); if !signalfile.exists() { - info!("created standby.signal"); File::create(signalfile)?; + info!("created standby.signal"); } else { info!("reused pre-existing standby.signal"); } @@ -169,7 +168,6 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { #[instrument(skip_all)] pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { - info!("handle neon extension upgrade"); let query = "ALTER EXTENSION neon UPDATE"; info!("update neon extension version with query: {}", query); client.simple_query(query).await?; diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 9fd78155b2..0e2adc8b88 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -1,18 +1,429 @@ use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Formatter}; use std::future::Future; -use std::iter::empty; -use std::iter::once; +use std::iter::{empty, once}; use std::sync::Arc; -use crate::compute::construct_superuser_query; -use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt}; -use anyhow::Result; -use compute_api::spec::{ComputeSpec, Database, PgIdent, Role}; +use anyhow::{Context, Result}; +use compute_api::responses::ComputeStatus; +use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role}; use futures::future::join_all; use tokio::sync::RwLock; use tokio_postgres::Client; -use tracing::{debug, info_span, warn, Instrument}; +use tokio_postgres::error::SqlState; +use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; + +use crate::compute::{ComputeNode, ComputeState}; +use crate::pg_helpers::{ + 
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async, + get_existing_roles_async, +}; +use crate::spec_apply::ApplySpecPhase::{ + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser, + CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon, + DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, + HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, + RunInEachDatabase, +}; +use crate::spec_apply::PerDatabasePhase::{ + ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, +}; + +impl ComputeNode { + /// Apply the spec to the running PostgreSQL instance. + /// The caller can decide to run with multiple clients in parallel, or + /// single mode. Either way, the commands executed will be the same, and + /// only commands run in different databases are parallelized. + #[instrument(skip_all)] + pub fn apply_spec_sql( + &self, + spec: Arc, + conf: Arc, + concurrency: usize, + ) -> Result<()> { + info!("Applying config with max {} concurrency", concurrency); + debug!("Config: {:?}", spec); + + let rt = tokio::runtime::Handle::current(); + rt.block_on(async { + // Proceed with post-startup configuration. Note, that order of operations is important. + let client = Self::get_maintenance_client(&conf).await?; + let spec = spec.clone(); + + let databases = get_existing_dbs_async(&client).await?; + let roles = get_existing_roles_async(&client) + .await? + .into_iter() + .map(|role| (role.name.clone(), role)) + .collect::>(); + + // Check if we need to drop subscriptions before starting the endpoint. + // + // It is important to do this operation exactly once when endpoint starts on a new branch. + // Otherwise, we may drop not inherited, but newly created subscriptions. 
+ // + // We cannot rely only on spec.drop_subscriptions_before_start flag, + // because if for some reason compute restarts inside VM, + // it will start again with the same spec and flag value. + // + // To handle this, we save the fact of the operation in the database + // in the neon.drop_subscriptions_done table. + // If the table does not exist, we assume that the operation was never performed, so we must do it. + // If table exists, we check if the operation was performed on the current timelilne. + // + let mut drop_subscriptions_done = false; + + if spec.drop_subscriptions_before_start { + let timeline_id = self.get_timeline_id().context("timeline_id must be set")?; + let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id); + + info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); + + drop_subscriptions_done = match + client.simple_query(&query).await { + Ok(result) => { + matches!(&result[0], postgres::SimpleQueryMessage::Row(_)) + }, + Err(e) => + { + match e.code() { + Some(&SqlState::UNDEFINED_TABLE) => false, + _ => { + // We don't expect any other error here, except for the schema/table not existing + error!("Error checking if drop subscription operation was already performed: {}", e); + return Err(e.into()); + } + } + } + } + }; + + + let jwks_roles = Arc::new( + spec.as_ref() + .local_proxy_config + .iter() + .flat_map(|it| &it.jwks) + .flatten() + .flat_map(|setting| &setting.role_names) + .cloned() + .collect::>(), + ); + + let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext { + roles, + dbs: databases, + })); + + // Apply special pre drop database phase. + // NOTE: we use the code of RunInEachDatabase phase for parallelism + // and connection management, but we don't really run it in *each* database, + // only in databases, we're about to drop. 
+ info!("Applying PerDatabase (pre-dropdb) phase"); + let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + // Run the phase for each database that we're about to drop. + let db_processes = spec + .delta_operations + .iter() + .flatten() + .filter_map(move |op| { + if op.action.as_str() == "delete_db" { + Some(op.name.clone()) + } else { + None + } + }) + .map(|dbname| { + let spec = spec.clone(); + let ctx = ctx.clone(); + let jwks_roles = jwks_roles.clone(); + let mut conf = conf.as_ref().clone(); + let concurrency_token = concurrency_token.clone(); + // We only need dbname field for this phase, so set other fields to dummy values + let db = DB::UserDB(Database { + name: dbname.clone(), + owner: "cloud_admin".to_string(), + options: None, + restrict_conn: false, + invalid: false, + }); + + debug!("Applying per-database phases for Database {:?}", &db); + + match &db { + DB::SystemDB => {} + DB::UserDB(db) => { + conf.dbname(db.name.as_str()); + } + } + + let conf = Arc::new(conf); + let fut = Self::apply_spec_sql_db( + spec.clone(), + conf, + ctx.clone(), + jwks_roles.clone(), + concurrency_token.clone(), + db, + [DropLogicalSubscriptions].to_vec(), + ); + + Ok(tokio::spawn(fut)) + }) + .collect::>>(); + + for process in db_processes.into_iter() { + let handle = process?; + if let Err(e) = handle.await? { + // Handle the error case where the database does not exist + // We do not check whether the DB exists or not in the deletion phase, + // so we shouldn't be strict about it in pre-deletion cleanup as well. 
+ if e.to_string().contains("does not exist") { + warn!("Error dropping subscription: {}", e); + } else { + return Err(e); + } + }; + } + + for phase in [ + CreateNeonSuperuser, + DropInvalidDatabases, + RenameRoles, + CreateAndAlterRoles, + RenameAndDeleteDatabases, + CreateAndAlterDatabases, + CreateSchemaNeon, + ] { + info!("Applying phase {:?}", &phase); + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + phase, + || async { Ok(&client) }, + ) + .await?; + } + + info!("Applying RunInEachDatabase2 phase"); + let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + let db_processes = spec + .cluster + .databases + .iter() + .map(|db| DB::new(db.clone())) + // include + .chain(once(DB::SystemDB)) + .map(|db| { + let spec = spec.clone(); + let ctx = ctx.clone(); + let jwks_roles = jwks_roles.clone(); + let mut conf = conf.as_ref().clone(); + let concurrency_token = concurrency_token.clone(); + let db = db.clone(); + + debug!("Applying per-database phases for Database {:?}", &db); + + match &db { + DB::SystemDB => {} + DB::UserDB(db) => { + conf.dbname(db.name.as_str()); + } + } + + let conf = Arc::new(conf); + let mut phases = vec![ + DeleteDBRoleReferences, + ChangeSchemaPerms, + ]; + + if spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(DropLogicalSubscriptions); + } + + let fut = Self::apply_spec_sql_db( + spec.clone(), + conf, + ctx.clone(), + jwks_roles.clone(), + concurrency_token.clone(), + db, + phases, + ); + + Ok(tokio::spawn(fut)) + }) + .collect::>>(); + + for process in db_processes.into_iter() { + let handle = process?; + handle.await??; + } + + let mut phases = vec![ + HandleOtherExtensions, + HandleNeonExtension, // This step depends on CreateSchemaNeon + CreateAvailabilityCheck, + DropRoles, + ]; + + // This step depends on CreateSchemaNeon + if 
spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(FinalizeDropLogicalSubscriptions); + } + + // Keep DisablePostgresDBPgAudit phase at the end, + // so that all config operations are audit logged. + match spec.audit_log_level + { + ComputeAudit::Hipaa => { + phases.push(CreatePgauditExtension); + phases.push(CreatePgauditlogtofileExtension); + phases.push(DisablePostgresDBPgAudit); + } + ComputeAudit::Log => { /* not implemented yet */ } + ComputeAudit::Disabled => {} + } + + for phase in phases { + debug!("Applying phase {:?}", &phase); + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + phase, + || async { Ok(&client) }, + ) + .await?; + } + + Ok::<(), anyhow::Error>(()) + })?; + + Ok(()) + } + + /// Apply SQL migrations of the RunInEachDatabase phase. + /// + /// May opt to not connect to databases that don't have any scheduled + /// operations. The function is concurrency-controlled with the provided + /// semaphore. The caller has to make sure the semaphore isn't exhausted. + async fn apply_spec_sql_db( + spec: Arc, + conf: Arc, + ctx: Arc>, + jwks_roles: Arc>, + concurrency_token: Arc, + db: DB, + subphases: Vec, + ) -> Result<()> { + let _permit = concurrency_token.acquire().await?; + + let mut client_conn = None; + + for subphase in subphases { + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + RunInEachDatabase { + db: db.clone(), + subphase, + }, + // Only connect if apply_operation actually wants a connection. + // It's quite possible this database doesn't need any queries, + // so by not connecting we save time and effort connecting to + // that database. 
+ || async { + if client_conn.is_none() { + let db_client = Self::get_maintenance_client(&conf).await?; + client_conn.replace(db_client); + } + let client = client_conn.as_ref().unwrap(); + Ok(client) + }, + ) + .await?; + } + + drop(client_conn); + + Ok::<(), anyhow::Error>(()) + } + + /// Choose how many concurrent connections to use for applying the spec changes. + pub fn max_service_connections( + &self, + compute_state: &ComputeState, + spec: &ComputeSpec, + ) -> usize { + // If the cluster is in Init state we don't have to deal with user connections, + // and can thus use all `max_connections` connection slots. However, that's generally not + // very efficient, so we generally still limit it to a smaller number. + if compute_state.status == ComputeStatus::Init { + // If the settings contain 'max_connections', use that as template + if let Some(config) = spec.cluster.settings.find("max_connections") { + config.parse::().ok() + } else { + // Otherwise, try to find the setting in the postgresql_conf string + spec.cluster + .postgresql_conf + .iter() + .flat_map(|conf| conf.split("\n")) + .filter_map(|line| { + if !line.contains("max_connections") { + return None; + } + + let (key, value) = line.split_once("=")?; + let key = key + .trim_start_matches(char::is_whitespace) + .trim_end_matches(char::is_whitespace); + + let value = value + .trim_start_matches(char::is_whitespace) + .trim_end_matches(char::is_whitespace); + + if key != "max_connections" { + return None; + } + + value.parse::().ok() + }) + .next() + } + // If max_connections is present, use at most 1/3rd of that. + // When max_connections is lower than 30, try to use at least 10 connections, but + // never more than max_connections. + .map(|limit| match limit { + 0..10 => limit, + 10..30 => 10, + 30.. => limit / 3, + }) + // If we didn't find max_connections, default to 10 concurrent connections. 
+ .unwrap_or(10) + } else { + // state == Running + // Because the cluster is already in the Running state, we should assume users are + // already connected to the cluster, and high concurrency could negatively + // impact user connectivity. Therefore, we can limit concurrency to the number of + // reserved superuser connections, which users wouldn't be able to use anyway. + spec.cluster + .settings + .find("superuser_reserved_connections") + .iter() + .filter_map(|val| val.parse::().ok()) + .map(|val| if val > 1 { val - 1 } else { 1 }) + .last() + .unwrap_or(3) + } + } +} #[derive(Clone)] pub enum DB { @@ -56,7 +467,7 @@ pub enum PerDatabasePhase { #[derive(Clone, Debug)] pub enum ApplySpecPhase { - CreateSuperUser, + CreateNeonSuperuser, DropInvalidDatabases, RenameRoles, CreateAndAlterRoles, @@ -64,6 +475,9 @@ pub enum ApplySpecPhase { CreateAndAlterDatabases, CreateSchemaNeon, RunInEachDatabase { db: DB, subphase: PerDatabasePhase }, + CreatePgauditExtension, + CreatePgauditlogtofileExtension, + DisablePostgresDBPgAudit, HandleOtherExtensions, HandleNeonExtension, CreateAvailabilityCheck, @@ -180,14 +594,10 @@ async fn get_operations<'a>( apply_spec_phase: &'a ApplySpecPhase, ) -> Result + 'a + Send>> { match apply_spec_phase { - ApplySpecPhase::CreateSuperUser => { - let query = construct_superuser_query(spec); - - Ok(Box::new(once(Operation { - query, - comment: None, - }))) - } + ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation { + query: include_str!("sql/create_neon_superuser.sql").to_string(), + comment: None, + }))), ApplySpecPhase::DropInvalidDatabases => { let mut ctx = ctx.write().await; let databases = &mut ctx.dbs; @@ -321,14 +731,15 @@ async fn get_operations<'a>( // We do not check whether the DB exists or not, // Postgres will take care of it for us "delete_db" => { + let (db_name, outer_tag) = op.name.pg_quote_dollar(); // In Postgres we can't drop a database if it is a template. 
// So we need to unset the template flag first, but it could // be a retry, so we could've already dropped the database. // Check that database exists first to make it idempotent. let unset_template_query: String = format!( include_str!("sql/unset_template_for_drop_dbs.sql"), - datname_str = escape_literal(&op.name), - datname = &op.name.pg_quote() + datname = db_name, + outer_tag = outer_tag, ); // Use FORCE to drop database even if there are active connections. @@ -435,6 +846,8 @@ async fn get_operations<'a>( comment: None, }, Operation { + // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database + // (see https://www.postgresql.org/docs/current/ddl-priv.html) query: format!( "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser", db.name.pg_quote() @@ -473,7 +886,10 @@ async fn get_operations<'a>( let edb = match databases.get(&db.name) { Some(edb) => edb, None => { - warn!("skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", subphase, db.name); + warn!( + "skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", + subphase, db.name + ); return Ok(Box::new(empty())); } }; @@ -491,9 +907,11 @@ async fn get_operations<'a>( PerDatabasePhase::DropLogicalSubscriptions => { match &db { DB::UserDB(db) => { + let (db_name, outer_tag) = db.name.pg_quote_dollar(); let drop_subscription_query: String = format!( include_str!("sql/drop_subscriptions.sql"), - datname_str = escape_literal(&db.name), + datname_str = db_name, + outer_tag = outer_tag, ); let operations = vec![Operation { @@ -532,6 +950,7 @@ async fn get_operations<'a>( DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(), DB::UserDB(db) => db.owner.pg_quote(), }; + let (escaped_role, outer_tag) = op.name.pg_quote_dollar(); Some(vec![ // This will reassign all dependent objects to the db owner @@ -546,7 +965,9 @@ async fn get_operations<'a>( Operation { query: format!( include_str!("sql/pre_drop_role_revoke_privileges.sql"), - role_name = 
quoted, + // N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` + role_name = escaped_role, + outer_tag = outer_tag, ), comment: None, }, @@ -571,12 +992,14 @@ async fn get_operations<'a>( DB::SystemDB => return Ok(Box::new(empty())), DB::UserDB(db) => db, }; + let (db_owner, outer_tag) = db.owner.pg_quote_dollar(); let operations = vec![ Operation { query: format!( include_str!("sql/set_public_schema_owner.sql"), - db_owner = db.owner.pg_quote() + db_owner = db_owner, + outer_tag = outer_tag, ), comment: None, }, @@ -604,6 +1027,25 @@ async fn get_operations<'a>( } Ok(Box::new(empty())) } + ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"), + comment: Some(String::from("create pgaudit extensions")), + }))), + ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"), + comment: Some(String::from("create pgauditlogtofile extensions")), + }))), + // Disable pgaudit logging for postgres database. + // Postgres is neon system database used by monitors + // and compute_ctl tuning functions and thus generates a lot of noise. + // We do not consider data stored in this database as sensitive. 
+ ApplySpecPhase::DisablePostgresDBPgAudit => { + let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'"; + Ok(Box::new(once(Operation { + query: query.to_string(), + comment: Some(query.to_string()), + }))) + } ApplySpecPhase::HandleNeonExtension => { let operations = vec![ Operation { diff --git a/compute_tools/src/sql/create_neon_superuser.sql b/compute_tools/src/sql/create_neon_superuser.sql new file mode 100644 index 0000000000..300645627b --- /dev/null +++ b/compute_tools/src/sql/create_neon_superuser.sql @@ -0,0 +1,8 @@ +DO $$ + BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser') + THEN + CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data; + END IF; + END +$$; diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index 03e8e158fa..f5d9420130 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -1,4 +1,4 @@ -DO $$ +DO ${outer_tag}$ DECLARE subname TEXT; BEGIN @@ -9,4 +9,4 @@ BEGIN EXECUTE format('DROP SUBSCRIPTION %I;', subname); END LOOP; END; -$$; +${outer_tag}$; diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql index cdaa7071d3..734607be02 100644 --- a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql +++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql @@ -1,8 +1,7 @@ -SET SESSION ROLE neon_superuser; - -DO $$ +DO ${outer_tag}$ DECLARE schema TEXT; + grantor TEXT; revoke_query TEXT; BEGIN FOR schema IN @@ -15,14 +14,25 @@ BEGIN -- ii) it's easy to add more schemas to the list if needed. 
WHERE schema_name IN ('public') LOOP - revoke_query := format( - 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;', - schema - ); + FOR grantor IN EXECUTE + format( + 'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s', + -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` + quote_literal({role_name}) + ) + LOOP + EXECUTE format('SET LOCAL ROLE %I', grantor); - EXECUTE revoke_query; + revoke_query := format( + 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I', + schema, + -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` + {role_name}, + grantor + ); + + EXECUTE revoke_query; + END LOOP; END LOOP; END; -$$; - -RESET ROLE; +${outer_tag}$; diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql index fd061a713e..dc502c6d2d 100644 --- a/compute_tools/src/sql/set_public_schema_owner.sql +++ b/compute_tools/src/sql/set_public_schema_owner.sql @@ -1,5 +1,4 @@ -DO -$$ +DO ${outer_tag}$ DECLARE schema_owner TEXT; BEGIN @@ -16,8 +15,8 @@ $$ IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin' THEN - ALTER SCHEMA public OWNER TO {db_owner}; + EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner}); END IF; END IF; END -$$; \ No newline at end of file +${outer_tag}$; \ No newline at end of file diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql index 6c4343a589..36dc648beb 100644 --- a/compute_tools/src/sql/unset_template_for_drop_dbs.sql +++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql @@ -1,12 +1,12 @@ -DO $$ +DO ${outer_tag}$ BEGIN IF EXISTS( SELECT 1 FROM pg_catalog.pg_database - WHERE datname = {datname_str} + WHERE datname = {datname} ) THEN - ALTER DATABASE {datname} is_template false; + EXECUTE format('ALTER DATABASE %I is_template false', {datname}); END 
IF; END -$$; \ No newline at end of file +${outer_tag}$; diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs index 024c5b338e..ed27a7cba4 100644 --- a/compute_tools/src/swap.rs +++ b/compute_tools/src/swap.rs @@ -1,10 +1,11 @@ use std::path::Path; -use anyhow::{anyhow, Context}; -use tracing::warn; +use anyhow::{Context, anyhow}; +use tracing::{instrument, warn}; pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap"; +#[instrument] pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { // run `/neonvm/bin/resize-swap --once {size_bytes}` // diff --git a/compute_tools/src/tls.rs b/compute_tools/src/tls.rs new file mode 100644 index 0000000000..5a310d8ac4 --- /dev/null +++ b/compute_tools/src/tls.rs @@ -0,0 +1,118 @@ +use std::{io::Write, os::unix::fs::OpenOptionsExt, path::Path, time::Duration}; + +use anyhow::{Context, Result, bail}; +use compute_api::responses::TlsConfig; +use ring::digest; +use spki::ObjectIdentifier; +use spki::der::{Decode, PemReader}; +use x509_cert::Certificate; + +#[derive(Clone, Copy)] +pub struct CertDigest(digest::Digest); + +pub async fn watch_cert_for_changes(cert_path: String) -> tokio::sync::watch::Receiver { + let mut digest = compute_digest(&cert_path).await; + let (tx, rx) = tokio::sync::watch::channel(digest); + tokio::spawn(async move { + while !tx.is_closed() { + let new_digest = compute_digest(&cert_path).await; + if digest.0.as_ref() != new_digest.0.as_ref() { + digest = new_digest; + _ = tx.send(digest); + } + + tokio::time::sleep(Duration::from_secs(60)).await + } + }); + rx +} + +async fn compute_digest(cert_path: &str) -> CertDigest { + loop { + match try_compute_digest(cert_path).await { + Ok(d) => break d, + Err(e) => { + tracing::error!("could not read cert file {e:?}"); + tokio::time::sleep(Duration::from_secs(1)).await + } + } + } +} + +async fn try_compute_digest(cert_path: &str) -> Result { + let data = tokio::fs::read(cert_path).await?; + // sha256 is extremely collision resistent. 
can safely assume the digest to be unique + Ok(CertDigest(digest::digest(&digest::SHA256, &data))) +} + +pub const SERVER_CRT: &str = "server.crt"; +pub const SERVER_KEY: &str = "server.key"; + +pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) { + loop { + match try_update_key_path_blocking(pg_data, tls_config) { + Ok(()) => break, + Err(e) => { + tracing::error!("could not create key file {e:?}"); + std::thread::sleep(Duration::from_secs(1)) + } + } + } +} + +// Postgres requires the keypath be "secure". This means +// 1. Owned by the postgres user. +// 2. Have permission 600. +fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Result<()> { + let key = std::fs::read_to_string(&tls_config.key_path)?; + let crt = std::fs::read_to_string(&tls_config.cert_path)?; + + // to mitigate a race condition during renewal. + verify_key_cert(&key, &crt)?; + + let mut key_file = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .mode(0o600) + .open(pg_data.join(SERVER_KEY))?; + + let mut crt_file = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .mode(0o600) + .open(pg_data.join(SERVER_CRT))?; + + key_file.write_all(key.as_bytes())?; + crt_file.write_all(crt.as_bytes())?; + + Ok(()) +} + +fn verify_key_cert(key: &str, cert: &str) -> Result<()> { + const ECDSA_WITH_SHA256: ObjectIdentifier = ObjectIdentifier::new_unwrap("1.2.840.10045.4.3.2"); + + let cert = Certificate::decode(&mut PemReader::new(cert.as_bytes()).context("pem reader")?) 
+ .context("decode cert")?; + + match cert.signature_algorithm.oid { + ECDSA_WITH_SHA256 => { + let key = p256::SecretKey::from_sec1_pem(key).context("parse key")?; + + let a = key.public_key().to_sec1_bytes(); + let b = cert + .tbs_certificate + .subject_public_key_info + .subject_public_key + .raw_bytes(); + + if *a != *b { + bail!("private key file does not match certificate") + } + } + _ => bail!("unknown TLS key type"), + } + + Ok(()) +} diff --git a/compute_tools/tests/config_test.rs b/compute_tools/tests/config_test.rs index 9ab16b1930..7b2bff23d5 100644 --- a/compute_tools/tests/config_test.rs +++ b/compute_tools/tests/config_test.rs @@ -1,7 +1,7 @@ #[cfg(test)] mod config_tests { - use std::fs::{remove_file, File}; + use std::fs::{File, remove_file}; use std::io::{Read, Write}; use std::path::Path; diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 4961bc293d..b72c1293ee 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -61,6 +61,24 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\""); } + #[test] + fn ident_pg_quote_dollar() { + let test_cases = vec![ + ("name", ("$x$name$x$", "xx")), + ("name$", ("$x$name$$x$", "xx")), + ("name$$", ("$x$name$$$x$", "xx")), + ("name$$$", ("$x$name$$$$x$", "xx")), + ("name$$$$", ("$x$name$$$$$x$", "xx")), + ("name$x$", ("$xx$name$x$$xx$", "xxx")), + ]; + + for (input, expected) in test_cases { + let (escaped, tag) = PgIdent::from(input).pg_quote_dollar(); + assert_eq!(escaped, expected.0); + assert_eq!(tag, expected.1); + } + } + #[test] fn generic_options_search() { let generic_options: GenericOptions = Some(vec![ diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index c668e68402..1eac4f7ff0 100644 --- a/control_plane/src/background_process.rs +++ 
b/control_plane/src/background_process.rs @@ -25,7 +25,7 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use nix::errno::Errno; use nix::fcntl::{FcntlArg, FdFlag}; -use nix::sys::signal::{kill, Signal}; +use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; use utils::pid_file::{self, PidFileRead}; diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 02d793400a..747268f80b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -5,7 +5,16 @@ //! easier to work with locally. The python tests in `test_runner` //! rely on `neon_local` to set up the environment for each test. //! -use anyhow::{anyhow, bail, Context, Result}; +use std::borrow::Cow; +use std::collections::{BTreeSet, HashMap}; +use std::fs::File; +use std::os::fd::AsRawFd; +use std::path::PathBuf; +use std::process::exit; +use std::str::FromStr; +use std::time::Duration; + +use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; @@ -19,7 +28,7 @@ use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; use control_plane::{broker, local_env}; -use nix::fcntl::{flock, FlockArg}; +use nix::fcntl::{FlockArg, flock}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, @@ -27,31 +36,24 @@ use pageserver_api::config::{ use pageserver_api::controller_api::{ NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest, }; -use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo}; +use pageserver_api::models::{ + ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, +}; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; +use 
safekeeper_api::membership::SafekeeperGeneration; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; -use std::borrow::Cow; -use std::collections::{BTreeSet, HashMap}; -use std::fs::File; -use std::os::fd::AsRawFd; -use std::path::PathBuf; -use std::process::exit; -use std::str::FromStr; -use std::time::Duration; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use tokio::task::JoinSet; use url::Host; -use utils::{ - auth::{Claims, Scope}, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, - project_git_version, -}; +use utils::auth::{Claims, Scope}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use utils::project_git_version; // Default id of a safekeeper node, if not specified on the command line. const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1); @@ -597,7 +599,15 @@ struct EndpointStartCmdArgs { #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, - #[clap(long)] + #[clap( + long, + help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations." + )] + safekeepers_generation: Option, + #[clap( + long, + help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override." 
+ )] safekeepers: Option, #[clap( @@ -618,9 +628,9 @@ struct EndpointStartCmdArgs { )] allow_multiple: bool, - #[clap(short = 't', long, help = "timeout until we fail the command")] - #[arg(default_value = "10s")] - start_timeout: humantime::Duration, + #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")] + #[arg(default_value = "90s")] + start_timeout: Duration, } #[derive(clap::Args)] @@ -887,20 +897,6 @@ fn print_timeline( Ok(()) } -/// Returns a map of timeline IDs to timeline_id@lsn strings. -/// Connects to the pageserver to query this information. -async fn get_timeline_infos( - env: &local_env::LocalEnv, - tenant_shard_id: &TenantShardId, -) -> Result> { - Ok(get_default_pageserver(env) - .timeline_list(tenant_shard_id) - .await? - .into_iter() - .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) - .collect()) -} - /// Helper function to get tenant id from an optional --tenant_id option or from the config file fn get_tenant_id( tenant_id_arg: Option, @@ -935,7 +931,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { let init_conf: NeonLocalInitConf = if let Some(config_path) = &args.config { // User (likely the Python test suite) provided a description of the environment. 
if args.num_pageservers.is_some() { - bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); + bail!( + "Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead" + ); } // load and parse the file let contents = std::fs::read_to_string(config_path).with_context(|| { @@ -967,6 +965,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { id: pageserver_id, listen_pg_addr: format!("127.0.0.1:{pg_port}"), listen_http_addr: format!("127.0.0.1:{http_port}"), + listen_https_addr: None, pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, other: Default::default(), @@ -980,7 +979,8 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { neon_distrib_dir: None, default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), storage_controller: None, - control_plane_compute_hook_api: None, + control_plane_hooks_api: None, + generate_local_ssl_certs: false, } }; @@ -1131,12 +1131,16 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any let tenant_id = get_tenant_id(args.tenant_id, env)?; let tenant_conf: HashMap<_, _> = args.config.iter().flat_map(|c| c.split_once(':')).collect(); + let config = PageServerNode::parse_config(tenant_conf)?; - pageserver - .tenant_config(tenant_id, tenant_conf) + let req = TenantConfigRequest { tenant_id, config }; + + let storage_controller = StorageController::from_env(env); + storage_controller + .set_tenant_config(&req) .await .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; - println!("tenant {tenant_id} successfully configured on the pageserver"); + println!("tenant {tenant_id} successfully configured via storcon"); } } Ok(()) @@ -1251,12 +1255,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is 
attached, and query there. let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?; - let timeline_infos = get_timeline_infos(env, &tenant_shard_id) - .await - .unwrap_or_else(|e| { - eprintln!("Failed to load timeline info: {}", e); - HashMap::new() - }); let timeline_name_mappings = env.timeline_name_mappings(); @@ -1285,12 +1283,9 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res lsn.to_string() } _ => { - // -> primary endpoint or hot replica - // Use the LSN at the end of the timeline. - timeline_infos - .get(&endpoint.timeline_id) - .map(|bi| bi.last_record_lsn.to_string()) - .unwrap_or_else(|| "?".to_string()) + // As the LSN here refers to the one that the compute is started with, + // we display nothing as it is a primary/hot standby compute. + "---".to_string() } }; @@ -1338,10 +1333,14 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res match (mode, args.hot_standby) { (ComputeMode::Static(_), true) => { - bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") + bail!( + "Cannot start a node in hot standby mode when it is already configured as a static replica" + ) } (ComputeMode::Primary, true) => { - bail!("Cannot start a node as a hot standby replica, it is already configured as primary node") + bail!( + "Cannot start a node as a hot standby replica, it is already configured as primary node" + ) } _ => {} } @@ -1368,6 +1367,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res let pageserver_id = args.endpoint_pageserver_id; let remote_ext_config = &args.remote_ext_config; + let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? 
{ @@ -1443,11 +1443,13 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res endpoint .start( &auth_token, + safekeepers_generation, safekeepers, pageservers, remote_ext_config.as_ref(), stripe_size.0 as usize, args.create_test_user, + args.start_timeout, ) .await?; } diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index c8ac5d8981..1b507bb384 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -8,7 +8,6 @@ use std::time::Duration; use anyhow::Context; - use camino::Utf8PathBuf; use crate::{background_process, local_env}; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 407578abb8..b46d616827 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -37,29 +37,24 @@ //! ``` //! use std::collections::BTreeMap; -use std::net::IpAddr; -use std::net::Ipv4Addr; -use std::net::SocketAddr; -use std::net::TcpStream; +use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream}; use std::path::PathBuf; use std::process::Command; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::SystemTime; -use std::time::UNIX_EPOCH; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{Context, Result, anyhow, bail}; use compute_api::requests::ConfigurationRequest; -use compute_api::responses::ComputeCtlConfig; -use compute_api::spec::Database; -use compute_api::spec::PgIdent; -use compute_api::spec::RemoteExtSpec; -use compute_api::spec::Role; -use nix::sys::signal::kill; -use nix::sys::signal::Signal; +use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse}; +use compute_api::spec::{ + Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, + RemoteExtSpec, Role, +}; +use nix::sys::signal::{Signal, kill}; use pageserver_api::shard::ShardStripeSize; use reqwest::header::CONTENT_TYPE; +use 
safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; use tracing::debug; use url::Host; @@ -69,9 +64,6 @@ use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; use crate::storage_controller::StorageController; -use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; -use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec}; - // contents of a endpoint.json file #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct EndpointConf { @@ -237,7 +229,9 @@ impl ComputeControlPlane { }); if let Some((key, _)) = duplicates.next() { - bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported."); + bail!( + "attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported." 
+ ); } } Ok(()) @@ -584,14 +578,17 @@ impl Endpoint { Ok(safekeeper_connstrings) } + #[allow(clippy::too_many_arguments)] pub async fn start( &self, auth_token: &Option, + safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, remote_ext_config: Option<&String>, shard_stripe_size: usize, create_test_user: bool, + start_timeout: Duration, ) -> Result<()> { if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); @@ -663,6 +660,7 @@ impl Endpoint { timeline_id: Some(self.timeline_id), mode: self.mode, pageserver_connstring: Some(pageserver_connstring), + safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), safekeeper_connstrings, storage_auth_token: auth_token.clone(), remote_extensions, @@ -671,6 +669,7 @@ impl Endpoint { local_proxy_config: None, reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, + audit_log_level: ComputeAudit::Disabled, }; // this strange code is needed to support respec() in tests @@ -778,17 +777,18 @@ impl Endpoint { std::fs::write(pidfile_path, pid.to_string())?; // Wait for it to start - let mut attempt = 0; const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); - const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min + let start_at = Instant::now(); loop { - attempt += 1; match self.get_status().await { Ok(state) => { match state.status { ComputeStatus::Init => { - if attempt == MAX_ATTEMPTS { - bail!("compute startup timed out; still in Init state"); + if Instant::now().duration_since(start_at) > start_timeout { + bail!( + "compute startup timed out {:?}; still in Init state", + start_timeout + ); } // keep retrying } @@ -815,8 +815,11 @@ impl Endpoint { } } Err(e) => { - if attempt == MAX_ATTEMPTS { - return Err(e).context("timed out waiting to connect to compute_ctl HTTP"); + if Instant::now().duration_since(start_at) > start_timeout { + return Err(e).context(format!( + 
"timed out {:?} waiting to connect to compute_ctl HTTP", + start_timeout, + )); } } } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2fe4cd5202..f0a11106bd 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,28 +3,22 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. -use anyhow::{bail, Context}; +use std::collections::HashMap; +use std::net::{IpAddr, Ipv4Addr, SocketAddr}; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use std::time::Duration; +use std::{env, fs}; +use anyhow::{Context, bail}; use clap::ValueEnum; use postgres_backend::AuthType; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::env; -use std::fs; -use std::net::IpAddr; -use std::net::Ipv4Addr; -use std::net::SocketAddr; -use std::path::{Path, PathBuf}; -use std::process::{Command, Stdio}; -use std::time::Duration; -use utils::{ - auth::{encode_from_key_file, Claims}, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, -}; +use utils::auth::{Claims, encode_from_key_file}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; -use crate::pageserver::PageServerNode; -use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR; +use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode}; use crate::safekeeper::SafekeeperNode; pub const DEFAULT_PG_VERSION: u32 = 16; @@ -78,15 +72,19 @@ pub struct LocalEnv { // be propagated into each pageserver's configuration. pub control_plane_api: Url, - // Control plane upcall API for storage controller. If set, this will be propagated into the + // Control plane upcall APIs for storage controller. If set, this will be propagated into the // storage controller's configuration. 
- pub control_plane_compute_hook_api: Option, + pub control_plane_hooks_api: Option, /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". pub branch_name_mappings: HashMap>, + + /// Flag to generate SSL certificates for components that need it. + /// Also generates root CA certificate that is used to sign all other certificates. + pub generate_local_ssl_certs: bool, } /// On-disk state stored in `.neon/config`. @@ -106,8 +104,13 @@ pub struct OnDiskConfig { pub pageservers: Vec, pub safekeepers: Vec, pub control_plane_api: Option, + pub control_plane_hooks_api: Option, pub control_plane_compute_hook_api: Option, branch_name_mappings: HashMap>, + // Note: skip serializing because in compat tests old storage controller fails + // to load new config file. May be removed after this field is in release branch. + #[serde(skip_serializing_if = "std::ops::Not::not")] + pub generate_local_ssl_certs: bool, } fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result, D::Error> @@ -134,7 +137,8 @@ pub struct NeonLocalInitConf { pub pageservers: Vec, pub safekeepers: Vec, pub control_plane_api: Option, - pub control_plane_compute_hook_api: Option>, + pub control_plane_hooks_api: Option, + pub generate_local_ssl_certs: bool, } /// Broker config for cluster internal communication. @@ -145,7 +149,7 @@ pub struct NeonBroker { pub listen_addr: SocketAddr, } -/// Broker config for cluster internal communication. +/// A part of storage controller's config the neon_local knows about. 
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct NeonStorageControllerConf { @@ -171,6 +175,12 @@ pub struct NeonStorageControllerConf { #[serde(with = "humantime_serde")] pub long_reconcile_threshold: Option, + + pub use_https_pageserver_api: bool, + + pub timelines_onto_safekeepers: bool, + + pub use_https_safekeeper_api: bool, } impl NeonStorageControllerConf { @@ -194,6 +204,9 @@ impl Default for NeonStorageControllerConf { max_secondary_lag_bytes: None, heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, long_reconcile_threshold: None, + use_https_pageserver_api: false, + timelines_onto_safekeepers: false, + use_https_safekeeper_api: false, } } } @@ -223,6 +236,7 @@ pub struct PageServerConf { pub id: NodeId, pub listen_pg_addr: String, pub listen_http_addr: String, + pub listen_https_addr: Option, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, pub no_sync: bool, @@ -234,6 +248,7 @@ impl Default for PageServerConf { id: NodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), + listen_https_addr: None, pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, no_sync: false, @@ -249,6 +264,7 @@ pub struct NeonLocalInitPageserverConf { pub id: NodeId, pub listen_pg_addr: String, pub listen_http_addr: String, + pub listen_https_addr: Option, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, #[serde(default, skip_serializing_if = "std::ops::Not::not")] @@ -263,6 +279,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { id, listen_pg_addr, listen_http_addr, + listen_https_addr, pg_auth_type, http_auth_type, no_sync, @@ -272,6 +289,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { id: *id, listen_pg_addr: listen_pg_addr.clone(), listen_http_addr: listen_http_addr.clone(), + listen_https_addr: listen_https_addr.clone(), pg_auth_type: *pg_auth_type, http_auth_type: *http_auth_type, no_sync: *no_sync, @@ -286,6 +304,7 @@ pub struct SafekeeperConf { pub 
pg_port: u16, pub pg_tenant_only_port: Option, pub http_port: u16, + pub https_port: Option, pub sync: bool, pub remote_storage: Option, pub backup_threads: Option, @@ -300,6 +319,7 @@ impl Default for SafekeeperConf { pg_port: 0, pg_tenant_only_port: None, http_port: 0, + https_port: None, sync: true, remote_storage: None, backup_threads: None, @@ -416,6 +436,41 @@ impl LocalEnv { } } + pub fn ssl_ca_cert_path(&self) -> Option { + if self.generate_local_ssl_certs { + Some(self.base_data_dir.join("rootCA.crt")) + } else { + None + } + } + + pub fn ssl_ca_key_path(&self) -> Option { + if self.generate_local_ssl_certs { + Some(self.base_data_dir.join("rootCA.key")) + } else { + None + } + } + + pub fn generate_ssl_ca_cert(&self) -> anyhow::Result<()> { + let cert_path = self.ssl_ca_cert_path().unwrap(); + let key_path = self.ssl_ca_key_path().unwrap(); + if !fs::exists(cert_path.as_path())? { + generate_ssl_ca_cert(cert_path.as_path(), key_path.as_path())?; + } + Ok(()) + } + + pub fn generate_ssl_cert(&self, cert_path: &Path, key_path: &Path) -> anyhow::Result<()> { + self.generate_ssl_ca_cert()?; + generate_ssl_cert( + cert_path, + key_path, + self.ssl_ca_cert_path().unwrap().as_path(), + self.ssl_ca_key_path().unwrap().as_path(), + ) + } + /// Inspect the base data directory and extract the instance id and instance directory path /// for all storage controller instances pub async fn storage_controller_instances(&self) -> std::io::Result> { @@ -465,7 +520,9 @@ impl LocalEnv { if old_timeline_id == &timeline_id { Ok(()) } else { - bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}"); + bail!( + "branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}" + ); } } else { existing_values.push((tenant_id, timeline_id)); @@ -521,8 +578,10 @@ impl LocalEnv { pageservers, safekeepers, control_plane_api, - control_plane_compute_hook_api, + 
control_plane_hooks_api, + control_plane_compute_hook_api: _, branch_name_mappings, + generate_local_ssl_certs, } = on_disk_config; LocalEnv { base_data_dir: repopath.to_owned(), @@ -535,8 +594,9 @@ impl LocalEnv { pageservers, safekeepers, control_plane_api: control_plane_api.unwrap(), - control_plane_compute_hook_api, + control_plane_hooks_api, branch_name_mappings, + generate_local_ssl_certs, } }; @@ -572,6 +632,7 @@ impl LocalEnv { struct PageserverConfigTomlSubset { listen_pg_addr: String, listen_http_addr: String, + listen_https_addr: Option, pg_auth_type: AuthType, http_auth_type: AuthType, #[serde(default)] @@ -596,6 +657,7 @@ impl LocalEnv { let PageserverConfigTomlSubset { listen_pg_addr, listen_http_addr, + listen_https_addr, pg_auth_type, http_auth_type, no_sync, @@ -613,6 +675,7 @@ impl LocalEnv { }, listen_pg_addr, listen_http_addr, + listen_https_addr, pg_auth_type, http_auth_type, no_sync, @@ -638,8 +701,10 @@ impl LocalEnv { pageservers: vec![], // it's skip_serializing anyway safekeepers: self.safekeepers.clone(), control_plane_api: Some(self.control_plane_api.clone()), - control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), + control_plane_hooks_api: self.control_plane_hooks_api.clone(), + control_plane_compute_hook_api: None, branch_name_mappings: self.branch_name_mappings.clone(), + generate_local_ssl_certs: self.generate_local_ssl_certs, }, ) } @@ -721,7 +786,8 @@ impl LocalEnv { pageservers, safekeepers, control_plane_api, - control_plane_compute_hook_api, + generate_local_ssl_certs, + control_plane_hooks_api, } = conf; // Find postgres binaries. 
@@ -768,16 +834,24 @@ impl LocalEnv { pageservers: pageservers.iter().map(Into::into).collect(), safekeepers, control_plane_api: control_plane_api.unwrap(), - control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), + control_plane_hooks_api, branch_name_mappings: Default::default(), + generate_local_ssl_certs, }; + if generate_local_ssl_certs { + env.generate_ssl_ca_cert()?; + } + // create endpoints dir fs::create_dir_all(env.endpoints_path())?; // create safekeeper dirs for safekeeper in &env.safekeepers { fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?; + SafekeeperNode::from_env(&env, safekeeper) + .initialize() + .context("safekeeper init failed")?; } // initialize pageserver state @@ -855,3 +929,80 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow } Ok(()) } + +fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()> { + // openssl req -x509 -newkey rsa:2048 -nodes -subj "/CN=Neon Local CA" -days 36500 \ + // -out rootCA.crt -keyout rootCA.key + let keygen_output = Command::new("openssl") + .args([ + "req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500", + ]) + .args(["-subj", "/CN=Neon Local CA"]) + .args(["-out", cert_path.to_str().unwrap()]) + .args(["-keyout", key_path.to_str().unwrap()]) + .output() + .context("failed to generate CA certificate")?; + if !keygen_output.status.success() { + bail!( + "openssl failed: '{}'", + String::from_utf8_lossy(&keygen_output.stderr) + ); + } + Ok(()) +} + +fn generate_ssl_cert( + cert_path: &Path, + key_path: &Path, + ca_cert_path: &Path, + ca_key_path: &Path, +) -> anyhow::Result<()> { + // Generate Certificate Signing Request (CSR). 
+ let mut csr_path = cert_path.to_path_buf(); + csr_path.set_extension(".csr"); + + // openssl req -new -nodes -newkey rsa:2048 -keyout server.key -out server.csr \ + // -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" + let keygen_output = Command::new("openssl") + .args(["req", "-new", "-nodes"]) + .args(["-newkey", "rsa:2048"]) + .args(["-subj", "/CN=localhost"]) + .args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"]) + .args(["-keyout", key_path.to_str().unwrap()]) + .args(["-out", csr_path.to_str().unwrap()]) + .output() + .context("failed to generate CSR")?; + if !keygen_output.status.success() { + bail!( + "openssl failed: '{}'", + String::from_utf8_lossy(&keygen_output.stderr) + ); + } + + // Sign CSR with CA key. + // + // openssl x509 -req -in server.csr -CA rootCA.crt -CAkey rootCA.key -CAcreateserial \ + // -out server.crt -days 36500 -copy_extensions copyall + let keygen_output = Command::new("openssl") + .args(["x509", "-req"]) + .args(["-in", csr_path.to_str().unwrap()]) + .args(["-CA", ca_cert_path.to_str().unwrap()]) + .args(["-CAkey", ca_key_path.to_str().unwrap()]) + .arg("-CAcreateserial") + .args(["-out", cert_path.to_str().unwrap()]) + .args(["-days", "36500"]) + .args(["-copy_extensions", "copyall"]) + .output() + .context("failed to sign CSR")?; + if !keygen_output.status.success() { + bail!( + "openssl failed: '{}'", + String::from_utf8_lossy(&keygen_output.stderr) + ); + } + + // Remove CSR file as it's not needed anymore. + fs::remove_file(csr_path)?; + + Ok(()) +} diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 2bf89b7bfa..eeaad10d26 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -7,7 +7,6 @@ //! ``` //! 
use std::collections::HashMap; - use std::io; use std::io::Write; use std::num::NonZeroU64; @@ -15,22 +14,20 @@ use std::path::PathBuf; use std::str::FromStr; use std::time::Duration; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use camino::Utf8PathBuf; use pageserver_api::models::{self, TenantInfo, TimelineInfo}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use postgres_backend::AuthType; -use postgres_connection::{parse_host_port, PgConnectionConfig}; +use postgres_connection::{PgConnectionConfig, parse_host_port}; +use reqwest::Certificate; use utils::auth::{Claims, Scope}; -use utils::id::NodeId; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; -use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf}; -use crate::{background_process, local_env::LocalEnv}; +use crate::background_process; +use crate::local_env::{LocalEnv, NeonLocalInitPageserverConf, PageServerConf}; /// Directory within .neon which will be used by default for LocalFs remote storage. 
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver"; @@ -53,12 +50,29 @@ impl PageServerNode { let (host, port) = parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); let port = port.unwrap_or(5432); + + let ssl_ca_cert = env.ssl_ca_cert_path().map(|ssl_ca_file| { + let buf = std::fs::read(ssl_ca_file).expect("SSL root CA file should exist"); + Certificate::from_pem(&buf).expect("CA certificate should be valid") + }); + + let endpoint = if env.storage_controller.use_https_pageserver_api { + format!( + "https://{}", + conf.listen_https_addr.as_ref().expect( + "listen https address should be specified if use_https_pageserver_api is on" + ) + ) + } else { + format!("http://{}", conf.listen_http_addr) + }; + Self { pg_connection_config: PgConnectionConfig::new_host_port(host, port), conf: conf.clone(), env: env.clone(), http_client: mgmt_api::Client::new( - format!("http://{}", conf.listen_http_addr), + endpoint, { match conf.http_auth_type { AuthType::Trust => None, @@ -69,7 +83,9 @@ impl PageServerNode { } } .as_deref(), - ), + ssl_ca_cert, + ) + .expect("Client constructs with no errors"), } } @@ -81,7 +97,11 @@ impl PageServerNode { &self, conf: NeonLocalInitPageserverConf, ) -> anyhow::Result { - assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); + assert_eq!( + &PageServerConf::from(&conf), + &self.conf, + "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully" + ); // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) @@ -220,6 +240,13 @@ impl PageServerNode { .context("write identity toml")?; drop(identity_toml); + if self.env.generate_local_ssl_certs { + self.env.generate_ssl_cert( + datadir.join("server.crt").as_path(), + datadir.join("server.key").as_path(), + )?; + } + // TODO: invoke a TBD 
config-check command to validate that pageserver will start with the written config // Write metadata file, used by pageserver on startup to register itself with @@ -230,6 +257,15 @@ impl PageServerNode { parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr"); let http_port = http_port.unwrap_or(9898); + let https_port = match self.conf.listen_https_addr.as_ref() { + Some(https_addr) => { + let (_https_host, https_port) = + parse_host_port(https_addr).expect("Unable to parse listen_https_addr"); + Some(https_port.unwrap_or(9899)) + } + None => None, + }; + // Intentionally hand-craft JSON: this acts as an implicit format compat test // in case the pageserver-side structure is edited, and reflects the real life // situation: the metadata is written by some other script. @@ -240,6 +276,7 @@ impl PageServerNode { postgres_port: self.pg_connection_config.port(), http_host: "localhost".to_string(), http_port, + https_port, other: HashMap::from([( "availability_zone_id".to_string(), serde_json::json!(az_id), diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index 5aee12dc97..a824af9490 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -1,3 +1,6 @@ +use std::collections::HashMap; +use std::fmt; + /// /// Module for parsing postgresql.conf file. /// @@ -6,8 +9,6 @@ /// funny stuff like include-directives or funny escaping. 
use once_cell::sync::Lazy; use regex::Regex; -use std::collections::HashMap; -use std::fmt; /// In-memory representation of a postgresql.conf file #[derive(Default, Debug)] diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index ce7751fb14..231871852e 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -14,18 +14,15 @@ use std::{io, result}; use anyhow::Context; use camino::Utf8PathBuf; +use http_utils::error::HttpErrorBody; use postgres_connection::PgConnectionConfig; use reqwest::{IntoUrl, Method}; use thiserror::Error; - -use http_utils::error::HttpErrorBody; use utils::auth::{Claims, Scope}; use utils::id::NodeId; -use crate::{ - background_process, - local_env::{LocalEnv, SafekeeperConf}, -}; +use crate::background_process; +use crate::local_env::{LocalEnv, SafekeeperConf}; #[derive(Error, Debug)] pub enum SafekeeperHttpError { @@ -114,6 +111,18 @@ impl SafekeeperNode { .expect("non-Unicode path") } + /// Initializes a safekeeper node by creating all necessary files, + /// e.g. SSL certificates. 
+ pub fn initialize(&self) -> anyhow::Result<()> { + if self.env.generate_local_ssl_certs { + self.env.generate_ssl_cert( + &self.datadir_path().join("server.crt"), + &self.datadir_path().join("server.key"), + )?; + } + Ok(()) + } + pub async fn start( &self, extra_opts: &[String], @@ -199,6 +208,16 @@ impl SafekeeperNode { ]); } + if let Some(https_port) = self.conf.https_port { + args.extend([ + "--listen-https".to_owned(), + format!("{}:{}", self.listen_addr, https_port), + ]); + } + if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { + args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); + } + args.extend_from_slice(extra_opts); background_process::start_process( diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 0fadb9c5fe..0c78f2e18e 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -1,44 +1,36 @@ -use crate::{ - background_process, - local_env::{LocalEnv, NeonStorageControllerConf}, -}; +use std::ffi::OsStr; +use std::fs; +use std::net::SocketAddr; +use std::path::PathBuf; +use std::process::ExitStatus; +use std::str::FromStr; +use std::sync::OnceLock; +use std::time::{Duration, Instant}; + use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; -use pageserver_api::{ - controller_api::{ - NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, - TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest, - TenantShardMigrateResponse, - }, - models::{ - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, - }, - shard::{ShardStripeSize, TenantShardId}, +use pageserver_api::controller_api::{ + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + TenantCreateResponse, TenantLocateResponse, }; +use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo}; +use 
pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use reqwest::Method; -use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{ - ffi::OsStr, - fs, - net::SocketAddr, - path::PathBuf, - process::ExitStatus, - str::FromStr, - sync::OnceLock, - time::{Duration, Instant}, -}; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; use tokio::process::Command; use tracing::instrument; use url::Url; -use utils::{ - auth::{encode_from_key_file, Claims, Scope}, - id::{NodeId, TenantId}, -}; +use utils::auth::{Claims, Scope, encode_from_key_file}; +use utils::id::{NodeId, TenantId}; use whoami::username; +use crate::background_process; +use crate::local_env::{LocalEnv, NeonStorageControllerConf}; + pub struct StorageController { env: LocalEnv, private_key: Option>, @@ -96,7 +88,8 @@ pub struct AttachHookRequest { #[derive(Serialize, Deserialize)] pub struct AttachHookResponse { - pub gen: Option, + #[serde(rename = "gen")] + pub generation: Option, } #[derive(Serialize, Deserialize)] @@ -541,6 +534,18 @@ impl StorageController { args.push("--start-as-candidate".to_string()); } + if self.config.use_https_pageserver_api { + args.push("--use-https-pageserver-api".to_string()); + } + + if self.config.use_https_safekeeper_api { + args.push("--use-https-safekeeper-api".to_string()); + } + + if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { + args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); + } + if let Some(private_key) = &self.private_key { let claims = Claims::new(None, Scope::PageServerApi); let jwt_token = @@ -557,10 +562,8 @@ impl StorageController { args.push(format!("--public-key=\"{public_key}\"")); } - if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api { - args.push(format!( - "--compute-hook-url={control_plane_compute_hook_api}" - )); + if let Some(control_plane_hooks_api) = 
&self.env.control_plane_hooks_api { + args.push(format!("--control-plane-url={control_plane_hooks_api}")); } if let Some(split_threshold) = self.config.split_threshold.as_ref() { @@ -583,6 +586,10 @@ impl StorageController { self.env.base_data_dir.display() )); + if self.config.timelines_onto_safekeepers { + args.push("--timelines-onto-safekeepers".to_string()); + } + background_process::start_process( COMMAND, &instance_dir, @@ -779,7 +786,7 @@ impl StorageController { ) .await?; - Ok(response.gen) + Ok(response.generation) } #[instrument(skip(self))] @@ -829,41 +836,6 @@ impl StorageController { .await } - #[instrument(skip(self))] - pub async fn tenant_migrate( - &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - ) -> anyhow::Result { - self.dispatch( - Method::PUT, - format!("control/v1/tenant/{tenant_shard_id}/migrate"), - Some(TenantShardMigrateRequest { - node_id, - migration_config: None, - }), - ) - .await - } - - #[instrument(skip(self), fields(%tenant_id, %new_shard_count))] - pub async fn tenant_split( - &self, - tenant_id: TenantId, - new_shard_count: u8, - new_stripe_size: Option, - ) -> anyhow::Result { - self.dispatch( - Method::PUT, - format!("control/v1/tenant/{tenant_id}/shard_split"), - Some(TenantShardSplitRequest { - new_shard_count, - new_stripe_size, - }), - ) - .await - } - #[instrument(skip_all, fields(node_id=%req.node_id))] pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> { self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req)) @@ -908,4 +880,9 @@ impl StorageController { ) .await } + + pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> anyhow::Result<()> { + self.dispatch(Method::PUT, "v1/tenant/config".to_string(), Some(req)) + .await + } } diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 40b86e4110..ae4bf9a519 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs 
@@ -1,34 +1,28 @@ -use futures::StreamExt; -use std::{ - collections::{HashMap, HashSet}, - str::FromStr, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; +use std::path::PathBuf; +use std::str::FromStr; +use std::time::Duration; use clap::{Parser, Subcommand}; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, - ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, - TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, - }, - models::{ - EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, - ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, - TenantShardSplitRequest, TenantShardSplitResponse, - }, - shard::{ShardStripeSize, TenantShardId}, -}; -use pageserver_client::mgmt_api::{self}; -use reqwest::{Method, StatusCode, Url}; -use utils::id::{NodeId, TenantId, TimelineId}; - +use futures::StreamExt; use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + AvailabilityZone, MigrationConfig, NodeAvailabilityWrapper, NodeConfigureRequest, + NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse, + PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, + ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, + SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, }; +use pageserver_api::models::{ + EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig, + TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest, + TenantShardSplitResponse, +}; +use pageserver_api::shard::{ShardStripeSize, TenantShardId}; +use 
pageserver_client::mgmt_api::{self}; +use reqwest::{Method, StatusCode, Url}; use storage_controller_client::control_api::Client; +use utils::id::{NodeId, TenantId, TimelineId}; #[derive(Subcommand, Debug)] enum Command { @@ -119,6 +113,15 @@ enum Command { tenant_shard_id: TenantShardId, #[arg(long)] node: NodeId, + #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + prewarm: bool, + #[arg(long, default_value_t = false, action = clap::ArgAction::Set)] + override_scheduler: bool, + }, + /// Watch the location of a tenant shard evolve, e.g. while expecting it to migrate + TenantShardWatch { + #[arg(long)] + tenant_shard_id: TenantShardId, }, /// Migrate the secondary location for a tenant shard to a specific pageserver. TenantShardMigrateSecondary { @@ -155,12 +158,6 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, - /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary - /// mode so that it can warm up content on a pageserver. - TenantWarmup { - #[arg(long)] - tenant_id: TenantId, - }, TenantSetPreferredAz { #[arg(long)] tenant_id: TenantId, @@ -276,6 +273,10 @@ struct Cli { /// a token with both scopes to use with this tool. jwt: Option, + #[arg(long)] + /// Trusted root CA certificate to use in https APIs. + ssl_ca_file: Option, + #[command(subcommand)] command: Command, } @@ -386,9 +387,17 @@ async fn main() -> anyhow::Result<()> { let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone()); + let ssl_ca_cert = match &cli.ssl_ca_file { + Some(ssl_ca_file) => { + let buf = tokio::fs::read(ssl_ca_file).await?; + Some(reqwest::Certificate::from_pem(&buf)?) 
+ } + None => None, + }; + let mut trimmed = cli.api.to_string(); trimmed.pop(); - let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref()); + let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref(), ssl_ca_cert)?; match cli.command { Command::NodeRegister { @@ -626,19 +635,43 @@ async fn main() -> anyhow::Result<()> { Command::TenantShardMigrate { tenant_shard_id, node, + prewarm, + override_scheduler, } => { - let req = TenantShardMigrateRequest { - node_id: node, - migration_config: None, + let migration_config = MigrationConfig { + prewarm, + override_scheduler, + ..Default::default() }; - storcon_client + let req = TenantShardMigrateRequest { + node_id: node, + origin_node_id: None, + migration_config, + }; + + match storcon_client .dispatch::( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate"), Some(req), ) - .await?; + .await + { + Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) => { + anyhow::bail!( + "Migration to {node} rejected, may require `--force` ({}) ", + msg + ); + } + Err(e) => return Err(e.into()), + Ok(_) => {} + } + + watch_tenant_shard(storcon_client, tenant_shard_id, Some(node)).await?; + } + Command::TenantShardWatch { tenant_shard_id } => { + watch_tenant_shard(storcon_client, tenant_shard_id, None).await?; } Command::TenantShardMigrateSecondary { tenant_shard_id, @@ -646,7 +679,8 @@ async fn main() -> anyhow::Result<()> { } => { let req = TenantShardMigrateRequest { node_id: node, - migration_config: None, + origin_node_id: None, + migration_config: MigrationConfig::default(), }; storcon_client @@ -831,97 +865,11 @@ async fn main() -> anyhow::Result<()> { ) .await?; } - Command::TenantWarmup { tenant_id } => { - let describe_response = storcon_client - .dispatch::<(), TenantDescribeResponse>( - Method::GET, - format!("control/v1/tenant/{tenant_id}"), - None, - ) - .await; - match describe_response { - Ok(describe) => { - if matches!(describe.policy, PlacementPolicy::Secondary) { - 
// Fine: it's already known to controller in secondary mode: calling - // again to put it into secondary mode won't cause problems. - } else { - anyhow::bail!("Tenant already present with policy {:?}", describe.policy); - } - } - Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => { - // Fine: this tenant isn't know to the storage controller yet. - } - Err(e) => { - // Unexpected API error - return Err(e.into()); - } - } - - vps_client - .location_config( - TenantShardId::unsharded(tenant_id), - pageserver_api::models::LocationConfig { - mode: pageserver_api::models::LocationConfigMode::Secondary, - generation: None, - secondary_conf: Some(LocationConfigSecondary { warm: true }), - shard_number: 0, - shard_count: 0, - shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0, - tenant_conf: TenantConfig::default(), - }, - None, - true, - ) - .await?; - - let describe_response = storcon_client - .dispatch::<(), TenantDescribeResponse>( - Method::GET, - format!("control/v1/tenant/{tenant_id}"), - None, - ) - .await?; - - let secondary_ps_id = describe_response - .shards - .first() - .unwrap() - .node_secondary - .first() - .unwrap(); - - println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}"); - loop { - let (status, progress) = vps_client - .tenant_secondary_download( - TenantShardId::unsharded(tenant_id), - Some(Duration::from_secs(10)), - ) - .await?; - println!( - "Progress: {}/{} layers, {}/{} bytes", - progress.layers_downloaded, - progress.layers_total, - progress.bytes_downloaded, - progress.bytes_total - ); - match status { - StatusCode::OK => { - println!("Download complete"); - break; - } - StatusCode::ACCEPTED => { - // Loop - } - _ => { - anyhow::bail!("Unexpected download status: {status}"); - } - } - } - } Command::TenantDrop { tenant_id, unclean } => { if !unclean { - anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. 
If you know what you're doing, add `--unclean` to proceed.") + anyhow::bail!( + "This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed." + ) } storcon_client .dispatch::<(), ()>( @@ -933,7 +881,9 @@ async fn main() -> anyhow::Result<()> { } Command::NodeDrop { node_id, unclean } => { if !unclean { - anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed.") + anyhow::bail!( + "This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed." + ) } storcon_client .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) @@ -1108,7 +1058,8 @@ async fn main() -> anyhow::Result<()> { format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), Some(TenantShardMigrateRequest { node_id: mv.to, - migration_config: None, + origin_node_id: Some(mv.from), + migration_config: MigrationConfig::default(), }), ) .await @@ -1287,3 +1238,68 @@ async fn main() -> anyhow::Result<()> { Ok(()) } + +static WATCH_INTERVAL: Duration = Duration::from_secs(5); + +async fn watch_tenant_shard( + storcon_client: Client, + tenant_shard_id: TenantShardId, + until_migrated_to: Option, +) -> anyhow::Result<()> { + if let Some(until_migrated_to) = until_migrated_to { + println!( + "Waiting for tenant shard {} to be migrated to node {}", + tenant_shard_id, until_migrated_to + ); + } + + loop { + let desc = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{}", tenant_shard_id.tenant_id), + None, + ) + .await?; + + // Output the current state of the tenant shard + let shard = desc + .shards + .iter() + .find(|s| 
s.tenant_shard_id == tenant_shard_id) + .ok_or(anyhow::anyhow!("Tenant shard not found"))?; + let summary = format!( + "attached: {} secondary: {} {}", + shard + .node_attached + .map(|n| format!("{}", n)) + .unwrap_or("none".to_string()), + shard + .node_secondary + .iter() + .map(|n| n.to_string()) + .collect::>() + .join(","), + if shard.is_reconciling { + "(reconciler active)" + } else { + "(reconciler idle)" + } + ); + println!("{}", summary); + + // Maybe drop out if we finished migration + if let Some(until_migrated_to) = until_migrated_to { + if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling { + println!( + "Tenant shard {} is now on node {}", + tenant_shard_id, until_migrated_to + ); + break; + } + } + + tokio::time::sleep(WATCH_INTERVAL).await; + } + Ok(()) +} diff --git a/deny.toml b/deny.toml index b551405568..ed7aa9ef9f 100644 --- a/deny.toml +++ b/deny.toml @@ -27,6 +27,10 @@ yanked = "warn" id = "RUSTSEC-2023-0071" reason = "the marvin attack only affects private key decryption, not public key signature verification" +[[advisories.ignore]] +id = "RUSTSEC-2024-0436" +reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact." 
+ # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 489d60f38c..95d4ff7b2a 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -186,7 +186,7 @@ services: neon-test-extensions: profiles: ["test-extensions"] - image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest} + image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} environment: - PGPASSWORD=cloud_admin entrypoint: diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 5b3cfc74eb..0f03d600a3 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -51,8 +51,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do done if [ $pg_version -ge 16 ]; then - docker cp ext-src $TEST_CONTAINER_NAME:/ - docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl" # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-v16.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-v16.patch diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-v17.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch 
rename to docker-compose/ext-src/pg_semver-src/test-upgrade-v17.patch diff --git a/docker-compose/ext-src/pgtap-src/test-upgrade.patch b/docker-compose/ext-src/pgtap-src/test-upgrade.patch index a4c46e93ce..c050ab8d00 100644 --- a/docker-compose/ext-src/pgtap-src/test-upgrade.patch +++ b/docker-compose/ext-src/pgtap-src/test-upgrade.patch @@ -7,7 +7,7 @@ index f255fe6..0a0fa65 100644 GENERATED_SCHEDULE_DEPS = $(TB_DIR)/all_tests $(TB_DIR)/exclude_tests REGRESS = --schedule $(TB_DIR)/run.sch # Set this again just to be safe -REGRESS_OPTS = --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF) -+REGRESS_OPTS = --use-existing --dbname=pgtap_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF) ++REGRESS_OPTS = --use-existing --dbname=contrib_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF) SETUP_SCH = test/schedule/main.sch # schedule to use for test setup; this can be forcibly changed by some targets! 
IGNORE_TESTS = $(notdir $(EXCLUDE_TEST_FILES:.sql=)) PARALLEL_TESTS = $(filter-out $(IGNORE_TESTS),$(filter-out $(SERIAL_TESTS),$(ALL_TESTS))) diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index 06d351b496..51d1e40802 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -6,12 +6,16 @@ generate_id() { local -n resvar=$1 printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM } -if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEWTAG}" ]; then - echo OLDTAG and NEWTAG must be defined +echo "${OLD_COMPUTE_TAG}" +echo "${NEW_COMPUTE_TAG}" +echo "${TEST_EXTENSIONS_TAG}" +if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then + echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set exit 1 fi export PG_VERSION=${PG_VERSION:-16} export PG_TEST_VERSION=${PG_VERSION} +# Waits for compute node is ready function wait_for_ready { TIME=0 while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do @@ -23,11 +27,45 @@ function wait_for_ready { exit 2 fi } +# Creates extensions. Gets a string with space-separated extensions as a parameter function create_extensions() { for ext in ${1}; do docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE" done } +# Creates a new timeline. Gets the parent ID and an extension name as parameters. 
+# Saves the timeline ID in the variable EXT_TIMELINE +function create_timeline() { + generate_id new_timeline_id + + PARAMS=( + -sbf + -X POST + -H "Content-Type: application/json" + -d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${1}\"}" + "http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . + EXT_TIMELINE[${2}]=${new_timeline_id} +} +# Checks if the timeline ID of the compute node is expected. Gets the timeline ID as a parameter +function check_timeline() { + TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") + if [ "${TID}" != "${1}" ]; then + echo Timeline mismatch + exit 1 + fi +} +# Restarts the compute node with the required compute tag and timeline. +# Accepts the tag for the compute node and the timeline as parameters. +function restart_compute() { + docker compose down compute compute_is_ready + COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready + wait_for_ready + check_timeline ${2} +} +declare -A EXT_TIMELINE EXTENSIONS='[ {"extname": "plv8", "extdir": "plv8-src"}, {"extname": "vector", "extdir": "pgvector-src"}, @@ -47,7 +85,7 @@ EXTENSIONS='[ {"extname": "pg_repack", "extdir": "pg_repack-src"} ]' EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -) -TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d +COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d wait_for_ready docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" @@ -55,13 +93,14 @@ create_extensions "${EXTNAMES}" query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')" new_vers=$(docker compose 
exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") docker compose --profile test-extensions down -TAG=${OLDTAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate +COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate wait_for_ready -docker compose cp ext-src neon-test-extensions:/ docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" -docker compose exec neon-test-extensions psql -c "CREATE DATABASE pgtap_regression" -docker compose exec neon-test-extensions psql -d pgtap_regression -c "CREATE EXTENSION pgtap" +tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id") +EXT_TIMELINE["main"]=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") +create_timeline "${EXT_TIMELINE["main"]}" init +restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE["init"]}" create_extensions "${EXTNAMES}" if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then exts="${EXTNAMES}" @@ -72,29 +111,13 @@ fi if [ -z "${exts}" ]; then echo "No extensions were upgraded" else - tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id") - timeline_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") for ext in ${exts}; do echo Testing ${ext}... + create_timeline "${EXT_TIMELINE["main"]}" ${ext} EXTDIR=$(echo ${EXTENSIONS} | jq -r '.[] | select(.extname=="'${ext}'") | .extdir') - generate_id new_timeline_id - PARAMS=( - -sbf - -X POST - -H "Content-Type: application/json" - -d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${timeline_id}\"}" - "http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/" - ) - result=$(curl "${PARAMS[@]}") - echo $result | jq . 
- TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} TAG=${OLDTAG} docker compose down compute compute_is_ready - COMPUTE_TAG=${NEWTAG} TAG=${OLDTAG} TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} docker compose up --quiet-pull -d --build compute compute_is_ready - wait_for_ready - TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") - if [ ${TID} != ${new_timeline_id} ]; then - echo Timeline mismatch - exit 1 - fi + restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}" + docker compose exec neon-test-extensions psql -d contrib_regression -c "CREATE EXTENSION ${ext} CASCADE" + restart_compute "${NEW_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}" docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}" if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then docker compose exec neon-test-extensions cat /ext-src/${EXTDIR}/regression.diffs diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/001-cluster-size-limits.md similarity index 100% rename from docs/rfcs/cluster-size-limits.md rename to docs/rfcs/001-cluster-size-limits.md diff --git a/docs/rfcs/041-rel-sparse-keyspace.md b/docs/rfcs/041-rel-sparse-keyspace.md new file mode 100644 index 0000000000..03e68bd5c1 --- /dev/null +++ b/docs/rfcs/041-rel-sparse-keyspace.md @@ -0,0 +1,201 @@ +# Sparse Keyspace for Relation Directories + +## Summary + +This is an RFC describing a new storage strategy for storing relation directories. + +## Motivation + +Postgres maintains a directory structure for databases and relations. In Neon, we store these information +by serializing the directory data in a single key (see `pgdatadir_mapping.rs`). + +```rust +// DbDir: +// 00 00000000 00000000 00000000 00 00000000 + +// RelDir: +// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) +``` + +We have a dedicated structure on the ingestion path to serialize the relation directory into this single key. 
+ +```rust +#[derive(Debug, Serialize, Deserialize, Default)] +pub(crate) struct RelDirectory { + // Set of relations that exist. (relfilenode, forknum) + // + // TODO: Store it as a btree or radix tree or something else that spans multiple + // key-value pairs, if you have a lot of relations + pub(crate) rels: HashSet<(Oid, u8)>, +} +``` + +The current codebase has the following three access patterns for the relation directory. + +1. Check if a relation exists. +2. List all relations. +3. Create/drop a relation. + +For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the +hash set. For (2), we get the reldir key and the hash set. For (3), we need first to get +and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back. + +If we have 100k relations in a database, we would have a 100k-large hash set. Then, every +relation create or drop would have to deserialize and serialize this 100k-large hash set. This makes the +relation create/drop process quadratic. When we check if a relation exists in the ingestion path, +we would have to deserialize this super big 100k-large key before checking if a single relation exists. + +In this RFC, we will propose a new way to store the reldir data in the sparse keyspace and propose how +to seamlessly migrate users to use the new keyspace. + +The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316). + +## Key Mapping + +We will use the recently introduced sparse keyspace to store actual data. Sparse keyspace was proposed in +[038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>` +for each of the databases (identified as `spcnode, dbnode`). We encode the `Oid` (`relnode, forknum`), +into the key.
+ +```plain +(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted +(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists +``` + +Assume all reldir data are stored in this new keyspace; the 3 reldir operations we mentioned before can be +implemented as follows. + +1. Check if a relation exists: check if the key maps to "exists". +2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix`. Extract relnode and forknum from the key. +3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will + be removed during image layer generation upon compaction. + +Note that "exists" and "deleted" will be encoded as a single byte as two variants of an enum. +The mapping is implemented as `rel_tag_sparse_key` in the PoC patch. + +## Changes to Sparse Keyspace + +Previously, we only used sparse keyspaces for the aux files, which did not carry over when branching. The reldir +information needs to be preserved from the parent branch to the child branch. Therefore, the read path needs +to be updated accordingly to accommodate such "inherited sparse keys". This is done in +[PR#10313](https://github.com/neondatabase/neon/pull/10313). + +## Coexistence of the Old and New Keyspaces + +Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the +ingestion path will start to write to the new keyspace and the old reldir data will be kept in the old one. The read +path needs to combine the data from both keyspaces. + +Theoretically, we could do a rewrite at the startup time that scans all relation directories and copies that data into the +new keyspace. However, this could take a long time, especially if we have thousands of tenants doing the migration +process simultaneously after the pageserver restarts.
Therefore, we propose the coexistence strategy so that the +migration can happen seamlessly and impose no potential downtime for the user. + +With the coexistence assumption, the 3 reldir operations will be implemented as follows: + +1. Check if a relation exists + - Check the new keyspace if the key maps to any value. If it maps to "exists" or "deleted", directly + return it to the user. + - Otherwise, deserialize the old reldir key and get the result. +2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key. + Combine them to obtain the final result. +3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation into the new keyspace. + - We assume no overwrite of relations will happen (i.e., the user won't create a relation at the same Oid). This will be implemented as a runtime check. + - For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace. + - For relation drop, we first check if the relation is recorded in the old keyspace. If yes, we deserialize the old reldir key, + remove the relation, and then write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` to the keyspace. + - The delete tombstone will be removed during image layer generation upon compaction. + +This process ensures that the transition will not introduce any downtime and all new updates are written to the new keyspace. The total +amount of data in the storage would be `O(relations_modifications)` and we can guarantee `O(current_relations)` after compaction. +There could be some relations that exist in the old reldir key for a long time. Refer to the "Full Migration" section on how to deal +with them. Plus, for relation modifications, it will have `O(old_relations)` complexity until we do the full migration, which gives +us `O(1)` complexity after fully opting in to the sparse keyspace. 
+ +The process also implies that a relation will only exist either in the old reldir key or in the new sparse keyspace. It is not possible +to have a table to be recorded in the old reldir key while later having a delete tombstone for it in the sparse keyspace at any LSN. + +We will introduce a config item and an index_part record to record the current status of the migration process. + +- Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace. +- `index_part.json` field `reldir_v2_status`: whether the timeline has written any key into the new reldir keyspace. + +If `enable_reldir_v2` is set to `true` and the timeline ingests the first key into the new reldir keyspace, it will update +`index_part.json` to set `reldir_v2_status` to `Status::Migrating`. Even if `enable_reldir_v2` gets flipped back to +`false` (i.e., when the pageserver restarts and such config isn't persisted), the read/write path will still +read/write to the new keyspace to avoid data inconsistency. This also indicates that the migration is one-way only: +once v2 is enabled, the user cannot go back to v1. + +## Next Steps + +### Full Migration + +This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and +v2 existing in the system would force us to keep the code to deserialize the old reldir key forever. To entirely deprecate this +code path, we must ensure the timeline has no old reldir data. + +We can trigger a special image layer generation process at the gc-horizon. The generated image layers will cover several keyspaces: +the old reldir key in each of the databases, and the new reldir sparse keyspace. It will remove the old reldir key while +copying them into the corresponding keys in the sparse keyspace in the resulting image. This special process happens in +the background during compaction. For example, assume this special process is triggered at LSN 0/180. 
The `create_image_layers` +process discovers the following keys at this LSN. + +```plain +db1/reldir_key -> (table 1, table 2, table 3) +...db1 rel keys +db2/reldir_key -> (table 4, table 5, table 6) +...db2 rel keys +sparse_reldir_db2_table7 -> exists +sparse_reldir_db1_table8 -> deleted +``` + +It will generate the following keys: + +```plain +db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`. +...db1 rel keys +db2/reldir_key -> () +...db2 rel keys + +-- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180 +sparse_reldir_db1_table1 -> exists +sparse_reldir_db1_table2 -> exists +sparse_reldir_db1_table3 -> exists +sparse_reldir_db2_table4 -> exists +sparse_reldir_db2_table5 -> exists +sparse_reldir_db2_table6 -> exists +sparse_reldir_db2_table7 -> exists +-- end image layer for the sparse keyspace at sparse_reldir_prefix+1 + +# The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace. +# Note that the read path will stop reading if a key is not found in the image layer covering the key range so there +# are no correctness issues. +``` + +We must verify that no pending modifications to the old reldir exist in the delta/image layers above the gc-horizon before +we start this process (We can do a vectored read to get the full key history of the old reldir key and ensure there are no more images +above the gc-horizon). Otherwise, it will violate the property that "a relation will only exist either in the old reldir key or +in the new sparse keyspace". After we run this migration process, we can mark `reldir_v2_status` in the `index_part.json` to +`Status::Migrated`, and the read path won't need to read from the old reldir anymore. Once the status is set to `Migrated`, we +don't need to add the key into `collect_keyspace` and therefore all of them will be removed from all future image layers. 
+ +The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code. + +### Consolidate Relation Size Keys + +We have relsize at the end of all relation nodes. + +```plain +// RelSize: +// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF +``` + +This means that computing logical size requires us to do several single-key gets across the keyspace, +potentially requiring downloading many layer files. We could consolidate them into a single +keyspace, improving logical size calculation performance. + +### Migrate DBDir Keys + +We assume the number of databases created by the users will be small, and therefore, the current way +of storing the database directory would be acceptable. In the future, we could also migrate DBDir keys into +the sparse keyspace to support a large number of databases. diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md index f7b0b3a587..094f8d5360 100644 --- a/docs/rfcs/README.md +++ b/docs/rfcs/README.md @@ -1,3 +1,7 @@ +# Neon RFCs + +## Overview + This directory contains Request for Comments documents, or RFCs, for features or concepts that have been proposed. Alternative names: technical design doc, ERD, one-pager @@ -59,37 +63,10 @@ RFC lifecycle: ### RFC template +Use the `YYYY-MM-DD-copy-me.md` template as a starting point. The timestamp prefix helps to avoid awkward 'id' collisions. + +```sh +cp docs/rfcs/YYYY-MM-DD-copy-me.md docs/rfcs/$(date +"%Y-%m-%d")-.md +``` + Note, a lot of the sections are marked as ‘if relevant’. They are included into the template as a reminder and to help inspiration. - -``` -# Name -Created on .. -Implemented on .. - -## Summary - -## Motivation - -## Non Goals (if relevant) - -## Impacted components (e.g. 
pageserver, safekeeper, console, etc) - -## Proposed implementation - -### Reliability, failure modes and corner cases (if relevant) - -### Interaction/Sequence diagram (if relevant) - -### Scalability (if relevant) - -### Security implications (if relevant) - -### Unresolved questions (if relevant) - -## Alternative implementation (if relevant) - -## Pros/cons of proposed approaches (if relevant) - -## Definition of Done (if relevant) - -``` diff --git a/docs/rfcs/YYYY-MM-DD-copy-me.md b/docs/rfcs/YYYY-MM-DD-copy-me.md new file mode 100644 index 0000000000..8487861e6b --- /dev/null +++ b/docs/rfcs/YYYY-MM-DD-copy-me.md @@ -0,0 +1,30 @@ +# Name + +Created on YYYY-MM-DD +Implemented on _TBD_ + +## Summary + +## Motivation + +## Non Goals (if relevant) + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +## Proposed implementation + +### Reliability, failure modes and corner cases (if relevant) + +### Interaction/Sequence diagram (if relevant) + +### Scalability (if relevant) + +### Security implications (if relevant) + +### Unresolved questions (if relevant) + +## Alternative implementation (if relevant) + +## Pros/cons of proposed approaches (if relevant) + +## Definition of Done (if relevant) diff --git a/docs/storage_controller.md b/docs/storage_controller.md index 6d2ef929a4..ac4aca4219 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -101,15 +101,25 @@ changes such as a pageserver node becoming unavailable, or the tenant's shard co postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver location changes. -The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires -JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request. +The hook is configured using the storage controller's `--control-plane-url` CLI option, from which the hook URL is computed. 
-In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems +Currently, there are two hooks, each computed by appending the name to the provided control plane URL prefix: + +- `notify-attach`, called whenever attachment for pageservers changes +- `notify-safekeepers`, called whenever attachment for safekeepers changes + +If the hooks require JWT auth, the token may be provided with `--control-plane-jwt-token`. +The hooks will be invoked with a `PUT` request. + +In the Neon cloud service, these hooks are implemented by Neon's internal cloud control plane. In `neon_local` systems, the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling the compute hook. -When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated: -the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience. +When implementing an on-premise Neon deployment, you must implement a service that handles the compute hooks. This is not complicated. + +### `notify-attach` body + +The `notify-attach` request body follows the format of the `ComputeHookNotifyRequest` structure, provided below for convenience. ``` struct ComputeHookNotifyRequestShard { @@ -128,15 +138,15 @@ When a notification is received: 1. Modify postgres configuration for this tenant: - - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The + - set `neon.pageserver_connstring` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The shards identified by `NodeId` must be converted to the address+port of the node. - - if stripe_size is not None, set `neon.stripe_size` to this value + - if stripe_size is not None, set `neon.shard_stripe_size` to this value 2. 
Send SIGHUP to postgres to reload configuration 3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller will retry the notification until it succeeds.. -### Example notification body +Example body: ``` { @@ -148,3 +158,34 @@ When a notification is received: ], } ``` + +### `notify-safekeepers` body + +The `notify-safekeepers` request body follows the format of the `SafekeepersNotifyRequest` structure, provided below for convenience. + +``` +pub struct SafekeeperInfo { + pub id: NodeId, + pub hostname: String, +} + +pub struct SafekeepersNotifyRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub generation: u32, + pub safekeepers: Vec<SafekeeperInfo>, +} +``` + +When a notification is received: + +1. Modify postgres configuration for this tenant: + + - set `neon.safekeeper_connstrings` to an array of postgres connection strings to safekeepers according to the `safekeepers` list. The + safekeepers identified by `NodeId` must be converted to the address+port of the respective safekeeper. + The hostname is provided for debugging purposes, so we reserve changes to how we pass it. + - set `neon.safekeepers_generation` to the provided `generation` value. + +2. Send SIGHUP to postgres to reload configuration +3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller + will retry the notification until it succeeds. 
\ No newline at end of file diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index c11a1b6688..81b0cd19a1 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -1,12 +1,13 @@ [package] name = "compute_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] anyhow.workspace = true chrono.workspace = true +indexmap.workspace = true jsonwebtoken.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 0c256cae2e..d88451c549 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,11 +1,10 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. -use crate::{ - privilege::Privilege, - responses::ComputeCtlConfig, - spec::{ComputeSpec, ExtVersion, PgIdent}, -}; use serde::{Deserialize, Serialize}; +use crate::privilege::Privilege; +use crate::responses::ComputeCtlConfig; +use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; + /// Request of the /configure API /// /// We now pass only `spec` in the configuration request, but later we can @@ -31,3 +30,9 @@ pub struct SetRoleGrantsRequest { pub privileges: Vec, pub role: PgIdent, } + +/// Request of the /configure_telemetry API +#[derive(Debug, Deserialize, Serialize)] +pub struct ConfigureTelemetryRequest { + pub logs_export_host: Option, +} diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index a6248019d9..c8f6019c5c 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -6,10 +6,8 @@ use chrono::{DateTime, Utc}; use jsonwebtoken::jwk::JwkSet; use serde::{Deserialize, Serialize, Serializer}; -use crate::{ - privilege::Privilege, - spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}, -}; +use crate::privilege::Privilege; +use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}; 
#[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { @@ -136,9 +134,12 @@ pub struct CatalogObjects { pub databases: Vec, } -#[derive(Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct ComputeCtlConfig { + /// Set of JSON web keys that the compute can use to authenticate + /// communication from the control plane. pub jwks: JwkSet, + pub tls: Option, } impl Default for ComputeCtlConfig { @@ -147,10 +148,17 @@ impl Default for ComputeCtlConfig { jwks: JwkSet { keys: Vec::default(), }, + tls: None, } } } +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TlsConfig { + pub key_path: String, + pub cert_path: String, +} + /// Response of the `/computes/{compute_id}/spec` control-plane API. #[derive(Deserialize, Debug)] pub struct ControlPlaneSpecResponse { diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 2e7f501d4b..11615b73a1 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -5,12 +5,14 @@ //! and connect it to the storage nodes. use std::collections::HashMap; +use indexmap::IndexMap; +use regex::Regex; +use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; -use regex::Regex; -use remote_storage::RemotePath; +use crate::responses::TlsConfig; /// String type alias representing Postgres identifier and /// intended to be used for DB / role names. @@ -102,6 +104,17 @@ pub struct ComputeSpec { pub timeline_id: Option, pub pageserver_connstring: Option, + /// Safekeeper membership config generation. It is put in + /// neon.safekeepers GUC and serves two purposes: + /// 1) Non zero value forces walproposer to use membership configurations. + /// 2) If walproposer wants to update list of safekeepers to connect to + /// taking them from some safekeeper mconf, it should check what value + /// is newer by comparing the generation. 
+ /// + /// Note: it could be SafekeeperGeneration, but this needs linking + /// compute_ctl with postgres_ffi. + #[serde(default)] + pub safekeepers_generation: Option, #[serde(default)] pub safekeeper_connstrings: Vec, @@ -115,7 +128,7 @@ pub struct ComputeSpec { // information about available remote extensions pub remote_extensions: Option, - pub pgbouncer_settings: Option>, + pub pgbouncer_settings: Option>, // Stripe size for pageserver sharding, in pages #[serde(default)] @@ -145,6 +158,16 @@ pub struct ComputeSpec { /// over the same replication content from publisher. #[serde(default)] // Default false pub drop_subscriptions_before_start: bool, + + /// Log level for audit logging: + /// + /// Disabled - no audit logging. This is the default. + /// log - log masked statements to the postgres log using pgaudit extension + /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension + /// + /// Extensions should be present in shared_preload_libraries + #[serde(default)] + pub audit_log_level: ComputeAudit, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -156,6 +179,9 @@ pub enum ComputeFeature { /// track short-lived connections as user activity. ActivityMonitorExperimental, + /// Allow to configure rsyslog for Postgres logs export + PostgresLogsExport, + /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. See unit test /// `parse_unknown_features()` for more details. 
@@ -249,6 +275,17 @@ pub enum ComputeMode { Replica, } +/// Log level for audit logging +/// Disabled, log, hipaa +/// Default is Disabled +#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +pub enum ComputeAudit { + #[default] + Disabled, + Log, + Hipaa, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct Cluster { pub cluster_id: Option, @@ -323,6 +360,9 @@ pub struct LocalProxySpec { #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub jwks: Option>, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub tls: Option, } #[derive(Clone, Debug, Deserialize, Serialize)] @@ -336,9 +376,10 @@ pub struct JwksSettings { #[cfg(test)] mod tests { - use super::*; use std::fs::File; + use super::*; + #[test] fn allow_installing_remote_extensions() { let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index 0e517e3856..77f130950e 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "consumption_metrics" version = "0.1.0" -edition = "2021" +edition = "2024" license = "Apache-2.0" [dependencies] diff --git a/libs/desim/src/chan.rs b/libs/desim/src/chan.rs index 6661d59871..8882cd3b56 100644 --- a/libs/desim/src/chan.rs +++ b/libs/desim/src/chan.rs @@ -1,4 +1,5 @@ -use std::{collections::VecDeque, sync::Arc}; +use std::collections::VecDeque; +use std::sync::Arc; use parking_lot::{Mutex, MutexGuard}; diff --git a/libs/desim/src/executor.rs b/libs/desim/src/executor.rs index 9d44bd7741..df8b071c06 100644 --- a/libs/desim/src/executor.rs +++ b/libs/desim/src/executor.rs @@ -1,11 +1,7 @@ -use std::{ - panic::AssertUnwindSafe, - sync::{ - atomic::{AtomicBool, AtomicU32, AtomicU8, Ordering}, - mpsc, Arc, OnceLock, - }, - thread::JoinHandle, -}; +use std::panic::AssertUnwindSafe; +use 
std::sync::atomic::{AtomicBool, AtomicU8, AtomicU32, Ordering}; +use std::sync::{Arc, OnceLock, mpsc}; +use std::thread::JoinHandle; use tracing::{debug, error, trace}; diff --git a/libs/desim/src/network.rs b/libs/desim/src/network.rs index e15a714daa..cf096dba80 100644 --- a/libs/desim/src/network.rs +++ b/libs/desim/src/network.rs @@ -1,26 +1,19 @@ -use std::{ - cmp::Ordering, - collections::{BinaryHeap, VecDeque}, - fmt::{self, Debug}, - ops::DerefMut, - sync::{mpsc, Arc}, -}; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, VecDeque}; +use std::fmt::{self, Debug}; +use std::ops::DerefMut; +use std::sync::{Arc, mpsc}; -use parking_lot::{ - lock_api::{MappedMutexGuard, MutexGuard}, - Mutex, RawMutex, -}; +use parking_lot::lock_api::{MappedMutexGuard, MutexGuard}; +use parking_lot::{Mutex, RawMutex}; use rand::rngs::StdRng; use tracing::debug; -use crate::{ - executor::{self, ThreadContext}, - options::NetworkOptions, - proto::NetEvent, - proto::NodeEvent, -}; - -use super::{chan::Chan, proto::AnyMessage}; +use super::chan::Chan; +use super::proto::AnyMessage; +use crate::executor::{self, ThreadContext}; +use crate::options::NetworkOptions; +use crate::proto::{NetEvent, NodeEvent}; pub struct NetworkTask { options: Arc, diff --git a/libs/desim/src/node_os.rs b/libs/desim/src/node_os.rs index 7744a9f5e1..e0cde7b284 100644 --- a/libs/desim/src/node_os.rs +++ b/libs/desim/src/node_os.rs @@ -2,14 +2,11 @@ use std::sync::Arc; use rand::Rng; +use super::chan::Chan; +use super::network::TCP; +use super::world::{Node, NodeId, World}; use crate::proto::NodeEvent; -use super::{ - chan::Chan, - network::TCP, - world::{Node, NodeId, World}, -}; - /// Abstraction with all functions (aka syscalls) available to the node. 
#[derive(Clone)] pub struct NodeOs { diff --git a/libs/desim/src/options.rs b/libs/desim/src/options.rs index 5da7c2c482..9b1a42fd28 100644 --- a/libs/desim/src/options.rs +++ b/libs/desim/src/options.rs @@ -1,4 +1,5 @@ -use rand::{rngs::StdRng, Rng}; +use rand::Rng; +use rand::rngs::StdRng; /// Describes random delays and failures. Delay will be uniformly distributed in [min, max]. /// Connection failure will occur with the probablity fail_prob. diff --git a/libs/desim/src/proto.rs b/libs/desim/src/proto.rs index 92a7e8a27d..31bc29e6a6 100644 --- a/libs/desim/src/proto.rs +++ b/libs/desim/src/proto.rs @@ -3,7 +3,8 @@ use std::fmt::Debug; use bytes::Bytes; use utils::lsn::Lsn; -use crate::{network::TCP, world::NodeId}; +use crate::network::TCP; +use crate::world::NodeId; /// Internal node events. #[derive(Debug)] diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs index 7ce605bda8..350d182cc3 100644 --- a/libs/desim/src/time.rs +++ b/libs/desim/src/time.rs @@ -1,12 +1,8 @@ -use std::{ - cmp::Ordering, - collections::BinaryHeap, - ops::DerefMut, - sync::{ - atomic::{AtomicU32, AtomicU64}, - Arc, - }, -}; +use std::cmp::Ordering; +use std::collections::BinaryHeap; +use std::ops::DerefMut; +use std::sync::Arc; +use std::sync::atomic::{AtomicU32, AtomicU64}; use parking_lot::Mutex; use tracing::trace; diff --git a/libs/desim/src/world.rs b/libs/desim/src/world.rs index 7d60be04b5..576ba89cd7 100644 --- a/libs/desim/src/world.rs +++ b/libs/desim/src/world.rs @@ -1,19 +1,18 @@ +use std::ops::DerefMut; +use std::sync::{Arc, mpsc}; + use parking_lot::Mutex; -use rand::{rngs::StdRng, SeedableRng}; -use std::{ - ops::DerefMut, - sync::{mpsc, Arc}, -}; +use rand::SeedableRng; +use rand::rngs::StdRng; -use crate::{ - executor::{ExternalHandle, Runtime}, - network::NetworkTask, - options::NetworkOptions, - proto::{NodeEvent, SimEvent}, - time::Timing, -}; - -use super::{chan::Chan, network::TCP, node_os::NodeOs}; +use super::chan::Chan; +use super::network::TCP; +use 
super::node_os::NodeOs; +use crate::executor::{ExternalHandle, Runtime}; +use crate::network::NetworkTask; +use crate::options::NetworkOptions; +use crate::proto::{NodeEvent, SimEvent}; +use crate::time::Timing; pub type NodeId = u32; diff --git a/libs/desim/tests/reliable_copy_test.rs b/libs/desim/tests/reliable_copy_test.rs index cf7bff8f5a..1ddf9844de 100644 --- a/libs/desim/tests/reliable_copy_test.rs +++ b/libs/desim/tests/reliable_copy_test.rs @@ -1,14 +1,15 @@ //! Simple test to verify that simulator is working. #[cfg(test)] mod reliable_copy_test { + use std::sync::Arc; + use anyhow::Result; use desim::executor::{self, PollSome}; + use desim::node_os::NodeOs; use desim::options::{Delay, NetworkOptions}; - use desim::proto::{NetEvent, NodeEvent, ReplCell}; + use desim::proto::{AnyMessage, NetEvent, NodeEvent, ReplCell}; use desim::world::{NodeId, World}; - use desim::{node_os::NodeOs, proto::AnyMessage}; use parking_lot::Mutex; - use std::sync::Arc; use tracing::info; /// Disk storage trait and implementation. 
diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index d72e4bd012..331ae4a9b8 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -6,11 +6,10 @@ license.workspace = true [dependencies] anyhow.workspace = true -backtrace.workspace = true bytes.workspace = true -inferno.workspace = true +camino.workspace = true fail.workspace = true -flate2.workspace = true +futures.workspace = true hyper0.workspace = true itertools.workspace = true jemalloc_pprof.workspace = true @@ -18,12 +17,14 @@ once_cell.workspace = true pprof.workspace = true regex.workspace = true routerify.workspace = true +rustls-pemfile.workspace = true serde.workspace = true serde_json.workspace = true serde_path_to_error.workspace = true thiserror.workspace = true tracing.workspace = true tokio.workspace = true +tokio-rustls.workspace = true tokio-util.workspace = true url.workspace = true uuid.workspace = true diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs index be97b341d1..5588f6d87e 100644 --- a/libs/http-utils/src/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -1,30 +1,28 @@ -use crate::error::{api_error_handler, route_error_handler, ApiError}; -use crate::pprof; -use crate::request::{get_query_param, parse_query_param}; -use ::pprof::protos::Message as _; -use ::pprof::ProfilerGuardBuilder; -use anyhow::{anyhow, Context}; -use bytes::{Bytes, BytesMut}; -use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION}; -use hyper::http::HeaderValue; -use hyper::Method; -use hyper::{header::CONTENT_TYPE, Body, Request, Response}; -use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; -use once_cell::sync::Lazy; -use regex::Regex; -use routerify::ext::RequestExt; -use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; -use tokio::sync::{mpsc, Mutex, Notify}; -use tokio_stream::wrappers::ReceiverStream; -use tokio_util::io::ReaderStream; -use tracing::{debug, info, info_span, warn, 
Instrument}; -use utils::auth::{AuthError, Claims, SwappableJwtAuth}; - use std::future::Future; use std::io::Write as _; use std::str::FromStr; use std::time::Duration; +use anyhow::{Context, anyhow}; +use bytes::{Bytes, BytesMut}; +use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName}; +use hyper::http::HeaderValue; +use hyper::{Body, Method, Request, Response}; +use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter}; +use once_cell::sync::Lazy; +use pprof::ProfilerGuardBuilder; +use pprof::protos::Message as _; +use routerify::ext::RequestExt; +use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio::sync::{Mutex, Notify, mpsc}; +use tokio_stream::wrappers::ReceiverStream; +use tokio_util::io::ReaderStream; +use tracing::{Instrument, debug, info, info_span, warn}; +use utils::auth::{AuthError, Claims, SwappableJwtAuth}; + +use crate::error::{ApiError, api_error_handler, route_error_handler}; +use crate::request::{get_query_param, parse_query_param}; + static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { register_int_counter!( "libmetrics_metric_handler_requests_total", @@ -375,7 +373,7 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A Err(_) => { return Err(ApiError::Conflict( "profiler already running (use ?force=true to cancel it)".into(), - )) + )); } } tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait @@ -401,12 +399,10 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A // Return the report in the requested format. match format { Format::Pprof => { - let mut body = Vec::new(); - report + let body = report .pprof() .map_err(|err| ApiError::InternalServerError(err.into()))? 
- .write_to_vec(&mut body) - .map_err(|err| ApiError::InternalServerError(err.into()))?; + .encode_to_vec(); Response::builder() .status(200) @@ -449,20 +445,6 @@ pub async fn profile_heap_handler(req: Request) -> Result, Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), }; - // Functions and mappings to strip when symbolizing pprof profiles. If true, - // also remove child frames. - static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { - vec![ - (Regex::new("^__rust").unwrap(), false), - (Regex::new("^_start$").unwrap(), false), - (Regex::new("^irallocx_prof").unwrap(), true), - (Regex::new("^prof_alloc_prep").unwrap(), true), - (Regex::new("^std::rt::lang_start").unwrap(), false), - (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), - ] - }); - const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"]; - // Obtain profiler handle. let mut prof_ctl = jemalloc_pprof::PROF_CTL .as_ref() @@ -495,52 +477,34 @@ pub async fn profile_heap_handler(req: Request) -> Result, } Format::Pprof => { - let data = tokio::task::spawn_blocking(move || { - let bytes = prof_ctl.dump_pprof()?; - // Symbolize the profile. - // TODO: consider moving this upstream to jemalloc_pprof and avoiding the - // serialization roundtrip. - let profile = pprof::decode(&bytes)?; - let profile = pprof::symbolize(profile)?; - let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); - pprof::encode(&profile) - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(ApiError::InternalServerError)?; + let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
+ .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "application/octet-stream") - .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb.gz\"") .body(Body::from(data)) .map_err(|err| ApiError::InternalServerError(err.into())) } Format::Svg => { - let body = tokio::task::spawn_blocking(move || { - let bytes = prof_ctl.dump_pprof()?; - let profile = pprof::decode(&bytes)?; - let profile = pprof::symbolize(profile)?; - let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); - let mut opts = inferno::flamegraph::Options::default(); - opts.title = "Heap inuse".to_string(); - opts.count_name = "bytes".to_string(); - pprof::flamegraph(profile, &mut opts) - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(ApiError::InternalServerError)?; + let svg = tokio::task::spawn_blocking(move || prof_ctl.dump_flamegraph()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
+ .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "image/svg+xml") - .body(Body::from(body)) + .body(Body::from(svg)) .map_err(|err| ApiError::InternalServerError(err.into())) } } } -pub fn add_request_id_middleware( -) -> Middleware { +pub fn add_request_id_middleware() +-> Middleware { Middleware::pre(move |req| async move { let request_id = match req.headers().get(&X_REQUEST_ID_HEADER) { Some(request_id) => request_id @@ -664,7 +628,7 @@ pub fn auth_middleware( None => { return Err(ApiError::Unauthorized( "missing authorization header".to_string(), - )) + )); } } } @@ -717,12 +681,14 @@ pub fn check_permission_with( #[cfg(test)] mod tests { - use super::*; - use hyper::service::Service; - use routerify::RequestServiceBuilder; use std::future::poll_fn; use std::net::{IpAddr, SocketAddr}; + use hyper::service::Service; + use routerify::RequestServiceBuilder; + + use super::*; + #[tokio::test] async fn test_request_id_returned() { let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap(); diff --git a/libs/http-utils/src/error.rs b/libs/http-utils/src/error.rs index 746305caec..f790dc26ca 100644 --- a/libs/http-utils/src/error.rs +++ b/libs/http-utils/src/error.rs @@ -1,10 +1,10 @@ -use hyper::{header, Body, Response, StatusCode}; -use serde::{Deserialize, Serialize}; use std::borrow::Cow; use std::error::Error as StdError; + +use hyper::{Body, Response, StatusCode, header}; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tracing::{error, info, warn}; - use utils::auth::AuthError; #[derive(Debug, Error)] diff --git a/libs/http-utils/src/failpoints.rs b/libs/http-utils/src/failpoints.rs index 8a1e0c8cf0..984823f4a9 100644 --- a/libs/http-utils/src/failpoints.rs +++ b/libs/http-utils/src/failpoints.rs @@ -1,12 +1,11 @@ -use crate::error::ApiError; -use crate::json::{json_request, json_response}; - use hyper::{Body, Request, Response, StatusCode}; use serde::{Deserialize, 
Serialize}; use tokio_util::sync::CancellationToken; - use utils::failpoint_support::apply_failpoint; +use crate::error::ApiError; +use crate::json::{json_request, json_response}; + pub type ConfigureFailpointsRequest = Vec; /// Information for configuring a single fail point diff --git a/libs/http-utils/src/json.rs b/libs/http-utils/src/json.rs index e53231f313..14ebac91e6 100644 --- a/libs/http-utils/src/json.rs +++ b/libs/http-utils/src/json.rs @@ -1,6 +1,6 @@ use anyhow::Context; use bytes::Buf; -use hyper::{header, Body, Request, Response, StatusCode}; +use hyper::{Body, Request, Response, StatusCode, header}; use serde::{Deserialize, Serialize}; use super::error::ApiError; diff --git a/libs/http-utils/src/lib.rs b/libs/http-utils/src/lib.rs index ae6a27aaa8..2bd0fe582f 100644 --- a/libs/http-utils/src/lib.rs +++ b/libs/http-utils/src/lib.rs @@ -2,11 +2,12 @@ pub mod endpoint; pub mod error; pub mod failpoints; pub mod json; -pub mod pprof; pub mod request; +pub mod server; +pub mod tls_certs; extern crate hyper0 as hyper; /// Current fast way to apply simple http routing in various Neon binaries. /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed. -pub use routerify::{ext::RequestExt, RouterBuilder, RouterService}; +pub use routerify::{RequestServiceBuilder, RouterBuilder, RouterService, ext::RequestExt}; diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs deleted file mode 100644 index fe1cc10838..0000000000 --- a/libs/http-utils/src/pprof.rs +++ /dev/null @@ -1,238 +0,0 @@ -use anyhow::bail; -use flate2::write::{GzDecoder, GzEncoder}; -use flate2::Compression; -use itertools::Itertools as _; -use pprof::protos::{Function, Line, Location, Message as _, Profile}; -use regex::Regex; - -use std::borrow::Cow; -use std::collections::{HashMap, HashSet}; -use std::ffi::c_void; -use std::io::Write as _; - -/// Decodes a gzip-compressed Protobuf-encoded pprof profile. 
-pub fn decode(bytes: &[u8]) -> anyhow::Result { - let mut gz = GzDecoder::new(Vec::new()); - gz.write_all(bytes)?; - Ok(Profile::parse_from_bytes(&gz.finish()?)?) -} - -/// Encodes a pprof profile as gzip-compressed Protobuf. -pub fn encode(profile: &Profile) -> anyhow::Result> { - let mut gz = GzEncoder::new(Vec::new(), Compression::default()); - profile.write_to_writer(&mut gz)?; - Ok(gz.finish()?) -} - -/// Symbolizes a pprof profile using the current binary. -pub fn symbolize(mut profile: Profile) -> anyhow::Result { - if !profile.function.is_empty() { - return Ok(profile); // already symbolized - } - - // Collect function names. - let mut functions: HashMap = HashMap::new(); - let mut strings: HashMap = profile - .string_table - .into_iter() - .enumerate() - .map(|(i, s)| (s, i as i64)) - .collect(); - - // Helper to look up or register a string. - let mut string_id = |s: &str| -> i64 { - // Don't use .entry() to avoid unnecessary allocations. - if let Some(id) = strings.get(s) { - return *id; - } - let id = strings.len() as i64; - strings.insert(s.to_string(), id); - id - }; - - for loc in &mut profile.location { - if !loc.line.is_empty() { - continue; - } - - // Resolve the line and function for each location. 
- backtrace::resolve(loc.address as *mut c_void, |symbol| { - let Some(symbol_name) = symbol.name() else { - return; - }; - - let function_name = format!("{symbol_name:#}"); - let functions_len = functions.len(); - let function_id = functions - .entry(function_name) - .or_insert_with_key(|function_name| { - let function_id = functions_len as u64 + 1; - let system_name = String::from_utf8_lossy(symbol_name.as_bytes()); - let filename = symbol - .filename() - .map(|path| path.to_string_lossy()) - .unwrap_or(Cow::Borrowed("")); - Function { - id: function_id, - name: string_id(function_name), - system_name: string_id(&system_name), - filename: string_id(&filename), - ..Default::default() - } - }) - .id; - loc.line.push(Line { - function_id, - line: symbol.lineno().unwrap_or(0) as i64, - ..Default::default() - }); - }); - } - - // Store the resolved functions, and mark the mapping as resolved. - profile.function = functions.into_values().sorted_by_key(|f| f.id).collect(); - profile.string_table = strings - .into_iter() - .sorted_by_key(|(_, i)| *i) - .map(|(s, _)| s) - .collect(); - - for mapping in &mut profile.mapping { - mapping.has_functions = true; - mapping.has_filenames = true; - } - - Ok(profile) -} - -/// Strips locations (stack frames) matching the given mappings (substring) or function names -/// (regex). The function bool specifies whether child frames should be stripped as well. -/// -/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all -/// string references. -pub fn strip_locations( - mut profile: Profile, - mappings: &[&str], - functions: &[(Regex, bool)], -) -> Profile { - // Strip mappings. 
- let mut strip_mappings: HashSet = HashSet::new(); - - profile.mapping.retain(|mapping| { - let Some(name) = profile.string_table.get(mapping.filename as usize) else { - return true; - }; - if mappings.iter().any(|substr| name.contains(substr)) { - strip_mappings.insert(mapping.id); - return false; - } - true - }); - - // Strip functions. - let mut strip_functions: HashMap = HashMap::new(); - - profile.function.retain(|function| { - let Some(name) = profile.string_table.get(function.name as usize) else { - return true; - }; - for (regex, strip_children) in functions { - if regex.is_match(name) { - strip_functions.insert(function.id, *strip_children); - return false; - } - } - true - }); - - // Strip locations. The bool specifies whether child frames should be stripped too. - let mut strip_locations: HashMap = HashMap::new(); - - profile.location.retain(|location| { - for line in &location.line { - if let Some(strip_children) = strip_functions.get(&line.function_id) { - strip_locations.insert(location.id, *strip_children); - return false; - } - } - if strip_mappings.contains(&location.mapping_id) { - strip_locations.insert(location.id, false); - return false; - } - true - }); - - // Strip sample locations. - for sample in &mut profile.sample { - // First, find the uppermost function with child removal and truncate the stack. - if let Some(truncate) = sample - .location_id - .iter() - .rposition(|id| strip_locations.get(id) == Some(&true)) - { - sample.location_id.drain(..=truncate); - } - // Next, strip any individual frames without child removal. - sample - .location_id - .retain(|id| !strip_locations.contains_key(id)); - } - - profile -} - -/// Generates an SVG flamegraph from a symbolized pprof profile. -pub fn flamegraph( - profile: Profile, - opts: &mut inferno::flamegraph::Options, -) -> anyhow::Result> { - if profile.mapping.iter().any(|m| !m.has_functions) { - bail!("profile not symbolized"); - } - - // Index locations, functions, and strings. 
- let locations: HashMap = - profile.location.into_iter().map(|l| (l.id, l)).collect(); - let functions: HashMap = - profile.function.into_iter().map(|f| (f.id, f)).collect(); - let strings = profile.string_table; - - // Resolve stacks as function names, and sum sample values per stack. Also reverse the stack, - // since inferno expects it bottom-up. - let mut stacks: HashMap, i64> = HashMap::new(); - for sample in profile.sample { - let mut stack = Vec::with_capacity(sample.location_id.len()); - for location in sample.location_id.into_iter().rev() { - let Some(location) = locations.get(&location) else { - bail!("missing location {location}"); - }; - for line in location.line.iter().rev() { - let Some(function) = functions.get(&line.function_id) else { - bail!("missing function {}", line.function_id); - }; - let Some(name) = strings.get(function.name as usize) else { - bail!("missing string {}", function.name); - }; - stack.push(name.as_str()); - } - } - let Some(&value) = sample.value.first() else { - bail!("missing value"); - }; - *stacks.entry(stack).or_default() += value; - } - - // Construct stack lines for inferno. - let lines = stacks - .into_iter() - .map(|(stack, value)| (stack.into_iter().join(";"), value)) - .map(|(stack, value)| format!("{stack} {value}")) - .sorted() - .collect_vec(); - - // Construct the flamegraph. 
- let mut bytes = Vec::new(); - let lines = lines.iter().map(|line| line.as_str()); - inferno::flamegraph::from_lines(opts, lines, &mut bytes)?; - Ok(bytes) -} diff --git a/libs/http-utils/src/request.rs b/libs/http-utils/src/request.rs index 7ea71685ec..9024a90a82 100644 --- a/libs/http-utils/src/request.rs +++ b/libs/http-utils/src/request.rs @@ -1,10 +1,13 @@ use core::fmt; -use std::{borrow::Cow, str::FromStr}; +use std::borrow::Cow; +use std::str::FromStr; + +use anyhow::anyhow; +use hyper::body::HttpBody; +use hyper::{Body, Request}; +use routerify::ext::RequestExt; use super::error::ApiError; -use anyhow::anyhow; -use hyper::{body::HttpBody, Body, Request}; -use routerify::ext::RequestExt; pub fn get_request_param<'a>( request: &'a Request, diff --git a/libs/http-utils/src/server.rs b/libs/http-utils/src/server.rs new file mode 100644 index 0000000000..33e4915e99 --- /dev/null +++ b/libs/http-utils/src/server.rs @@ -0,0 +1,155 @@ +use std::{error::Error, sync::Arc}; + +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use hyper0::Body; +use hyper0::server::conn::Http; +use routerify::{RequestService, RequestServiceBuilder}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; +use tracing::{error, info}; + +use crate::error::ApiError; + +/// A simple HTTP server over hyper library. +/// You may want to use it instead of [`hyper0::server::Server`] because: +/// 1. hyper0's Server was removed from hyper v1. +/// It's recommended to replace hyepr0's Server with a manual loop, which is done here. +/// 2. hyper0's Server doesn't support TLS out of the box, and there is no way +/// to support it efficiently with the Accept trait that hyper0's Server uses. +/// That's one of the reasons why it was removed from v1. 
+/// +pub struct Server { + request_service: Arc>, + listener: tokio::net::TcpListener, + tls_acceptor: Option, +} + +impl Server { + pub fn new( + request_service: Arc>, + listener: std::net::TcpListener, + tls_acceptor: Option, + ) -> anyhow::Result { + // Note: caller of from_std is responsible for setting nonblocking mode. + listener.set_nonblocking(true)?; + let listener = tokio::net::TcpListener::from_std(listener)?; + + Ok(Self { + request_service, + listener, + tls_acceptor, + }) + } + + pub async fn serve(self, cancel: CancellationToken) -> anyhow::Result<()> { + fn suppress_io_error(err: &std::io::Error) -> bool { + use std::io::ErrorKind::*; + matches!(err.kind(), ConnectionReset | ConnectionAborted | BrokenPipe) + } + fn suppress_hyper_error(err: &hyper0::Error) -> bool { + if err.is_incomplete_message() || err.is_closed() || err.is_timeout() { + return true; + } + if let Some(inner) = err.source() { + if let Some(io) = inner.downcast_ref::() { + return suppress_io_error(io); + } + } + false + } + + let mut connections = FuturesUnordered::new(); + loop { + tokio::select! { + stream = self.listener.accept() => { + let (tcp_stream, remote_addr) = match stream { + Ok(stream) => stream, + Err(err) => { + if !suppress_io_error(&err) { + info!("Failed to accept TCP connection: {err:#}"); + } + continue; + } + }; + + let service = self.request_service.build(remote_addr); + let tls_acceptor = self.tls_acceptor.clone(); + let cancel = cancel.clone(); + + connections.push(tokio::spawn( + async move { + match tls_acceptor { + Some(tls_acceptor) => { + // Handle HTTPS connection. + let tls_stream = tokio::select! 
{ + tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream, + _ = cancel.cancelled() => return, + }; + let tls_stream = match tls_stream { + Ok(tls_stream) => tls_stream, + Err(err) => { + if !suppress_io_error(&err) { + info!("Failed to accept TLS connection: {err:#}"); + } + return; + } + }; + if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await { + if !suppress_hyper_error(&err) { + info!("Failed to serve HTTPS connection: {err:#}"); + } + } + } + None => { + // Handle HTTP connection. + if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await { + if !suppress_hyper_error(&err) { + info!("Failed to serve HTTP connection: {err:#}"); + } + } + } + }; + })); + } + Some(conn) = connections.next() => { + if let Err(err) = conn { + error!("Connection panicked: {err:#}"); + } + } + _ = cancel.cancelled() => { + // Wait for graceful shutdown of all connections. + while let Some(conn) = connections.next().await { + if let Err(err) = conn { + error!("Connection panicked: {err:#}"); + } + } + break; + } + } + } + Ok(()) + } + + /// Serves HTTP connection with graceful shutdown. + async fn serve_connection( + io: I, + service: RequestService, + cancel: CancellationToken, + ) -> Result<(), hyper0::Error> + where + I: AsyncRead + AsyncWrite + Unpin + Send + 'static, + { + let mut conn = Http::new().serve_connection(io, service).with_upgrades(); + + tokio::select! { + res = &mut conn => res, + _ = cancel.cancelled() => { + Pin::new(&mut conn).graceful_shutdown(); + // Note: connection should still be awaited for graceful shutdown to complete. 
+ conn.await + } + } + } +} diff --git a/libs/http-utils/src/tls_certs.rs b/libs/http-utils/src/tls_certs.rs new file mode 100644 index 0000000000..db9ec825ed --- /dev/null +++ b/libs/http-utils/src/tls_certs.rs @@ -0,0 +1,21 @@ +use camino::Utf8Path; +use tokio_rustls::rustls::pki_types::{CertificateDer, PrivateKeyDer}; + +pub fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result>> { + let file = std::fs::File::open(filename)?; + let mut reader = std::io::BufReader::new(file); + + Ok(rustls_pemfile::certs(&mut reader).collect::, _>>()?) +} + +pub fn load_private_key(filename: &Utf8Path) -> anyhow::Result> { + let file = std::fs::File::open(filename)?; + let mut reader = std::io::BufReader::new(file); + + let key = rustls_pemfile::private_key(&mut reader)?; + + key.ok_or(anyhow::anyhow!( + "no private key found in {}", + filename.as_str(), + )) +} diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index 723916a742..93f6a2b7cc 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -6,17 +6,15 @@ //! Probabilistic cardinality estimators, such as the HyperLogLog algorithm, //! use significantly less memory than this, but can only approximate the cardinality. -use std::{ - hash::{BuildHasher, BuildHasherDefault, Hash}, - sync::atomic::AtomicU8, -}; +use std::hash::{BuildHasher, BuildHasherDefault, Hash}; +use std::sync::atomic::AtomicU8; -use measured::{ - label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, - metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec}, - text::TextEncoder, - LabelGroup, -}; +use measured::LabelGroup; +use measured::label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}; +use measured::metric::counter::CounterState; +use measured::metric::name::MetricNameEncoder; +use measured::metric::{Metric, MetricType, MetricVec}; +use measured::text::TextEncoder; use twox_hash::xxh3; /// Create an [`HyperLogLogVec`] and registers to default registry. 
@@ -27,9 +25,7 @@ macro_rules! register_hll_vec { $crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec) }}; - ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ - $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) - }}; + ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) }}; } /// Create an [`HyperLogLog`] and registers to default registry. @@ -40,9 +36,7 @@ macro_rules! register_hll { $crate::register(Box::new(hll.clone())).map(|_| hll) }}; - ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{ - $crate::register_hll!($N, $crate::opts!($NAME, $HELP)) - }}; + ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{ $crate::register_hll!($N, $crate::opts!($NAME, $HELP)) }}; } /// HLL is a probabilistic cardinality measure. @@ -195,8 +189,10 @@ impl measured::metric::MetricEncoding); diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 0f6c2a0937..4df8d7bc51 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,38 +4,26 @@ //! a default registry. 
#![deny(clippy::undocumented_unsafe_blocks)] -use measured::{ - label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, - metric::{ - counter::CounterState, - gauge::GaugeState, - group::Encoding, - name::{MetricName, MetricNameEncoder}, - MetricEncoding, MetricFamilyEncoding, - }, - FixedCardinalityLabel, LabelGroup, MetricGroup, -}; +use measured::label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}; +use measured::metric::counter::CounterState; +use measured::metric::gauge::GaugeState; +use measured::metric::group::Encoding; +use measured::metric::name::{MetricName, MetricNameEncoder}; +use measured::metric::{MetricEncoding, MetricFamilyEncoding}; +use measured::{FixedCardinalityLabel, LabelGroup, MetricGroup}; use once_cell::sync::Lazy; +use prometheus::Registry; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, }; pub use prometheus::local::LocalHistogram; -pub use prometheus::opts; -pub use prometheus::register; -pub use prometheus::Error; -use prometheus::Registry; -pub use prometheus::{core, default_registry, proto}; -pub use prometheus::{exponential_buckets, linear_buckets}; -pub use prometheus::{register_counter_vec, Counter, CounterVec}; -pub use prometheus::{register_gauge, Gauge}; -pub use prometheus::{register_gauge_vec, GaugeVec}; -pub use prometheus::{register_histogram, Histogram}; -pub use prometheus::{register_histogram_vec, HistogramVec}; -pub use prometheus::{register_int_counter, IntCounter}; -pub use prometheus::{register_int_counter_vec, IntCounterVec}; -pub use prometheus::{register_int_gauge, IntGauge}; -pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; -pub use prometheus::{Encoder, TextEncoder}; +pub use prometheus::{ + Counter, CounterVec, Encoder, Error, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, + IntCounterVec, IntGauge, IntGaugeVec, TextEncoder, core, default_registry, exponential_buckets, + linear_buckets, opts, proto, 
register, register_counter_vec, register_gauge, + register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, +}; pub mod launch_timestamp; mod wrappers; diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 79da05da6c..87dfdfb5ec 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "pageserver_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 1aff5a7012..b12ef65780 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -9,19 +9,18 @@ pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +use std::collections::HashMap; +use std::num::{NonZeroU64, NonZeroUsize}; +use std::str::FromStr; +use std::time::Duration; + use postgres_backend::AuthType; use remote_storage::RemoteStorageConfig; use serde_with::serde_as; -use std::{ - collections::HashMap, - num::{NonZeroU64, NonZeroUsize}, - str::FromStr, - time::Duration, -}; -use utils::{logging::LogFormat, postgres_client::PostgresClientProtocol}; +use utils::logging::LogFormat; +use utils::postgres_client::PostgresClientProtocol; -use crate::models::ImageCompressionAlgorithm; -use crate::models::LsnLease; +use crate::models::{ImageCompressionAlgorithm, LsnLease}; // Certain metadata (e.g. externally-addressable name, AZ) is delivered // as a separate structure. 
This information is not neeed by the pageserver @@ -36,6 +35,7 @@ pub struct NodeMetadata { pub postgres_port: u16, pub http_host: String, pub http_port: u16, + pub https_port: Option, // Deployment tools may write fields to the metadata file beyond what we // use in this type: this type intentionally only names fields that require. @@ -58,6 +58,9 @@ pub struct ConfigToml { // types mapped 1:1 into the runtime PageServerConfig type pub listen_pg_addr: String, pub listen_http_addr: String, + pub listen_https_addr: Option, + pub ssl_key_file: Utf8PathBuf, + pub ssl_cert_file: Utf8PathBuf, pub availability_zone: Option, #[serde(with = "humantime_serde")] pub wait_lsn_timeout: Duration, @@ -124,6 +127,10 @@ pub struct ConfigToml { pub enable_read_path_debugging: Option, #[serde(skip_serializing_if = "Option::is_none")] pub validate_wal_contiguity: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub load_previous_heatmap: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub generate_unarchival_heatmap: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -265,15 +272,16 @@ pub struct TenantConfigToml { /// size exceeds `compaction_upper_limit * checkpoint_distance`. pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, - /// If true, compact down L0 across all tenant timelines before doing regular compaction. + /// If true, compact down L0 across all tenant timelines before doing regular compaction. L0 + /// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true. pub compaction_l0_first: bool, /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only - /// has an effect if `compaction_l0_first` is `true`. + /// has an effect if `compaction_l0_first` is true. Defaults to true. 
pub compaction_l0_semaphore: bool, - /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, - /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer - /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification - /// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default. + /// Level0 delta layer threshold at which to delay layer flushes such that they take 2x as long, + /// and block on layer flushes during ephemeral layer rolls, for compaction backpressure. This + /// helps compaction keep up with WAL ingestion, and avoids read amplification blowing up. + /// Should be >compaction_threshold. 0 to disable. Defaults to 3x compaction_threshold. pub l0_flush_delay_threshold: Option, /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold /// to avoid deadlock. 0 to disable. Disabled by default. @@ -281,6 +289,8 @@ pub struct TenantConfigToml { /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next /// layer. This is a temporary backpressure mechanism which should be removed once /// l0_flush_{delay,stall}_threshold is fully enabled. + /// + /// TODO: this is no longer enabled, remove it when the config option is no longer set. pub l0_flush_wait_upload: bool, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. 
@@ -367,10 +377,10 @@ pub struct TenantConfigToml { } pub mod defaults { - use crate::models::ImageCompressionAlgorithm; - pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; + use crate::models::ImageCompressionAlgorithm; + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; @@ -418,6 +428,9 @@ pub mod defaults { pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol = utils::postgres_client::PostgresClientProtocol::Vanilla; + + pub const DEFAULT_SSL_KEY_FILE: &str = "server.key"; + pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt"; } impl Default for ConfigToml { @@ -427,6 +440,9 @@ impl Default for ConfigToml { Self { listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()), listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()), + listen_https_addr: (None), + ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE), + ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE), availability_zone: (None), wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) .expect("cannot parse default wait lsn timeout")), @@ -524,6 +540,8 @@ impl Default for ConfigToml { None }, validate_wal_contiguity: None, + load_previous_heatmap: None, + generate_unarchival_heatmap: None, } } } @@ -552,13 +570,15 @@ pub mod tenant_conf_defaults { // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So // with this config, we can get a maximum peak compaction usage of 9 GB. pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20; - pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; + // Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid + // read amp. 
+ pub const DEFAULT_COMPACTION_L0_FIRST: bool = true; pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; - pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true; + pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; @@ -569,9 +589,8 @@ pub mod tenant_conf_defaults { pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image - // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we - // want to enable this feature. - pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0; + // layer creation will end immediately. Set to 0 to disable. + pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; @@ -584,7 +603,7 @@ pub mod tenant_conf_defaults { // image layers should be created. 
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; - pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 10240000; + pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; } diff --git a/libs/pageserver_api/src/config/tests.rs b/libs/pageserver_api/src/config/tests.rs index edeefc156e..9e61873273 100644 --- a/libs/pageserver_api/src/config/tests.rs +++ b/libs/pageserver_api/src/config/tests.rs @@ -16,6 +16,30 @@ fn test_node_metadata_v1_backward_compatibilty() { postgres_port: 23, http_host: "localhost".to_string(), http_port: 42, + https_port: None, + other: HashMap::new(), + } + ) +} + +#[test] +fn test_node_metadata_v2_backward_compatibilty() { + let v2 = serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": 23, + "http_host": "localhost", + "http_port": 42, + "https_port": 123, + })); + + assert_eq!( + serde_json::from_slice::(&v2.unwrap()).unwrap(), + NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: 23, + http_host: "localhost".to_string(), + http_port: 42, + https_port: Some(123), other: HashMap::new(), } ) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index f94bfab581..3cb62f9d18 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -9,11 +9,8 @@ use std::time::{Duration, Instant}; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId}; -use crate::models::PageserverUtilization; -use crate::{ - models::{ShardParameters, TenantConfig}, - shard::{ShardStripeSize, TenantShardId}, -}; +use crate::models::{PageserverUtilization, ShardParameters, TenantConfig}; +use crate::shard::{ShardStripeSize, TenantShardId}; #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] @@ -185,20 +182,66 @@ pub struct TenantDescribeResponseShard 
{ #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateRequest { pub node_id: NodeId, + + /// Optionally, callers may specify the node they are migrating _from_, and the server will + /// reject the request if the shard is no longer attached there: this enables writing safer + /// clients that don't risk fighting with some other movement of the shard. #[serde(default)] - pub migration_config: Option, + pub origin_node_id: Option, + + #[serde(default)] + pub migration_config: MigrationConfig, } -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] pub struct MigrationConfig { + /// If true, the migration will be executed even if it is to a location with a sub-optimal scheduling + /// score: this is usually not what you want, and if you use this then you'll also need to set the + /// tenant's scheduling policy to Essential or Pause to avoid the optimiser reverting your migration. + /// + /// Default: false + #[serde(default)] + pub override_scheduler: bool, + + /// If true, the migration will be done gracefully by creating a secondary location first and + /// waiting for it to warm up before cutting over. If false, if there is no existing secondary + /// location at the destination, the tenant will be migrated immediately. If the tenant's data + /// can't be downloaded within [`Self::secondary_warmup_timeout`], then the migration will go + /// ahead but run with a cold cache that can severely reduce performance until it warms up. + /// + /// When doing a graceful migration, the migration API returns as soon as it is started. 
+ /// + /// Default: true + #[serde(default = "default_prewarm")] + pub prewarm: bool, + + /// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait + /// overall for secondary warmup before cutting over #[serde(default)] #[serde(with = "humantime_serde")] pub secondary_warmup_timeout: Option, + /// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait + /// within each secondary download poll call to pageserver. #[serde(default)] #[serde(with = "humantime_serde")] pub secondary_download_request_timeout: Option, } +fn default_prewarm() -> bool { + true +} + +impl Default for MigrationConfig { + fn default() -> Self { + Self { + override_scheduler: false, + prewarm: default_prewarm(), + secondary_warmup_timeout: None, + secondary_download_request_timeout: None, + } + } +} + #[derive(Serialize, Clone, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { @@ -354,7 +397,7 @@ impl FromStr for SkSchedulingPolicy { _ => { return Err(anyhow::anyhow!( "Unknown scheduling policy '{s}', try active,pause,decomissioned" - )) + )); } }) } @@ -446,6 +489,7 @@ pub struct SafekeeperDescribeResponse { pub host: String, pub port: i32, pub http_port: i32, + pub https_port: Option, pub availability_zone_id: String, pub scheduling_policy: SkSchedulingPolicy, } @@ -457,9 +501,10 @@ pub struct SafekeeperSchedulingPolicyRequest { #[cfg(test)] mod test { - use super::*; use serde_json; + use super::*; + /// Check stability of PlacementPolicy's serialization #[test] fn placement_policy_encoding() -> anyhow::Result<()> { @@ -489,4 +534,43 @@ mod test { err ); } + + /// Check that a minimal migrate request with no config results in the expected default settings + #[test] + fn test_migrate_request_decode_defaults() { + let json = r#"{ + "node_id": 123 + }"#; + + let request: TenantShardMigrateRequest = serde_json::from_str(json).unwrap(); + assert_eq!(request.node_id, 
NodeId(123)); + assert_eq!(request.origin_node_id, None); + assert!(!request.migration_config.override_scheduler); + assert!(request.migration_config.prewarm); + assert_eq!(request.migration_config.secondary_warmup_timeout, None); + assert_eq!( + request.migration_config.secondary_download_request_timeout, + None + ); + } + + /// Check that a partially specified migration config results in the expected default settings + #[test] + fn test_migration_config_decode_defaults() { + // Specify just one field of the config + let json = r#"{ + }"#; + + let config: MigrationConfig = serde_json::from_str(json).unwrap(); + + // Check each field's expected default value + assert!(!config.override_scheduler); + assert!(config.prewarm); + assert_eq!(config.secondary_warmup_timeout, None); + assert_eq!(config.secondary_download_request_timeout, None); + assert_eq!(config.secondary_warmup_timeout, None); + + // Consistency check that the Default impl agrees with our serde defaults + assert_eq!(MigrationConfig::default(), config); + } } diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index b88a2e46a1..8836e7ec87 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,11 +1,12 @@ -use anyhow::{bail, Result}; -use byteorder::{ByteOrder, BE}; +use std::fmt; +use std::ops::Range; + +use anyhow::{Result, bail}; +use byteorder::{BE, ByteOrder}; use bytes::Bytes; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::Oid; -use postgres_ffi::RepOriginId; +use postgres_ffi::{Oid, RepOriginId}; use serde::{Deserialize, Serialize}; -use std::{fmt, ops::Range}; use utils::const_assert; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -954,25 +955,22 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; - use crate::key::is_metadata_key_slice; - use crate::key::Key; - - use rand::Rng; - use rand::SeedableRng; + use rand::{Rng, SeedableRng}; use super::AUX_KEY_PREFIX; + use 
crate::key::{Key, is_metadata_key_slice}; #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let key = Key { - field1: rng.gen(), - field2: rng.gen(), - field3: rng.gen(), - field4: rng.gen(), - field5: rng.gen(), - field6: rng.gen(), + field1: rng.r#gen(), + field2: rng.r#gen(), + field3: rng.r#gen(), + field4: rng.r#gen(), + field5: rng.r#gen(), + field6: rng.r#gen(), }; assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index c55b9e9484..e505f23e49 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,11 +1,10 @@ -use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::{ - key::Key, - shard::{ShardCount, ShardIdentity}, -}; use itertools::Itertools; +use postgres_ffi::BLCKSZ; + +use crate::key::Key; +use crate::shard::{ShardCount, ShardIdentity}; /// /// Represents a set of Keys, in a compact form. @@ -609,15 +608,13 @@ pub fn singleton_range(key: Key) -> Range { #[cfg(test)] mod tests { + use std::fmt::Write; + use rand::{RngCore, SeedableRng}; - use crate::{ - models::ShardParameters, - shard::{ShardCount, ShardNumber}, - }; - use super::*; - use std::fmt::Write; + use crate::models::ShardParameters; + use crate::shard::{ShardCount, ShardNumber}; // Helper function to create a key range. 
// diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 1164048229..4a8f75413c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,38 +2,30 @@ pub mod detach_ancestor; pub mod partitioning; pub mod utilization; -#[cfg(feature = "testing")] -use camino::Utf8PathBuf; -pub use utilization::PageserverUtilization; - use core::ops::Range; -use std::{ - collections::HashMap, - fmt::Display, - io::{BufRead, Read}, - num::{NonZeroU32, NonZeroU64, NonZeroUsize}, - str::FromStr, - time::{Duration, SystemTime}, -}; +use std::collections::HashMap; +use std::fmt::Display; +use std::io::{BufRead, Read}; +use std::num::{NonZeroU32, NonZeroU64, NonZeroUsize}; +use std::str::FromStr; +use std::time::{Duration, SystemTime}; use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +#[cfg(feature = "testing")] +use camino::Utf8PathBuf; use postgres_ffi::BLCKSZ; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_with::serde_as; -use utils::{ - completion, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, - postgres_client::PostgresClientProtocol, - serde_system_time, -}; +pub use utilization::PageserverUtilization; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::postgres_client::PostgresClientProtocol; +use utils::{completion, serde_system_time}; -use crate::{ - key::{CompactKey, Key}, - reltag::RelTag, - shard::{ShardCount, ShardStripeSize, TenantShardId}, -}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; +use crate::key::{CompactKey, Key}; +use crate::reltag::RelTag; +use crate::shard::{ShardCount, ShardStripeSize, TenantShardId}; /// The state of a tenant in this pageserver. /// @@ -184,6 +176,39 @@ impl LsnLease { } } +/// Controls the detach ancestor behavior. +/// - When set to `NoAncestorAndReparent`, we will only detach a branch if its ancestor is a root branch. 
It will automatically reparent any children of the ancestor before and at the branch point. +/// - When set to `MultiLevelAndNoReparent`, we will detach a branch from multiple levels of ancestors, and no reparenting will happen at all. +#[derive(Debug, Clone, Copy, Default)] +pub enum DetachBehavior { + #[default] + NoAncestorAndReparent, + MultiLevelAndNoReparent, +} + +impl std::str::FromStr for DetachBehavior { + type Err = &'static str; + + fn from_str(s: &str) -> Result { + match s { + "no_ancestor_and_reparent" => Ok(DetachBehavior::NoAncestorAndReparent), + "multi_level_and_no_reparent" => Ok(DetachBehavior::MultiLevelAndNoReparent), + "v1" => Ok(DetachBehavior::NoAncestorAndReparent), + "v2" => Ok(DetachBehavior::MultiLevelAndNoReparent), + _ => Err("cannot parse detach behavior"), + } + } +} + +impl std::fmt::Display for DetachBehavior { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DetachBehavior::NoAncestorAndReparent => write!(f, "no_ancestor_and_reparent"), + DetachBehavior::MultiLevelAndNoReparent => write!(f, "multi_level_and_no_reparent"), + } + } +} + /// The only [`TenantState`] variants we could be `TenantState::Activating` from. /// /// XXX: We used to have more variants here, but now it's just one, which makes this rather @@ -282,6 +307,31 @@ pub struct TimelineCreateRequest { pub mode: TimelineCreateRequestMode, } +/// Storage controller specific extensions to [`TimelineInfo`]. +#[derive(Serialize, Deserialize, Clone)] +pub struct TimelineCreateResponseStorcon { + #[serde(flatten)] + pub timeline_info: TimelineInfo, + + pub safekeepers: Option, +} + +/// Safekeepers as returned in timeline creation request to storcon or pushed to +/// cplane in the post migration hook. 
+#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeepersInfo { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub generation: u32, + pub safekeepers: Vec, +} + +#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeeperInfo { + pub id: NodeId, + pub hostname: String, +} + #[derive(Serialize, Deserialize, Clone)] #[serde(untagged)] pub enum TimelineCreateRequestMode { @@ -332,7 +382,8 @@ pub struct ImportPgdataIdempotencyKey(pub String); impl ImportPgdataIdempotencyKey { pub fn random() -> Self { - use rand::{distributions::Alphanumeric, Rng}; + use rand::Rng; + use rand::distributions::Alphanumeric; Self( rand::thread_rng() .sample_iter(&Alphanumeric) @@ -1153,6 +1204,15 @@ pub struct TimelineArchivalConfigRequest { pub state: TimelineArchivalState, } +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct TimelinePatchIndexPartRequest { + pub rel_size_migration: Option, + pub gc_compaction_last_completed_lsn: Option, + pub applied_gc_cutoff_lsn: Option, + #[serde(default)] + pub force_index_update: bool, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelinesInfoAndOffloaded { pub timelines: Vec, @@ -1172,6 +1232,21 @@ pub struct OffloadedTimelineInfo { pub archived_at: chrono::DateTime, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum RelSizeMigration { + /// The tenant is using the old rel_size format. + /// Note that this enum is persisted as `Option` in the index part, so + /// `None` is the same as `Some(RelSizeMigration::Legacy)`. + Legacy, + /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are + /// persisted in the index part. The read path will read both formats and merge them. + Migrating, + /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted + /// in the index part, and the read path will not read the old format. 
+ Migrated, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { @@ -1183,9 +1258,10 @@ pub struct TimelineInfo { pub last_record_lsn: Lsn, pub prev_record_lsn: Option, - /// Legacy field for compat with control plane. Synonym of `min_readable_lsn`. - /// TODO: remove once control plane no longer reads it. - pub latest_gc_cutoff_lsn: Lsn, + /// Legacy field, retained for one version to enable old storage controller to + /// decode (it was a mandatory field). + #[serde(default, rename = "latest_gc_cutoff_lsn")] + pub _unused: Lsn, /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients. /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead, @@ -1250,7 +1326,11 @@ pub struct TimelineInfo { // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does // not deny unknown fields by default so it's safe to set the field to some value, though it won't be // read. + /// Whether the timeline is archived. pub is_archived: Option, + + /// The status of the rel_size migration. + pub rel_size_migration: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -1430,8 +1510,14 @@ pub struct TenantScanRemoteStorageResponse { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "snake_case")] pub enum TenantSorting { + /// Total size of layers on local disk for all timelines in a shard. ResidentSize, + /// The logical size of the largest timeline within a _tenant_ (not shard). Only tracked on + /// shard 0, contains the sum across all shards. MaxLogicalSize, + /// The logical size of the largest timeline within a _tenant_ (not shard), divided by number of + /// shards. Only tracked on shard 0, and estimates the per-shard logical size. 
+ MaxLogicalSizePerShard, } impl Default for TenantSorting { @@ -1461,14 +1547,20 @@ pub struct TopTenantShardsRequest { pub struct TopTenantShardItem { pub id: TenantShardId, - /// Total size of layers on local disk for all timelines in this tenant + /// Total size of layers on local disk for all timelines in this shard. pub resident_size: u64, - /// Total size of layers in remote storage for all timelines in this tenant + /// Total size of layers in remote storage for all timelines in this shard. pub physical_size: u64, - /// The largest logical size of a timeline within this tenant + /// The largest logical size of a timeline within this _tenant_ (not shard). This is only + /// tracked on shard 0, and contains the sum of the logical size across all shards. pub max_logical_size: u64, + + /// The largest logical size of a timeline within this _tenant_ (not shard) divided by number of + /// shards. This is only tracked on shard 0, and is only an estimate as we divide it evenly by + /// shard count, rounded up. + pub max_logical_size_per_shard: u64, } #[derive(Serialize, Deserialize, Debug, Default)] @@ -2288,9 +2380,10 @@ impl Default for PageTraceEvent { #[cfg(test)] mod tests { - use serde_json::json; use std::str::FromStr; + use serde_json::json; + use super::*; #[test] diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 641aa51989..69c240ff3c 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,5 +1,7 @@ use std::time::SystemTime; -use utils::{serde_percent::Percent, serde_system_time}; + +use utils::serde_percent::Percent; +use utils::serde_system_time; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. 
@@ -131,12 +133,12 @@ impl PageserverUtilization { /// Test helper pub mod test_utilization { - use super::PageserverUtilization; use std::time::SystemTime; - use utils::{ - serde_percent::Percent, - serde_system_time::{self}, - }; + + use utils::serde_percent::Percent; + use utils::serde_system_time::{self}; + + use super::PageserverUtilization; // Parameters of the imaginary node used for test utilization instances const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024; diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index bb62b35d36..fda504a26e 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -1,7 +1,7 @@ //! This module defines the WAL record format used within the pageserver. use bytes::Bytes; -use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember}; +use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record}; use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 09d1fae221..473a44dbf9 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -1,10 +1,10 @@ -use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; -use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM}; use postgres_ffi::Oid; +use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; +use postgres_ffi::relfile_utils::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name}; +use serde::{Deserialize, Serialize}; /// /// Relation data file segment id throughout the Postgres cluster. 
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e03df02afb..8386d6e586 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -33,12 +33,13 @@ use std::hash::{Hash, Hasher}; -use crate::{key::Key, models::ShardParameters}; +#[doc(inline)] +pub use ::utils::shard::*; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; -#[doc(inline)] -pub use ::utils::shard::*; +use crate::key::Key; +use crate::models::ShardParameters; /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], /// and to check whether that [`ShardNumber`] is the same as the current shard. @@ -111,6 +112,16 @@ impl ShardIdentity { } } + /// An unsharded identity with the given stripe size (if non-zero). This is typically used to + /// carry over a stripe size for an unsharded tenant from persistent storage. + pub fn unsharded_with_stripe_size(stripe_size: ShardStripeSize) -> Self { + let mut shard_identity = Self::unsharded(); + if stripe_size.0 > 0 { + shard_identity.stripe_size = stripe_size; + } + shard_identity + } + /// A broken instance of this type is only used for `TenantState::Broken` tenants, /// which are constructed in code paths that don't have access to proper configuration. 
/// @@ -337,7 +348,8 @@ pub fn describe( mod tests { use std::str::FromStr; - use utils::{id::TenantId, Hex}; + use utils::Hex; + use utils::id::TenantId; use super::*; diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 2e88836bd0..647d01c3c2 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,9 +6,9 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{ - controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId, -}; +use crate::controller_api::NodeRegisterRequest; +use crate::models::LocationConfigMode; +use crate::shard::TenantShardId; /// Upcall message sent by the pageserver to the configured `control_plane_api` on /// startup. @@ -30,7 +30,7 @@ fn default_mode() -> LocationConfigMode { pub struct ReAttachResponseTenant { pub id: TenantShardId, /// Mandatory if LocationConfigMode is None or set to an Attached* mode - pub gen: Option, + pub r#gen: Option, /// Default value only for backward compat: this field should be set #[serde(default = "default_mode")] @@ -44,7 +44,7 @@ pub struct ReAttachResponse { #[derive(Serialize, Deserialize)] pub struct ValidateRequestTenant { pub id: TenantShardId, - pub gen: u32, + pub r#gen: u32, } #[derive(Serialize, Deserialize)] diff --git a/libs/pageserver_api/src/value.rs b/libs/pageserver_api/src/value.rs index 1f8ed30a9a..883d903ff3 100644 --- a/libs/pageserver_api/src/value.rs +++ b/libs/pageserver_api/src/value.rs @@ -7,10 +7,11 @@ //! Note that the [`Value`] type is used for the permananent storage format, so any //! changes to it must be backwards compatible. 
-use crate::record::NeonWalRecord; use bytes::Bytes; use serde::{Deserialize, Serialize}; +use crate::record::NeonWalRecord; + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value @@ -83,11 +84,11 @@ impl ValueBytes { #[cfg(test)] mod test { - use super::*; - use bytes::Bytes; use utils::bin_ser::BeSer; + use super::*; + macro_rules! roundtrip { ($orig:expr, $expected:expr) => {{ let orig: Value = $orig; diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index f74b229ac4..a0a891f0dc 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -4,28 +4,28 @@ //! is rather narrow, but we can extend it once required. #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use anyhow::Context; -use bytes::Bytes; -use serde::{Deserialize, Serialize}; +use std::future::Future; use std::io::ErrorKind; use std::net::SocketAddr; -use std::os::fd::AsRawFd; -use std::os::fd::RawFd; +use std::os::fd::{AsRawFd, RawFd}; use std::pin::Pin; +use std::str::FromStr; use std::sync::Arc; -use std::task::{ready, Poll}; +use std::task::{Poll, ready}; use std::{fmt, io}; -use std::{future::Future, str::FromStr}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_rustls::TlsAcceptor; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, trace, warn}; +use anyhow::Context; +use bytes::Bytes; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; use pq_proto::{ BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_ADMIN_SHUTDOWN, SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION, }; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, trace, warn}; /// An error, occurred during query processing: /// either during the connection ([`ConnectionError`]) 
or before/after it. @@ -746,7 +746,7 @@ impl PostgresBackend { match e { QueryError::Shutdown => return Ok(ProcessMsgResult::Break), QueryError::SimulatedConnectionError => { - return Err(QueryError::SimulatedConnectionError) + return Err(QueryError::SimulatedConnectionError); } err @ QueryError::Reconnect => { // Instruct the client to reconnect, stop processing messages @@ -1020,7 +1020,9 @@ fn log_query_error(query: &str, e: &QueryError) { } } QueryError::Disconnected(other_connection_error) => { - error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") + error!( + "query handler for '{query}' failed with connection error: {other_connection_error:?}" + ) } QueryError::SimulatedConnectionError => { error!("query handler for query '{query}' failed due to a simulated connection error") diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 3fcfbf4a03..907ef9eed3 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -1,10 +1,11 @@ +use std::io::Cursor; +use std::sync::Arc; + /// Test postgres_backend_async with tokio_postgres use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; use rustls::crypto::ring; -use std::io::Cursor; -use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_postgres::config::SslMode; diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index e3d31c6cfc..cd981b3729 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -1,9 +1,10 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use anyhow::{bail, Context}; -use itertools::Itertools; use std::borrow::Cow; use std::fmt; + +use anyhow::{Context, bail}; +use itertools::Itertools; use url::Host; /// Parses a 
string of format either `host:port` or `host` into a corresponding pair. @@ -29,9 +30,10 @@ pub fn parse_host_port>(host_port: S) -> Result<(Host, Option #[cfg(test)] mod tests_parse_host_port { - use crate::parse_host_port; use url::Host; + use crate::parse_host_port; + #[test] fn test_normal() { let (host, port) = parse_host_port("hello:123").unwrap(); @@ -207,10 +209,11 @@ impl fmt::Debug for PgConnectionConfig { #[cfg(test)] mod tests_pg_connection_config { - use crate::PgConnectionConfig; use once_cell::sync::Lazy; use url::Host; + use crate::PgConnectionConfig; + static STUB_HOST: Lazy = Lazy::new(|| Host::Domain("stub.host.example".to_owned())); #[test] diff --git a/libs/postgres_ffi/benches/waldecoder.rs b/libs/postgres_ffi/benches/waldecoder.rs index c8cf0d322a..2e1d62e452 100644 --- a/libs/postgres_ffi/benches/waldecoder.rs +++ b/libs/postgres_ffi/benches/waldecoder.rs @@ -1,6 +1,6 @@ use std::ffi::CStr; -use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use criterion::{Bencher, Criterion, criterion_group, criterion_main}; use postgres_ffi::v17::wal_generator::LogicalMessageGenerator; use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler; use postgres_ffi::waldecoder::WalStreamDecoder; diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index d3a85f2683..cdebd43f6f 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -4,7 +4,7 @@ use std::env; use std::path::PathBuf; use std::process::Command; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use bindgen::callbacks::{DeriveInfo, ParseCallbacks}; #[derive(Debug)] diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 301bc2f16e..05d8de4c7a 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -21,7 +21,9 @@ macro_rules! 
postgres_ffi { pub mod bindings { // bindgen generates bindings for a lot of stuff we don't need #![allow(dead_code)] + #![allow(unsafe_op_in_unsafe_fn)] #![allow(clippy::undocumented_unsafe_blocks)] + #![allow(clippy::ptr_offset_with_cast)] use serde::{Deserialize, Serialize}; include!(concat!( @@ -43,8 +45,7 @@ macro_rules! postgres_ffi { pub const PG_MAJORVERSION: &str = stringify!($version); // Re-export some symbols from bindings - pub use bindings::DBState_DB_SHUTDOWNED; - pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; + pub use bindings::{CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, XLogRecord}; pub const ZERO_CHECKPOINT: bytes::Bytes = bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]); @@ -221,21 +222,17 @@ pub mod relfile_utils; pub mod walrecord; // Export some widely used datatypes that are unlikely to change across Postgres versions -pub use v14::bindings::RepOriginId; -pub use v14::bindings::{uint32, uint64, Oid}; -pub use v14::bindings::{BlockNumber, OffsetNumber}; -pub use v14::bindings::{MultiXactId, TransactionId}; -pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; - +pub use v14::bindings::{ + BlockNumber, CheckPoint, ControlFileData, MultiXactId, OffsetNumber, Oid, PageHeaderData, + RepOriginId, TimeLineID, TimestampTz, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32, + uint64, +}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; -pub use v14::bindings::{PageHeaderData, XLogRecord}; pub use v14::xlog_utils::{ XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; -pub use v14::bindings::{CheckPoint, ControlFileData}; - // from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and // --with-segsize=SEGSIZE, but assume the defaults for now. 
pub const BLCKSZ: u16 = 8192; @@ -246,13 +243,11 @@ pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; // Export some version independent functions that are used outside of this mod -pub use v14::xlog_utils::encode_logical_message; -pub use v14::xlog_utils::get_current_timestamp; -pub use v14::xlog_utils::to_pg_timestamp; -pub use v14::xlog_utils::try_from_pg_timestamp; -pub use v14::xlog_utils::XLogFileName; - pub use v14::bindings::DBState_DB_SHUTDOWNED; +pub use v14::xlog_utils::{ + XLogFileName, encode_logical_message, get_current_timestamp, to_pg_timestamp, + try_from_pg_timestamp, +}; pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info)) @@ -355,8 +350,9 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber { } pub mod waldecoder { - use bytes::{Buf, Bytes, BytesMut}; use std::num::NonZeroU32; + + use bytes::{Buf, Bytes, BytesMut}; use thiserror::Error; use utils::lsn::Lsn; @@ -400,6 +396,14 @@ pub mod waldecoder { self.lsn + self.inputbuf.remaining() as u64 } + /// Returns the LSN up to which the WAL decoder has processed. + /// + /// If [`Self::poll_decode`] returned a record, then this will return + /// the end LSN of said record. + pub fn lsn(&self) -> Lsn { + self.lsn + } + pub fn feed_bytes(&mut self, buf: &[u8]) { self.inputbuf.extend_from_slice(buf); } diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index e343473d77..b0bdd8a8da 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -9,8 +9,7 @@ //! comments on them. //! 
-use crate::PageHeaderData; -use crate::BLCKSZ; +use crate::{BLCKSZ, PageHeaderData}; // // From pg_tablespace_d.h diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index fce37e2fdd..1ccf4590a9 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -3,18 +3,16 @@ //! //! TODO: Generate separate types for each supported PG version -use crate::pg_constants; -use crate::XLogRecord; -use crate::{ - BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId, TimestampTz, - TransactionId, -}; -use crate::{BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD}; use bytes::{Buf, Bytes}; use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; use utils::lsn::Lsn; +use crate::{ + BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId, + TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants, +}; + #[repr(C)] #[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlMultiXactCreate { @@ -508,9 +506,10 @@ pub fn decode_wal_record( } pub mod v14 { - use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; + use crate::{OffsetNumber, TransactionId}; + #[repr(C)] #[derive(Debug)] pub struct XlHeapInsert { @@ -678,9 +677,10 @@ pub mod v15 { } pub mod v16 { + use bytes::{Buf, Bytes}; + pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange}; use crate::{OffsetNumber, TransactionId}; - use bytes::{Buf, Bytes}; pub struct XlHeapDelete { pub xmax: TransactionId, @@ -746,9 +746,10 @@ pub mod v16 { /* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. 
*/ pub mod rm_neon { - use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; + use crate::{OffsetNumber, TransactionId}; + #[repr(C)] #[derive(Debug)] pub struct XlNeonHeapInsert { @@ -858,14 +859,14 @@ pub mod v16 { } pub mod v17 { - pub use super::v14::XlHeapLockUpdated; - pub use crate::{TimeLineID, TimestampTz}; use bytes::{Buf, Bytes}; - pub use super::v16::rm_neon; + pub use super::v14::XlHeapLockUpdated; pub use super::v16::{ XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange, + rm_neon, }; + pub use crate::{TimeLineID, TimestampTz}; #[repr(C)] #[derive(Debug)] diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 41afcea6c2..6151ce34ac 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -1,7 +1,9 @@ +use std::path::PathBuf; +use std::str::FromStr; + use anyhow::*; -use clap::{value_parser, Arg, ArgMatches, Command}; +use clap::{Arg, ArgMatches, Command, value_parser}; use postgres::Client; -use std::{path::PathBuf, str::FromStr}; use wal_craft::*; fn main() -> Result<()> { diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 77dff4ac99..ca9530faef 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -1,17 +1,18 @@ -use anyhow::{bail, ensure}; -use camino_tempfile::{tempdir, Utf8TempDir}; -use log::*; -use postgres::types::PgLsn; -use postgres::Client; -use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -use postgres_ffi::{ - XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, -}; use std::ffi::OsStr; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; +use anyhow::{bail, ensure}; +use camino_tempfile::{Utf8TempDir, tempdir}; +use log::*; +use postgres::Client; +use postgres::types::PgLsn; +use postgres_ffi::{ + 
WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, + XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; + macro_rules! xlog_utils_test { ($version:ident) => { #[path = "."] diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index ccbb90e384..8e216d0f44 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -10,11 +10,10 @@ //! calls. //! //! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107 +use std::future::Future; +use std::io::{self, ErrorKind}; + use bytes::{Buf, BytesMut}; -use std::{ - future::Future, - io::{self, ErrorKind}, -}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadHalf, WriteHalf}; use crate::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index f99128b76a..e435ffbf7e 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -5,14 +5,15 @@ pub mod framed; +use std::borrow::Cow; +use std::{fmt, io, str}; + use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use itertools::Itertools; -use serde::{Deserialize, Serialize}; -use std::{borrow::Cow, fmt, io, str}; - // re-export for use in utils pageserver_feedback.rs pub use postgres_protocol::PG_EPOCH; +use serde::{Deserialize, Serialize}; pub type Oid = u32; pub type SystemId = u64; @@ -206,8 +207,8 @@ use rand::distributions::{Distribution, Standard}; impl Distribution for Standard { fn sample(&self, rng: &mut R) -> CancelKeyData { CancelKeyData { - backend_pid: rng.gen(), - cancel_key: rng.gen(), + backend_pid: rng.r#gen(), + cancel_key: rng.r#gen(), } } } @@ -1035,7 +1036,7 @@ impl BeMessage<'_> { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_u8(b'0'); // matches INTERPRETED_WAL_RECORD_TAG in postgres-protocol - // dependency + // dependency buf.put_u64(rec.streaming_lsn); buf.put_u64(rec.commit_lsn); buf.put_slice(rec.data); diff --git 
a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index 0ccd8c295f..b6bcabc922 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -135,8 +135,8 @@ impl Type { pub enum Kind { /// A simple type like `VARCHAR` or `INTEGER`. Simple, - /// An enumerated type along with its variants. - Enum(Vec), + /// An enumerated type. + Enum, /// A pseudo-type. Pseudo, /// An array type along with the type of its elements. @@ -146,9 +146,9 @@ pub enum Kind { /// A multirange type along with the type of its elements. Multirange(Type), /// A domain type along with its underlying type. - Domain(Type), - /// A composite type along with information about its fields. - Composite(Vec), + Domain(Oid), + /// A composite type. + Composite(Oid), } /// Information about a field of a composite type. diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs index b65fb571e6..0bdad0b554 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -34,8 +34,13 @@ where .make_tls_connect(hostname) .map_err(|e| Error::tls(e.into()))?; - let socket = - connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?; + let socket = connect_socket::connect_socket( + config.host_addr, + &config.host, + config.port, + config.connect_timeout, + ) + .await?; cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await } diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 39b1db75da..186eb07000 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::fmt; +use std::net::IpAddr; use std::sync::Arc; use std::task::{Context, Poll}; use std::time::Duration; @@ -18,10 +19,10 @@ use crate::config::{Host, SslMode}; use 
crate::connection::{Request, RequestMessages}; use crate::query::RowStream; use crate::simple_query::SimpleQueryStream; -use crate::types::{Oid, ToSql, Type}; +use crate::types::{Oid, Type}; use crate::{ - CancelToken, Error, ReadyForQueryStatus, Row, SimpleQueryMessage, Statement, Transaction, - TransactionBuilder, query, simple_query, slice_iter, + CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Statement, Transaction, + TransactionBuilder, query, simple_query, }; pub struct Responses { @@ -53,26 +54,18 @@ impl Responses { /// A cache of type info and prepared statements for fetching type info /// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] -struct CachedTypeInfo { +pub(crate) struct CachedTypeInfo { /// A statement for basic information for a type from its /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its /// fallback). - typeinfo: Option, - /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY). - typeinfo_composite: Option, - /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or - /// its fallback). - typeinfo_enum: Option, + pub(crate) typeinfo: Option, /// Cache of types already looked up. - types: HashMap, + pub(crate) types: HashMap, } pub struct InnerClient { sender: mpsc::UnboundedSender, - cached_typeinfo: Mutex, /// A buffer to use when writing out postgres commands. 
buffer: Mutex, @@ -90,38 +83,6 @@ impl InnerClient { }) } - pub fn typeinfo(&self) -> Option { - self.cached_typeinfo.lock().typeinfo.clone() - } - - pub fn set_typeinfo(&self, statement: &Statement) { - self.cached_typeinfo.lock().typeinfo = Some(statement.clone()); - } - - pub fn typeinfo_composite(&self) -> Option { - self.cached_typeinfo.lock().typeinfo_composite.clone() - } - - pub fn set_typeinfo_composite(&self, statement: &Statement) { - self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone()); - } - - pub fn typeinfo_enum(&self) -> Option { - self.cached_typeinfo.lock().typeinfo_enum.clone() - } - - pub fn set_typeinfo_enum(&self, statement: &Statement) { - self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone()); - } - - pub fn type_(&self, oid: Oid) -> Option { - self.cached_typeinfo.lock().types.get(&oid).cloned() - } - - pub fn set_type(&self, oid: Oid, type_: &Type) { - self.cached_typeinfo.lock().types.insert(oid, type_.clone()); - } - /// Call the given function with a buffer to be used when writing out /// postgres commands. pub fn with_buf(&self, f: F) -> R @@ -137,10 +98,10 @@ impl InnerClient { #[derive(Clone, Serialize, Deserialize)] pub struct SocketConfig { + pub host_addr: Option, pub host: Host, pub port: u16, pub connect_timeout: Option, - // pub keepalive: Option, } /// An asynchronous PostgreSQL client. @@ -149,6 +110,7 @@ pub struct SocketConfig { /// through this client object. pub struct Client { inner: Arc, + cached_typeinfo: CachedTypeInfo, socket_config: SocketConfig, ssl_mode: SslMode, @@ -167,9 +129,9 @@ impl Client { Client { inner: Arc::new(InnerClient { sender, - cached_typeinfo: Default::default(), buffer: Default::default(), }), + cached_typeinfo: Default::default(), socket_config, ssl_mode, @@ -187,55 +149,6 @@ impl Client { &self.inner } - /// Executes a statement, returning a vector of the resulting rows. 
- /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - pub async fn query( - &self, - statement: Statement, - params: &[&(dyn ToSql + Sync)], - ) -> Result, Error> { - self.query_raw(statement, slice_iter(params)) - .await? - .try_collect() - .await - } - - /// The maximally flexible version of [`query`]. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. 
- /// - /// [`query`]: #method.query - pub async fn query_raw<'a, I>( - &self, - statement: Statement, - params: I, - ) -> Result - where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, - { - query::query(&self.inner, statement, params).await - } - /// Pass text directly to the Postgres backend to allow it to sort out typing itself and /// to save a roundtrip pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result @@ -282,6 +195,14 @@ impl Client { simple_query::batch_execute(self.inner(), query).await } + pub async fn discard_all(&mut self) -> Result { + // clear the prepared statements that are about to be nuked from the postgres session + + self.cached_typeinfo.typeinfo = None; + + self.batch_execute("discard all").await + } + /// Begins a new database transaction. /// /// The transaction will roll back by default - use the `commit` method to commit it. @@ -345,8 +266,8 @@ impl Client { } /// Query for type information - pub async fn get_type(&self, oid: Oid) -> Result { - crate::prepare::get_type(&self.inner, oid).await + pub(crate) async fn get_type_inner(&mut self, oid: Oid) -> Result { + crate::prepare::get_type(&self.inner, &mut self.cached_typeinfo, oid).await } /// Determines if the connection to the server has already closed. diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 4c25491b67..978d348741 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -1,5 +1,6 @@ //! Connection configuration. +use std::net::IpAddr; use std::time::Duration; use std::{fmt, str}; @@ -65,6 +66,7 @@ pub enum AuthKeys { /// Connection configuration. #[derive(Clone, PartialEq, Eq)] pub struct Config { + pub(crate) host_addr: Option, pub(crate) host: Host, pub(crate) port: u16, @@ -83,6 +85,7 @@ impl Config { /// Creates a new configuration. 
pub fn new(host: String, port: u16) -> Config { Config { + host_addr: None, host: Host::Tcp(host), port, password: None, @@ -163,6 +166,15 @@ impl Config { self } + pub fn set_host_addr(&mut self, addr: IpAddr) -> &mut Config { + self.host_addr = Some(addr); + self + } + + pub fn get_host_addr(&self) -> Option { + self.host_addr + } + /// Sets the SSL configuration. /// /// Defaults to `prefer`. diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index d2bd0dfbcd..7c3a358bba 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,3 +1,5 @@ +use std::net::IpAddr; + use postgres_protocol2::message::backend::Message; use tokio::net::TcpStream; use tokio::sync::mpsc; @@ -25,13 +27,14 @@ where .make_tls_connect(hostname) .map_err(|e| Error::tls(e.into()))?; - match connect_once(&config.host, config.port, tls, config).await { + match connect_once(config.host_addr, &config.host, config.port, tls, config).await { Ok((client, connection)) => Ok((client, connection)), Err(e) => Err(e), } } async fn connect_once( + host_addr: Option, host: &Host, port: u16, tls: T, @@ -40,7 +43,7 @@ async fn connect_once( where T: TlsConnect, { - let socket = connect_socket(host, port, config.connect_timeout).await?; + let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?; let RawConnection { stream, parameters, @@ -50,6 +53,7 @@ where } = connect_raw(socket, tls, config).await?; let socket_config = SocketConfig { + host_addr, host: host.clone(), port, connect_timeout: config.connect_timeout, diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs index 15411f7ef3..8c7d300451 100644 --- a/libs/proxy/tokio-postgres2/src/connect_socket.rs +++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs @@ -1,5 +1,6 @@ use std::future::Future; use std::io; +use std::net::{IpAddr, SocketAddr}; use std::time::Duration; use 
tokio::net::{self, TcpStream}; @@ -9,15 +10,20 @@ use crate::Error; use crate::config::Host; pub(crate) async fn connect_socket( + host_addr: Option, host: &Host, port: u16, connect_timeout: Option, ) -> Result { match host { Host::Tcp(host) => { - let addrs = net::lookup_host((&**host, port)) - .await - .map_err(Error::connect)?; + let addrs = match host_addr { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => net::lookup_host((&**host, port)) + .await + .map_err(Error::connect)? + .collect(), + }; let mut last_err = None; diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 31c3d8fa3e..8e28843347 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -22,7 +22,7 @@ pub trait GenericClient: private::Sealed { I::IntoIter: ExactSizeIterator + Sync + Send; /// Query for type information - async fn get_type(&self, oid: Oid) -> Result; + async fn get_type(&mut self, oid: Oid) -> Result; } impl private::Sealed for Client {} @@ -38,8 +38,8 @@ impl GenericClient for Client { } /// Query for type information - async fn get_type(&self, oid: Oid) -> Result { - crate::prepare::get_type(self.inner(), oid).await + async fn get_type(&mut self, oid: Oid) -> Result { + self.get_type_inner(oid).await } } @@ -56,7 +56,7 @@ impl GenericClient for Transaction<'_> { } /// Query for type information - async fn get_type(&self, oid: Oid) -> Result { - self.client().get_type(oid).await + async fn get_type(&mut self, oid: Oid) -> Result { + self.client_mut().get_type(oid).await } } diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index b36d2e5f74..ba13a528f6 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -9,10 +9,10 @@ use log::debug; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; -use 
crate::client::InnerClient; +use crate::client::{CachedTypeInfo, InnerClient}; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; -use crate::types::{Field, Kind, Oid, Type}; +use crate::types::{Kind, Oid, Type}; use crate::{Column, Error, Statement, query, slice_iter}; pub(crate) const TYPEINFO_QUERY: &str = "\ @@ -23,23 +23,7 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; -const TYPEINFO_ENUM_QUERY: &str = "\ -SELECT enumlabel -FROM pg_catalog.pg_enum -WHERE enumtypid = $1 -ORDER BY enumsortorder -"; - -pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ -SELECT attname, atttypid -FROM pg_catalog.pg_attribute -WHERE attrelid = $1 -AND NOT attisdropped -AND attnum > 0 -ORDER BY attnum -"; - -pub async fn prepare( +async fn prepare_typecheck( client: &Arc, name: &'static str, query: &str, @@ -67,7 +51,7 @@ pub async fn prepare( let mut parameters = vec![]; let mut it = parameter_description.parameters(); while let Some(oid) = it.next().map_err(Error::parse)? { - let type_ = get_type(client, oid).await?; + let type_ = Type::from_oid(oid).ok_or_else(Error::unexpected_message)?; parameters.push(type_); } @@ -75,7 +59,7 @@ pub async fn prepare( if let Some(row_description) = row_description { let mut it = row_description.fields(); while let Some(field) = it.next().map_err(Error::parse)? 
{ - let type_ = get_type(client, field.type_oid()).await?; + let type_ = Type::from_oid(field.type_oid()).ok_or_else(Error::unexpected_message)?; let column = Column::new(field.name().to_string(), type_, field); columns.push(column); } @@ -84,15 +68,6 @@ pub async fn prepare( Ok(Statement::new(client, name, parameters, columns)) } -fn prepare_rec<'a>( - client: &'a Arc, - name: &'static str, - query: &'a str, - types: &'a [Type], -) -> Pin> + 'a + Send>> { - Box::pin(prepare(client, name, query, types)) -} - fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { if types.is_empty() { debug!("preparing query {}: {}", name, query); @@ -108,16 +83,20 @@ fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Resu }) } -pub async fn get_type(client: &Arc, oid: Oid) -> Result { +pub async fn get_type( + client: &Arc, + typecache: &mut CachedTypeInfo, + oid: Oid, +) -> Result { if let Some(type_) = Type::from_oid(oid) { return Ok(type_); } - if let Some(type_) = client.type_(oid) { - return Ok(type_); - } + if let Some(type_) = typecache.types.get(&oid) { + return Ok(type_.clone()); + }; - let stmt = typeinfo_statement(client).await?; + let stmt = typeinfo_statement(client, typecache).await?; let rows = query::query(client, stmt, slice_iter(&[&oid])).await?; pin_mut!(rows); @@ -136,100 +115,48 @@ pub async fn get_type(client: &Arc, oid: Oid) -> Result( client: &'a Arc, + typecache: &'a mut CachedTypeInfo, oid: Oid, ) -> Pin> + Send + 'a>> { - Box::pin(get_type(client, oid)) + Box::pin(get_type(client, typecache, oid)) } -async fn typeinfo_statement(client: &Arc) -> Result { - if let Some(stmt) = client.typeinfo() { - return Ok(stmt); +async fn typeinfo_statement( + client: &Arc, + typecache: &mut CachedTypeInfo, +) -> Result { + if let Some(stmt) = &typecache.typeinfo { + return Ok(stmt.clone()); } let typeinfo = "neon_proxy_typeinfo"; - let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?; + let stmt = 
prepare_typecheck(client, typeinfo, TYPEINFO_QUERY, &[]).await?; - client.set_typeinfo(&stmt); - Ok(stmt) -} - -async fn get_enum_variants(client: &Arc, oid: Oid) -> Result, Error> { - let stmt = typeinfo_enum_statement(client).await?; - - query::query(client, stmt, slice_iter(&[&oid])) - .await? - .and_then(|row| async move { row.try_get(0) }) - .try_collect() - .await -} - -async fn typeinfo_enum_statement(client: &Arc) -> Result { - if let Some(stmt) = client.typeinfo_enum() { - return Ok(stmt); - } - - let typeinfo = "neon_proxy_typeinfo_enum"; - let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?; - - client.set_typeinfo_enum(&stmt); - Ok(stmt) -} - -async fn get_composite_fields(client: &Arc, oid: Oid) -> Result, Error> { - let stmt = typeinfo_composite_statement(client).await?; - - let rows = query::query(client, stmt, slice_iter(&[&oid])) - .await? - .try_collect::>() - .await?; - - let mut fields = vec![]; - for row in rows { - let name = row.try_get(0)?; - let oid = row.try_get(1)?; - let type_ = get_type_rec(client, oid).await?; - fields.push(Field::new(name, type_)); - } - - Ok(fields) -} - -async fn typeinfo_composite_statement(client: &Arc) -> Result { - if let Some(stmt) = client.typeinfo_composite() { - return Ok(stmt); - } - - let typeinfo = "neon_proxy_typeinfo_composite"; - let stmt = prepare_rec(client, typeinfo, TYPEINFO_COMPOSITE_QUERY, &[]).await?; - - client.set_typeinfo_composite(&stmt); + typecache.typeinfo = Some(stmt.clone()); Ok(stmt) } diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs index eecbfc5873..f32603470f 100644 --- a/libs/proxy/tokio-postgres2/src/transaction.rs +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -72,4 +72,9 @@ impl<'a> Transaction<'a> { pub fn client(&self) -> &Client { self.client } + + /// Returns a reference to the underlying `Client`. 
+ pub fn client_mut(&mut self) -> &mut Client { + self.client + } } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 33fa6e89f5..7bdf340f74 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "remote_storage" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 9027a8bf55..dee61a410d 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -2,33 +2,26 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::env; use std::fmt::Display; -use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::SystemTime; +use std::time::{Duration, SystemTime}; +use std::{env, io}; -use super::REMOTE_STORAGE_PREFIX_SEPARATOR; -use anyhow::Context; -use anyhow::Result; +use anyhow::{Context, Result}; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; -use azure_core::HttpClient; -use azure_core::TransportOptions; -use azure_core::{Continuable, RetryOptions}; +use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions}; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; -use azure_storage_blobs::prelude::ClientBuilder; -use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; +use azure_storage_blobs::blob::operations::GetBlobBuilder; +use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; use bytes::Bytes; +use futures::FutureExt; use futures::future::Either; use futures::stream::Stream; -use futures::FutureExt; -use futures_util::StreamExt; -use futures_util::TryStreamExt; +use futures_util::{StreamExt, TryStreamExt}; use http_types::{StatusCode, Url}; use scopeguard::ScopeGuard; use 
tokio_util::sync::CancellationToken; @@ -36,12 +29,13 @@ use tracing::debug; use utils::backoff; use utils::backoff::exponential_backoff_duration_seconds; -use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; -use crate::DownloadKind; +use super::REMOTE_STORAGE_PREFIX_SEPARATOR; +use crate::config::AzureConfig; +use crate::error::Cancelled; +use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests}; use crate::{ - config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, - DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata, - TimeTravelError, TimeoutOrCancel, + ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode, + ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index ff34158c9c..52978be5b4 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -1,8 +1,10 @@ -use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; +use std::fmt::Debug; +use std::num::NonZeroUsize; +use std::str::FromStr; +use std::time::Duration; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; - use serde::{Deserialize, Serialize}; use crate::{ diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 69b522d63e..6eb5570d9b 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -18,40 +18,35 @@ mod s3_bucket; mod simulate_failures; mod support; -use std::{ - collections::HashMap, - fmt::Debug, - num::NonZeroU32, - ops::Bound, - pin::{pin, Pin}, - sync::Arc, - time::SystemTime, -}; +use std::collections::HashMap; +use std::fmt::Debug; +use std::num::NonZeroU32; +use std::ops::Bound; +use std::pin::{Pin, pin}; +use std::sync::Arc; +use std::time::SystemTime; use 
anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; - +/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. +pub use azure_core::Etag; use bytes::Bytes; -use futures::{stream::Stream, StreamExt}; +use camino::{Utf8Path, Utf8PathBuf}; +pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; +use futures::StreamExt; +use futures::stream::Stream; use itertools::Itertools as _; +use s3_bucket::RequestKind; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::info; -pub use self::{ - azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket, - simulate_failures::UnreliableWrapper, -}; -use s3_bucket::RequestKind; - +pub use self::azure_blob::AzureBlobStorage; +pub use self::local_fs::LocalFs; +pub use self::s3_bucket::S3Bucket; +pub use self::simulate_failures::UnreliableWrapper; pub use crate::config::{AzureConfig, RemoteStorageConfig, RemoteStorageKind, S3Config}; -/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. 
-pub use azure_core::Etag; - -pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; - /// Default concurrency limit for S3 operations /// /// Currently, sync happens with AWS S3, that has two limits on requests per second: @@ -640,8 +635,13 @@ impl GenericRemoteStorage { let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "".into()); let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", - s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); + info!( + "Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", + s3_config.bucket_name, + s3_config.bucket_region, + s3_config.prefix_in_bucket, + s3_config.endpoint + ); Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?)) } RemoteStorageKind::AzureContainer(azure_config) => { @@ -649,8 +649,12 @@ impl GenericRemoteStorage { .storage_account .as_deref() .unwrap_or(""); - info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", - azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); + info!( + "Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", + azure_config.container_name, + azure_config.container_region, + azure_config.prefix_in_container + ); Self::AzureBlob(Arc::new(AzureBlobStorage::new( azure_config, timeout, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index a8b00173ba..f03d6ac8ee 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -4,31 +4,26 @@ //! 
This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. -use std::{ - collections::HashSet, - io::ErrorKind, - num::NonZeroU32, - time::{Duration, SystemTime, UNIX_EPOCH}, -}; +use std::collections::HashSet; +use std::io::ErrorKind; +use std::num::NonZeroU32; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use anyhow::{bail, ensure, Context}; +use anyhow::{Context, bail, ensure}; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use futures::stream::Stream; -use tokio::{ - fs, - io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, -}; -use tokio_util::{io::ReaderStream, sync::CancellationToken}; +use tokio::fs; +use tokio::io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio_util::io::ReaderStream; +use tokio_util::sync::CancellationToken; use utils::crashsafe::path_with_suffix_extension; -use crate::{ - Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, - TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, -}; - use super::{RemoteStorage, StorageMetadata}; -use crate::Etag; +use crate::{ + Download, DownloadError, DownloadOpts, Etag, Listing, ListingMode, ListingObject, + REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, TimeTravelError, TimeoutOrCancel, +}; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; @@ -91,7 +86,8 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - use std::{future::Future, pin::Pin}; + use std::future::Future; + use std::pin::Pin; fn get_all_files<'a, P>( directory_path: P, ) -> Pin>> + Send + Sync + 'a>> @@ -284,7 +280,9 @@ impl LocalFs { })?; if bytes_read < from_size_bytes { - bail!("Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes"); + bail!( + "Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes" + ); } // Check if there is any extra data after the given size. 
let mut from = buffer_to_read.into_inner(); @@ -642,10 +640,13 @@ fn mock_etag(meta: &std::fs::Metadata) -> Etag { #[cfg(test)] mod fs_tests { - use super::*; + use std::collections::HashMap; + use std::io::Write; + use std::ops::Bound; use camino_tempfile::tempdir; - use std::{collections::HashMap, io::Write, ops::Bound}; + + use super::*; async fn read_and_check_metadata( storage: &LocalFs, @@ -736,9 +737,14 @@ mod fs_tests { ); let non_existing_path = RemotePath::new(Utf8Path::new("somewhere/else"))?; - match storage.download(&non_existing_path, &DownloadOpts::default(), &cancel).await { + match storage + .download(&non_existing_path, &DownloadOpts::default(), &cancel) + .await + { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys - other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), + other => panic!( + "Should get a NotFound error when downloading non-existing storage files, but got: {other:?}" + ), } Ok(()) } diff --git a/libs/remote_storage/src/metrics.rs b/libs/remote_storage/src/metrics.rs index 48c121fbc8..81e68e9a29 100644 --- a/libs/remote_storage/src/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -1,5 +1,5 @@ use metrics::{ - register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter, + Histogram, IntCounter, register_histogram_vec, register_int_counter, register_int_counter_vec, }; use once_cell::sync::Lazy; @@ -16,8 +16,8 @@ pub(crate) enum RequestKind { Head = 6, } -use scopeguard::ScopeGuard; use RequestKind::*; +use scopeguard::ScopeGuard; impl RequestKind { const fn as_str(&self) -> &'static str { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index d3f19f0b11..ba7ce9e1e7 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,56 +4,50 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! 
their bucket prefixes are both specified and different. -use std::{ - borrow::Cow, - collections::HashMap, - num::NonZeroU32, - pin::Pin, - sync::Arc, - task::{Context, Poll}, - time::{Duration, SystemTime}, -}; +use std::borrow::Cow; +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::time::{Duration, SystemTime}; -use anyhow::{anyhow, Context as _}; -use aws_config::{ - default_provider::credentials::DefaultCredentialsChain, - retry::{RetryConfigBuilder, RetryMode}, - BehaviorVersion, -}; -use aws_sdk_s3::{ - config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, - error::SdkError, - operation::{get_object::GetObjectError, head_object::HeadObjectError}, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, - Client, -}; +use anyhow::{Context as _, anyhow}; +use aws_config::BehaviorVersion; +use aws_config::default_provider::credentials::DefaultCredentialsChain; +use aws_config::retry::{RetryConfigBuilder, RetryMode}; +use aws_sdk_s3::Client; +use aws_sdk_s3::config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}; +use aws_sdk_s3::error::SdkError; +use aws_sdk_s3::operation::get_object::GetObjectError; +use aws_sdk_s3::operation::head_object::HeadObjectError; +use aws_sdk_s3::types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}; use aws_smithy_async::rt::sleep::TokioSleep; -use http_body_util::StreamBody; -use http_types::StatusCode; - -use aws_smithy_types::{body::SdkBody, DateTime}; -use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; +use aws_smithy_types::DateTime; +use aws_smithy_types::body::SdkBody; +use aws_smithy_types::byte_stream::ByteStream; +use aws_smithy_types::date_time::ConversionError; use bytes::Bytes; use futures::stream::Stream; use futures_util::StreamExt; +use http_body_util::StreamBody; +use http_types::StatusCode; use hyper::body::Frame; use 
scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; use super::StorageMetadata; -use crate::{ - config::S3Config, - error::Cancelled, - metrics::{start_counting_cancelled_wait, start_measuring_requests}, - support::PermitCarrying, - ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, - RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE_S3, - REMOTE_STORAGE_PREFIX_SEPARATOR, -}; - -use crate::metrics::AttemptOutcome; +use crate::config::S3Config; +use crate::error::Cancelled; pub(super) use crate::metrics::RequestKind; +use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests}; +use crate::support::PermitCarrying; +use crate::{ + ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, + MAX_KEYS_PER_DELETE_S3, REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, RemoteStorage, + TimeTravelError, TimeoutOrCancel, +}; /// AWS S3 storage. pub struct S3Bucket { @@ -958,8 +952,10 @@ impl RemoteStorage for S3Bucket { version_id, key, .. 
} = &vd; if version_id == "null" { - return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \ - indicating either disabled versioning, or legacy objects with null version id values"))); + return Err(TimeTravelError::Other(anyhow!( + "Received ListVersions response for key={key} with version_id='null', \ + indicating either disabled versioning, or legacy objects with null version id values" + ))); } tracing::trace!( "Parsing version key={key} version_id={version_id} kind={:?}", @@ -1126,9 +1122,10 @@ impl VerOrDelete { #[cfg(test)] mod tests { - use camino::Utf8Path; use std::num::NonZeroUsize; + use camino::Utf8Path; + use crate::{RemotePath, S3Bucket, S3Config}; #[tokio::test] diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 63c24beb51..f56be873c4 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -1,14 +1,15 @@ //! This module provides a wrapper around a real RemoteStorage implementation that //! causes the first N attempts at each upload or download operatio to fail. For //! testing purposes. 
-use bytes::Bytes; -use futures::stream::Stream; -use futures::StreamExt; use std::collections::HashMap; +use std::collections::hash_map::Entry; use std::num::NonZeroU32; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::SystemTime; -use std::{collections::hash_map::Entry, sync::Arc}; + +use bytes::Bytes; +use futures::StreamExt; +use futures::stream::Stream; use tokio_util::sync::CancellationToken; use crate::{ diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs index 1ed9ed9305..07da38cf77 100644 --- a/libs/remote_storage/src/support.rs +++ b/libs/remote_storage/src/support.rs @@ -1,9 +1,7 @@ -use std::{ - future::Future, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; +use std::time::Duration; use bytes::Bytes; use futures_util::Stream; @@ -114,9 +112,10 @@ pub(crate) fn cancel_or_timeout( #[cfg(test)] mod tests { + use futures::stream::StreamExt; + use super::*; use crate::DownloadError; - use futures::stream::StreamExt; #[tokio::test(start_paused = true)] async fn cancelled_download_stream() { diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index d5da1d48e9..6a78ddc01e 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,19 +1,20 @@ +use std::collections::HashSet; +use std::num::NonZeroU32; +use std::ops::Bound; +use std::sync::Arc; + use anyhow::Context; use camino::Utf8Path; use futures::StreamExt; use remote_storage::{DownloadError, DownloadOpts, ListingMode, ListingObject, RemotePath}; -use std::ops::Bound; -use std::sync::Arc; -use std::{collections::HashSet, num::NonZeroU32}; use test_context::test_context; use tokio_util::sync::CancellationToken; use tracing::debug; -use crate::common::{download_to_vec, upload_stream, wrap_stream}; - use super::{ MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, 
MaybeEnabledStorageWithTestBlobs, }; +use crate::common::{download_to_vec, upload_stream, wrap_stream}; /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. @@ -62,7 +63,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .into_iter() .collect::>(); assert_eq!( - root_remote_prefixes, HashSet::from([base_prefix.clone()]), + root_remote_prefixes, + HashSet::from([base_prefix.clone()]), "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" ); @@ -84,7 +86,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .difference(&nested_remote_prefixes) .collect::>(); assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), + 0, "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); @@ -119,7 +122,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .difference(&nested_remote_prefixes_combined) .collect::>(); assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), + 0, "remote storage nested prefixes list mismatches with the uploads. 
Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 15004dbf83..31c9ca3200 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -1,9 +1,9 @@ +use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::UNIX_EPOCH; -use std::{collections::HashSet, time::Duration}; +use std::time::{Duration, UNIX_EPOCH}; use anyhow::Context; use remote_storage::{ @@ -208,7 +208,7 @@ async fn create_azure_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().gen::(); + let random = rand::thread_rng().r#gen::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig { diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index e60ec18c93..6996bb27ae 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,13 +1,12 @@ +use std::collections::HashSet; use std::env; use std::fmt::{Debug, Display}; use std::future::Future; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::{Duration, UNIX_EPOCH}; -use std::{collections::HashSet, time::SystemTime}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; use camino::Utf8Path; use futures_util::StreamExt; @@ -15,12 +14,13 @@ use remote_storage::{ DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; -use test_context::test_context; -use test_context::AsyncTestContext; +use test_context::{AsyncTestContext, test_context}; use 
tokio::io::AsyncBufReadExt; use tokio_util::sync::CancellationToken; use tracing::info; +use crate::common::{download_to_vec, upload_stream}; + mod common; #[path = "common/tests.rs"] @@ -128,8 +128,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: let t0_hwt = t0 + half_wt; let t1_hwt = t1 - half_wt; if !(t0_hwt..=t1_hwt).contains(&last_modified) { - panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ - This likely means a large lock discrepancy between S3 and the local clock."); + panic!( + "last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ + This likely means a large lock discrepancy between S3 and the local clock." + ); } } @@ -383,7 +385,7 @@ async fn create_s3_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().gen::(); + let random = rand::thread_rng().r#gen::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AwsS3(S3Config { diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 6b72ace019..d9d080e8fe 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "safekeeper_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index 8b14a4f290..3d4d17096e 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -2,7 +2,8 @@ //! rfcs/035-safekeeper-dynamic-membership-change.md //! for details. 
-use std::{collections::HashSet, fmt::Display}; +use std::collections::HashSet; +use std::fmt::Display; use anyhow; use anyhow::bail; @@ -68,14 +69,12 @@ impl Display for SafekeeperId { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(transparent)] pub struct MemberSet { - pub members: Vec, + pub m: Vec, } impl MemberSet { pub fn empty() -> Self { - MemberSet { - members: Vec::new(), - } + MemberSet { m: Vec::new() } } pub fn new(members: Vec) -> anyhow::Result { @@ -83,21 +82,21 @@ impl MemberSet { if hs.len() != members.len() { bail!("duplicate safekeeper id in the set {:?}", members); } - Ok(MemberSet { members }) + Ok(MemberSet { m: members }) } - pub fn contains(&self, sk: &SafekeeperId) -> bool { - self.members.iter().any(|m| m.id == sk.id) + pub fn contains(&self, sk: NodeId) -> bool { + self.m.iter().any(|m| m.id == sk) } pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> { - if self.contains(&sk) { + if self.contains(sk.id) { bail!(format!( "sk {} is already member of the set {}", sk.id, self )); } - self.members.push(sk); + self.m.push(sk); Ok(()) } } @@ -105,11 +104,7 @@ impl MemberSet { impl Display for MemberSet { /// Display as a comma separated list of members. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let sks_str = self - .members - .iter() - .map(|m| m.to_string()) - .collect::>(); + let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::>(); write!(f, "({})", sks_str.join(", ")) } } @@ -135,6 +130,19 @@ impl Configuration { new_members: None, } } + + pub fn new(members: MemberSet) -> Self { + Configuration { + generation: INITIAL_GENERATION, + members, + new_members: None, + } + } + + /// Is `sk_id` member of the configuration? 
+ pub fn contains(&self, sk_id: NodeId) -> bool { + self.members.contains(sk_id) || self.new_members.as_ref().is_some_and(|m| m.contains(sk_id)) + } } impl Display for Configuration { @@ -154,9 +162,10 @@ impl Display for Configuration { #[cfg(test)] mod tests { - use super::{MemberSet, SafekeeperId}; use utils::id::NodeId; + use super::{MemberSet, SafekeeperId}; + #[test] fn test_member_set() { let mut members = MemberSet::empty(); diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 41ccdaa428..6bdc651668 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,25 +1,24 @@ //! Types used in safekeeper http API. Many of them are also reused internally. +use std::net::SocketAddr; + use pageserver_api::shard::ShardIdentity; use postgres_ffi::TimestampTz; use serde::{Deserialize, Serialize}; -use std::net::SocketAddr; use tokio::time::Instant; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, - pageserver_feedback::PageserverFeedback, -}; - -use crate::{membership::Configuration, ServerInfo, Term}; +use crate::membership::Configuration; +use crate::{ServerInfo, Term}; #[derive(Debug, Serialize)] pub struct SafekeeperStatus { pub id: NodeId, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct TimelineCreateRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, @@ -222,6 +221,11 @@ pub struct TimelineMembershipSwitchResponse { pub current_conf: Configuration, } +#[derive(Clone, Copy, Serialize, Deserialize)] +pub struct TimelineDeleteResult { + pub dir_existed: bool, +} + fn lsn_invalid() -> Lsn { Lsn::INVALID } @@ -284,7 +288,7 @@ pub struct SafekeeperUtilization { } /// pull_timeline request body. 
-#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Clone, Deserialize, Serialize)] pub struct PullTimelineRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index be00562219..d54876ba2c 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -130,11 +130,7 @@ impl StorageModel { break; } } - if possible { - Some(snapshot_later) - } else { - None - } + if possible { Some(snapshot_later) } else { None } } else { None }; diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index 25ebb1c3d8..a3bc937f52 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -76,7 +76,10 @@ pub fn draw_svg( let mut result = String::new(); - writeln!(result, "")?; + writeln!( + result, + "" + )?; draw.calculate_svg_layout(); diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs index 2168beee88..8560d0718c 100644 --- a/libs/tracing-utils/src/http.rs +++ b/libs/tracing-utils/src/http.rs @@ -1,8 +1,8 @@ //! Tracing wrapper for Hyper HTTP server -use hyper0::HeaderMap; -use hyper0::{Body, Request, Response}; use std::future::Future; + +use hyper0::{Body, HeaderMap, Request, Response}; use tracing::Instrument; use tracing_opentelemetry::OpenTelemetrySpanExt; diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index 818d759eac..74992a7d03 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -21,7 +21,7 @@ //! .with_writer(std::io::stderr); //! //! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces -//! let otlp_layer = tracing_utils::init_tracing("my_application").await; +//! let otlp_layer = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default()).await; //! //! // Put it all together //! 
tracing_subscriber::registry() @@ -36,11 +36,15 @@ pub mod http; -use opentelemetry::trace::TracerProvider; use opentelemetry::KeyValue; -use tracing::Subscriber; -use tracing_subscriber::registry::LookupSpan; +use opentelemetry::trace::TracerProvider; +use opentelemetry_otlp::WithExportConfig; +pub use opentelemetry_otlp::{ExportConfig, Protocol}; +use tracing::level_filters::LevelFilter; +use tracing::{Dispatch, Subscriber}; use tracing_subscriber::Layer; +use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::registry::LookupSpan; /// Set up OpenTelemetry exporter, using configuration from environment variables. /// @@ -69,19 +73,28 @@ use tracing_subscriber::Layer; /// /// This doesn't block, but is marked as 'async' to hint that this must be called in /// asynchronous execution context. -pub async fn init_tracing(service_name: &str) -> Option> +pub async fn init_tracing( + service_name: &str, + export_config: ExportConfig, +) -> Option> where S: Subscriber + for<'span> LookupSpan<'span>, { if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { return None; }; - Some(init_tracing_internal(service_name.to_string())) + Some(init_tracing_internal( + service_name.to_string(), + export_config, + )) } /// Like `init_tracing`, but creates a separate tokio Runtime for the tracing /// tasks. 
-pub fn init_tracing_without_runtime(service_name: &str) -> Option> +pub fn init_tracing_without_runtime( + service_name: &str, + export_config: ExportConfig, +) -> Option> where S: Subscriber + for<'span> LookupSpan<'span>, { @@ -112,16 +125,22 @@ where )); let _guard = runtime.enter(); - Some(init_tracing_internal(service_name.to_string())) + Some(init_tracing_internal( + service_name.to_string(), + export_config, + )) } -fn init_tracing_internal(service_name: String) -> impl Layer +fn init_tracing_internal(service_name: String, export_config: ExportConfig) -> impl Layer where S: Subscriber + for<'span> LookupSpan<'span>, { - // Sets up exporter from the OTEL_EXPORTER_* environment variables. + // Sets up exporter from the provided [`ExportConfig`] parameter. + // If the endpoint is not specified, it is loaded from the + // OTEL_EXPORTER_OTLP_ENDPOINT environment variable. let exporter = opentelemetry_otlp::SpanExporter::builder() .with_http() + .with_export_config(export_config) .build() .expect("could not initialize opentelemetry exporter"); @@ -151,3 +170,51 @@ where pub fn shutdown_tracing() { opentelemetry::global::shutdown_tracer_provider(); } + +pub enum OtelEnablement { + Disabled, + Enabled { + service_name: String, + export_config: ExportConfig, + runtime: &'static tokio::runtime::Runtime, + }, +} + +pub struct OtelGuard { + pub dispatch: Dispatch, +} + +impl Drop for OtelGuard { + fn drop(&mut self) { + shutdown_tracing(); + } +} + +/// Initializes OTEL infrastructure for performance tracing according to the provided configuration +/// +/// Performance tracing is handled by a different [`tracing::Subscriber`]. This functions returns +/// an [`OtelGuard`] containing a [`tracing::Dispatch`] associated with a newly created subscriber. +/// Applications should use this dispatch for their performance traces. +/// +/// The lifetime of the guard should match taht of the application. On drop, it tears down the +/// OTEL infra. 
+pub fn init_performance_tracing(otel_enablement: OtelEnablement) -> Option { + let otel_subscriber = match otel_enablement { + OtelEnablement::Disabled => None, + OtelEnablement::Enabled { + service_name, + export_config, + runtime, + } => { + let otel_layer = runtime + .block_on(init_tracing(&service_name, export_config)) + .with_filter(LevelFilter::INFO); + let otel_subscriber = tracing_subscriber::registry().with(otel_layer); + let otel_dispatch = Dispatch::new(otel_subscriber); + + Some(otel_dispatch) + } + }; + + otel_subscriber.map(|dispatch| OtelGuard { dispatch }) +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 5020d82adf..4180602ac7 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -15,7 +15,6 @@ arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true anyhow.workspace = true -backtrace.workspace = true bincode.workspace = true bytes.workspace = true camino.workspace = true @@ -43,6 +42,7 @@ toml_edit = { workspace = true, features = ["serde"] } tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } +tracing-utils.workspace = true rand.workspace = true scopeguard.workspace = true strum.workspace = true diff --git a/libs/utils/benches/README.md b/libs/utils/benches/README.md index e23ec268c2..5afbe3cf2b 100644 --- a/libs/utils/benches/README.md +++ b/libs/utils/benches/README.md @@ -10,14 +10,14 @@ cargo bench --package utils cargo bench --package utils --bench benchmarks # Specific benchmark. -cargo bench --package utils --bench benchmarks warn_slow/enabled=true +cargo bench --package utils --bench benchmarks log_slow/enabled=true # List available benchmarks. cargo bench --package utils --benches -- --list # Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. # Output in target/criterion/*/profile/flamegraph.svg. 
-cargo bench --package utils --bench benchmarks warn_slow/enabled=true --profile-time 10 +cargo bench --package utils --bench benchmarks log_slow/enabled=true --profile-time 10 ``` Additional charts and statistics are available in `target/criterion/report/index.html`. diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index cff3792f3a..35f3baaed1 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,16 +1,16 @@ use std::time::Duration; -use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use criterion::{Bencher, Criterion, criterion_group, criterion_main}; use pprof::criterion::{Output, PProfProfiler}; use utils::id; -use utils::logging::warn_slow; +use utils::logging::log_slow; // Register benchmarks with Criterion. criterion_group!( name = benches; config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); targets = bench_id_stringify, - bench_warn_slow, + bench_log_slow, ); criterion_main!(benches); @@ -29,9 +29,9 @@ pub fn bench_id_stringify(c: &mut Criterion) { }); } -pub fn bench_warn_slow(c: &mut Criterion) { +pub fn bench_log_slow(c: &mut Criterion) { for enabled in [false, true] { - c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| { + c.bench_function(&format!("log_slow/enabled={enabled}"), |b| { run_bench(b, enabled).unwrap() }); } @@ -45,11 +45,17 @@ pub fn bench_warn_slow(c: &mut Criterion) { .enable_all() .build()?; - // Test both with and without warn_slow, since we're essentially measuring Tokio scheduling + // Test both with and without log_slow, since we're essentially measuring Tokio scheduling // performance too. Use a simple noop future that yields once, to avoid any scheduler fast // paths for a ready future. 
if enabled { - b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now()))); + b.iter(|| { + runtime.block_on(log_slow( + "ready", + THRESHOLD, + std::pin::pin!(tokio::task::yield_now()), + )) + }); } else { b.iter(|| runtime.block_on(tokio::task::yield_now())); } diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 4bfd0ab055..cc5b0b1d13 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -1,12 +1,15 @@ // For details about authentication see docs/authentication.md -use arc_swap::ArcSwap; -use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; +use std::borrow::Cow; +use std::fmt::Display; +use std::fs; +use std::sync::Arc; use anyhow::Result; +use arc_swap::ArcSwap; use camino::Utf8Path; use jsonwebtoken::{ - decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, + Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode, }; use serde::{Deserialize, Serialize}; @@ -129,7 +132,9 @@ impl JwtAuth { anyhow::bail!("path is neither a directory or a file") }; if decoding_keys.is_empty() { - anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected."); + anyhow::bail!( + "Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected." 
+ ); } Ok(Self::new(decoding_keys)) } @@ -175,9 +180,10 @@ pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result #[cfg(test)] mod tests { - use super::*; use std::str::FromStr; + use super::*; + // Generated with: // // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem @@ -215,7 +221,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw"; // Check it can be validated with the public key - let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); + let auth = JwtAuth::new(vec![ + DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(), + ]); let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims; assert_eq!(claims_from_token, expected_claims); } @@ -230,7 +238,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap(); // decode it back - let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); + let auth = JwtAuth::new(vec![ + DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(), + ]); let decoded = auth.decode(&encoded).unwrap(); assert_eq!(decoded.claims, claims); diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs index e6503fe377..4a4c4eedbb 100644 --- a/libs/utils/src/backoff.rs +++ b/libs/utils/src/backoff.rs @@ -121,10 +121,12 @@ where #[cfg(test)] mod tests { - use super::*; use std::io; + use tokio::sync::Mutex; + use super::*; + #[test] fn backoff_defaults_produce_growing_backoff_sequence() { let mut current_backoff_value = None; diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 4d173d0726..2861baeee5 100644 --- 
a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -13,9 +13,11 @@ #![warn(missing_docs)] -use bincode::Options; -use serde::{de::DeserializeOwned, Serialize}; use std::io::{self, Read, Write}; + +use bincode::Options; +use serde::Serialize; +use serde::de::DeserializeOwned; use thiserror::Error; /// An error that occurred during a deserialize operation @@ -261,10 +263,12 @@ impl LeSer for T {} #[cfg(test)] mod tests { - use super::DeserializeError; - use serde::{Deserialize, Serialize}; use std::io::Cursor; + use serde::{Deserialize, Serialize}; + + use super::DeserializeError; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ShortStruct { a: u8, diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs index e1ddfd8650..46a6584d66 100644 --- a/libs/utils/src/circuit_breaker.rs +++ b/libs/utils/src/circuit_breaker.rs @@ -1,7 +1,5 @@ -use std::{ - fmt::Display, - time::{Duration, Instant}, -}; +use std::fmt::Display; +use std::time::{Duration, Instant}; use metrics::IntCounter; diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index f65c080ad4..973d754715 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -1,4 +1,5 @@ -use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; +use tokio_util::task::TaskTracker; +use tokio_util::task::task_tracker::TaskTrackerToken; /// While a reference is kept around, the associated [`Barrier::wait`] will wait. 
/// diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 5241ab183c..290a5b2686 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,9 +1,7 @@ +use std::borrow::Cow; +use std::fs::{self, File}; +use std::io::{self, Write}; use std::os::fd::AsRawFd; -use std::{ - borrow::Cow, - fs::{self, File}, - io::{self, Write}, -}; use camino::{Utf8Path, Utf8PathBuf}; diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs index a1bcec9229..2a85f54a01 100644 --- a/libs/utils/src/env.rs +++ b/libs/utils/src/env.rs @@ -1,6 +1,7 @@ //! Wrapper around `std::env::var` for parsing environment variables. -use std::{fmt::Display, str::FromStr}; +use std::fmt::Display; +use std::str::FromStr; /// For types `V` that implement [`FromStr`]. pub fn var(varname: &str) -> Option diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index fc998ad9a9..ce014eb0ac 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -127,6 +127,9 @@ pub async fn failpoint_sleep_cancellable_helper( tracing::info!("failpoint {:?}: sleep done", name); } +/// Initialize the configured failpoints +/// +/// You must call this function before any concurrent threads do operations. pub fn init() -> fail::FailScenario<'static> { // The failpoints lib provides support for parsing the `FAILPOINTS` env var. // We want non-default behavior for `exit`, though, so, we handle it separately. @@ -134,7 +137,10 @@ pub fn init() -> fail::FailScenario<'static> { // Format for FAILPOINTS is "name=actions" separated by ";". 
let actions = std::env::var("FAILPOINTS"); if actions.is_ok() { - std::env::remove_var("FAILPOINTS"); + // SAFETY: this function should before any threads start and access env vars concurrently + unsafe { + std::env::remove_var("FAILPOINTS"); + } } else { // let the library handle non-utf8, or nothing for not present } diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 8e53d2c79b..a406ab0378 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -58,9 +58,8 @@ where #[cfg(test)] mod test { - use crate::fs_ext::{is_directory_empty, list_dir}; - use super::ignore_absent_files; + use crate::fs_ext::{is_directory_empty, list_dir}; #[test] fn is_empty_dir() { diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs index 897e30d7f1..fc6f794b57 100644 --- a/libs/utils/src/fs_ext/rename_noreplace.rs +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -38,7 +38,8 @@ pub fn rename_noreplace( #[cfg(test)] mod test { - use std::{fs, path::PathBuf}; + use std::fs; + use std::path::PathBuf; use super::*; diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 44565ee6a2..b5e4a4644a 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -169,9 +169,9 @@ mod test { ]; let mut s = String::new(); - for (line, gen, expected) in examples { + for (line, gen_, expected) in examples { s.clear(); - write!(s, "{}", &gen.get_suffix()).expect("string grows"); + write!(s, "{}", &gen_.get_suffix()).expect("string grows"); assert_eq!(s, expected, "example on {line}"); } } diff --git a/libs/utils/src/guard_arc_swap.rs b/libs/utils/src/guard_arc_swap.rs index cec5202460..26cd640d3b 100644 --- a/libs/utils/src/guard_arc_swap.rs +++ b/libs/utils/src/guard_arc_swap.rs @@ -1,8 +1,9 @@ //! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes //! don't block reads. 
-use arc_swap::ArcSwap; use std::sync::Arc; + +use arc_swap::ArcSwap; use tokio::sync::TryLockError; pub struct GuardArcSwap { diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index eb91839504..6016c23a01 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -1,5 +1,6 @@ +use std::fmt; use std::num::ParseIntError; -use std::{fmt, str::FromStr}; +use std::str::FromStr; use anyhow::Context; use hex::FromHex; @@ -215,7 +216,7 @@ macro_rules! id_newtype { impl AsRef<[u8]> for $t { fn as_ref(&self) -> &[u8] { - &self.0 .0 + &self.0.0 } } @@ -367,9 +368,8 @@ impl FromStr for NodeId { mod tests { use serde_assert::{Deserializer, Serializer, Token, Tokens}; - use crate::bin_ser::BeSer; - use super::*; + use crate::bin_ser::BeSer; #[test] fn test_id_serde_non_human_readable() { diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs index 0cc58738c0..2398f92766 100644 --- a/libs/utils/src/leaky_bucket.rs +++ b/libs/utils/src/leaky_bucket.rs @@ -21,15 +21,12 @@ //! //! Another explaination can be found here: -use std::{ - sync::{ - atomic::{AtomicU64, Ordering}, - Mutex, - }, - time::Duration, -}; +use std::sync::Mutex; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Duration; -use tokio::{sync::Notify, time::Instant}; +use tokio::sync::Notify; +use tokio::time::Instant; pub struct LeakyBucketConfig { /// This is the "time cost" of a single request unit. diff --git a/libs/utils/src/linux_socket_ioctl.rs b/libs/utils/src/linux_socket_ioctl.rs index 5ae0e86af8..766529838c 100644 --- a/libs/utils/src/linux_socket_ioctl.rs +++ b/libs/utils/src/linux_socket_ioctl.rs @@ -2,21 +2,23 @@ //! //! 
-use std::{ - io, - mem::MaybeUninit, - os::{fd::RawFd, raw::c_int}, -}; +use std::io; +use std::mem::MaybeUninit; +use std::os::fd::RawFd; +use std::os::raw::c_int; use nix::libc::{FIONREAD, TIOCOUTQ}; unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result { let mut inq: MaybeUninit = MaybeUninit::uninit(); - let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr()); - if err == 0 { - Ok(inq.assume_init()) - } else { - Err(io::Error::last_os_error()) + // SAFETY: encapsulating fn is unsafe, we require `socket_fd` to be a valid file descriptor + unsafe { + let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr()); + if err == 0 { + Ok(inq.assume_init()) + } else { + Err(io::Error::last_os_error()) + } } } @@ -24,12 +26,14 @@ unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result /// /// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. pub unsafe fn inq(socket_fd: RawFd) -> io::Result { - do_ioctl(socket_fd, FIONREAD) + // SAFETY: encapsulating fn is unsafe + unsafe { do_ioctl(socket_fd, FIONREAD) } } /// # Safety /// /// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. pub unsafe fn outq(socket_fd: RawFd) -> io::Result { - do_ioctl(socket_fd, TIOCOUTQ) + // SAFETY: encapsulating fn is unsafe + unsafe { do_ioctl(socket_fd, TIOCOUTQ) } } diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 3a2ed3e830..6aeeeca021 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -6,16 +6,15 @@ //! there for potential pitfalls with lock files that are used //! to store PIDs (pidfiles). 
-use std::{ - fs, - io::{Read, Write}, - ops::Deref, - os::unix::prelude::AsRawFd, -}; +use std::fs; +use std::io::{Read, Write}; +use std::ops::Deref; +use std::os::unix::prelude::AsRawFd; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use nix::{errno::Errno::EAGAIN, fcntl}; +use nix::errno::Errno::EAGAIN; +use nix::fcntl; use crate::crashsafe; diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 95c69ac8ba..0ac8201795 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -7,7 +7,7 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; use tokio::time::Instant; -use tracing::warn; +use tracing::info; /// Logs a critical error, similarly to `tracing::error!`. This will: /// @@ -165,6 +165,7 @@ pub fn init( }; log_layer.with_filter(rust_log_env_filter()) }); + let r = r.with( TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()), ); @@ -273,7 +274,9 @@ fn log_panic_to_stderr( location: Option>, backtrace: &std::backtrace::Backtrace, ) { - eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}"); + eprintln!( + "panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}" + ); } struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>); @@ -322,44 +325,100 @@ impl std::fmt::Debug for SecretString { } } -/// Logs a periodic warning if a future is slow to complete. +/// Logs a periodic message if a future is slow to complete. /// /// This is performance-sensitive as it's used on the GetPage read path. +/// +/// TODO: consider upgrading this to a warning, but currently it fires too often. 
#[inline] -pub async fn warn_slow(name: &str, threshold: Duration, f: impl Future) -> O { - // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and - // won't fit on the stack. - let mut f = Box::pin(f); +pub async fn log_slow(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O +where + F: Future, +{ + monitor_slow_future( + threshold, + threshold, // period = threshold + f, + |MonitorSlowFutureCallback { + ready, + is_slow, + elapsed_total, + elapsed_since_last_callback: _, + }| { + if !is_slow { + return; + } + if ready { + info!( + "slow {name} completed after {:.3}s", + elapsed_total.as_secs_f64() + ); + } else { + info!( + "slow {name} still running after {:.3}s", + elapsed_total.as_secs_f64() + ); + } + }, + ) + .await +} +/// Poll future `fut` to completion, invoking callback `cb` at the given `threshold` and every +/// `period` afterwards, and also unconditionally when the future completes. +#[inline] +pub async fn monitor_slow_future( + threshold: Duration, + period: Duration, + mut fut: std::pin::Pin<&mut F>, + mut cb: impl FnMut(MonitorSlowFutureCallback), +) -> O +where + F: Future, +{ let started = Instant::now(); let mut attempt = 1; - + let mut last_cb = started; loop { // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common // case where the timeout doesn't fire. - let deadline = started + attempt * threshold; - if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await { - // NB: we check if we exceeded the threshold even if the timeout never fired, because - // scheduling or execution delays may cause the future to succeed even if it exceeds the - // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid - // false negatives. 
- let elapsed = started.elapsed(); - if elapsed >= threshold { - warn!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); - } + let deadline = started + threshold + (attempt - 1) * period; + // TODO: still call the callback if the future panics? Copy how we do it for the page_service flush_in_progress counter. + let res = tokio::time::timeout_at(deadline, &mut fut).await; + let now = Instant::now(); + let elapsed_total = now - started; + cb(MonitorSlowFutureCallback { + ready: res.is_ok(), + is_slow: elapsed_total >= threshold, + elapsed_total, + elapsed_since_last_callback: now - last_cb, + }); + last_cb = now; + if let Ok(output) = res { return output; } - - let elapsed = started.elapsed().as_secs_f64(); - warn!("slow {name} still running after {elapsed:.3}s",); - attempt += 1; } } +/// See [`monitor_slow_future`]. +pub struct MonitorSlowFutureCallback { + /// Whether the future completed. If true, there will be no more callbacks. + pub ready: bool, + /// Whether the future is taking `>=` the specififed threshold duration to complete. + /// Monotonic: if true in one callback invocation, true in all subsequent onces. + pub is_slow: bool, + /// The time elapsed since the [`monitor_slow_future`] was first polled. + pub elapsed_total: Duration, + /// The time elapsed since the last callback invocation. + /// For the initial callback invocation, the time elapsed since the [`monitor_slow_future`] was first polled. 
+ pub elapsed_since_last_callback: Duration, +} + #[cfg(test)] mod tests { - use metrics::{core::Opts, IntCounterVec}; + use metrics::IntCounterVec; + use metrics::core::Opts; use crate::logging::{TracingEventCountLayer, TracingEventCountMetric}; diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index c874fa30ff..31e1dda23d 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -1,11 +1,13 @@ #![warn(missing_docs)] -use serde::{de::Visitor, Deserialize, Serialize}; use std::fmt; use std::ops::{Add, AddAssign}; use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; +use serde::de::Visitor; +use serde::{Deserialize, Serialize}; + use crate::seqwait::MonotonicCounter; /// Transaction log block size in bytes @@ -407,11 +409,10 @@ impl rand::distributions::uniform::UniformSampler for LsnSampler { #[cfg(test)] mod tests { - use crate::bin_ser::BeSer; + use serde_assert::{Deserializer, Serializer, Token, Tokens}; use super::*; - - use serde_assert::{Deserializer, Serializer, Token, Tokens}; + use crate::bin_ser::BeSer; #[test] fn test_lsn_strings() { diff --git a/libs/utils/src/measured_stream.rs b/libs/utils/src/measured_stream.rs index c82fc13109..8e67df3158 100644 --- a/libs/utils/src/measured_stream.rs +++ b/libs/utils/src/measured_stream.rs @@ -1,7 +1,8 @@ -use pin_project_lite::pin_project; use std::io::Read; use std::pin::Pin; use std::{io, task}; + +use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; pin_project! 
{ diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index dede65e699..cffbc0b4d6 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -1,7 +1,7 @@ use std::time::{Duration, SystemTime}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use pq_proto::{read_cstr, PG_EPOCH}; +use pq_proto::{PG_EPOCH, read_cstr}; use serde::{Deserialize, Serialize}; use tracing::{trace, warn}; diff --git a/libs/utils/src/postgres_client.rs b/libs/utils/src/postgres_client.rs index a62568202b..4167839e28 100644 --- a/libs/utils/src/postgres_client.rs +++ b/libs/utils/src/postgres_client.rs @@ -3,7 +3,7 @@ //! postgres_connection crate. use anyhow::Context; -use postgres_connection::{parse_host_port, PgConnectionConfig}; +use postgres_connection::{PgConnectionConfig, parse_host_port}; use crate::id::TenantTimelineId; diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs index f3f8f219e3..945f710b1d 100644 --- a/libs/utils/src/rate_limit.rs +++ b/libs/utils/src/rate_limit.rs @@ -53,10 +53,11 @@ mod tests { #[test] fn basics() { - use super::RateLimit; use std::sync::atomic::Ordering::Relaxed; use std::time::Duration; + use super::RateLimit; + let called = AtomicUsize::new(0); let mut f = RateLimit::new(Duration::from_millis(100)); diff --git a/libs/utils/src/sentry_init.rs b/libs/utils/src/sentry_init.rs index 992cb5c671..72d192a591 100644 --- a/libs/utils/src/sentry_init.rs +++ b/libs/utils/src/sentry_init.rs @@ -1,22 +1,26 @@ -use sentry::ClientInitGuard; use std::borrow::Cow; use std::env; +use sentry::ClientInitGuard; pub use sentry::release_name; +use tracing::{error, info}; #[must_use] pub fn init_sentry( release_name: Option>, extra_options: &[(&str, &str)], ) -> Option { - let dsn = env::var("SENTRY_DSN").ok()?; + let Ok(dsn) = env::var("SENTRY_DSN") else { + info!("not initializing Sentry, no SENTRY_DSN given"); + return None; + }; let environment = 
env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into()); let guard = sentry::init(( dsn, sentry::ClientOptions { - release: release_name, - environment: Some(environment.into()), + release: release_name.clone(), + environment: Some(environment.clone().into()), ..Default::default() }, )); @@ -25,5 +29,19 @@ pub fn init_sentry( scope.set_extra(key, value.into()); } }); + + if let Some(dsn) = guard.dsn() { + info!( + "initialized Sentry for project {}, environment {}, release {} (using API {})", + dsn.project_id(), + environment, + release_name.unwrap_or(Cow::Borrowed("None")), + dsn.envelope_api_url(), + ); + } else { + // This should panic during sentry::init(), but we may as well cover it. + error!("failed to initialize Sentry, invalid DSN"); + } + Some(guard) } diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index d99dc25769..3c4c7f882d 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -5,6 +5,7 @@ use std::collections::BinaryHeap; use std::mem; use std::sync::Mutex; use std::time::Duration; + use tokio::sync::watch::{self, channel}; use tokio::time::timeout; @@ -248,11 +249,7 @@ where let internal = self.internal.lock().unwrap(); let cnt = internal.current.cnt_value(); drop(internal); - if cnt >= num { - Ok(()) - } else { - Err(cnt) - } + if cnt >= num { Ok(()) } else { Err(cnt) } } /// Register and return a channel that will be notified when a number arrives, @@ -325,9 +322,10 @@ where #[cfg(test)] mod tests { - use super::*; use std::sync::Arc; + use super::*; + impl MonotonicCounter for i32 { fn cnt_advance(&mut self, val: i32) { assert!(*self <= val); diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs index 36e874a161..ca1e7aa25b 100644 --- a/libs/utils/src/serde_percent.rs +++ b/libs/utils/src/serde_percent.rs @@ -12,11 +12,7 @@ pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8); impl Percent { pub const fn new(pct: u8) -> Option { - if pct <= 
100 { - Some(Percent(pct)) - } else { - None - } + if pct <= 100 { Some(Percent(pct)) } else { None } } pub fn get(&self) -> u8 { diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index d98284f969..c8c410a725 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -1,6 +1,7 @@ //! See `pageserver_api::shard` for description on sharding. -use std::{ops::RangeInclusive, str::FromStr}; +use std::ops::RangeInclusive; +use std::str::FromStr; use hex::FromHex; use serde::{Deserialize, Serialize}; @@ -59,11 +60,7 @@ impl ShardCount { /// This method returns the actual number of shards, i.e. if our internal value is /// zero, we return 1 (unsharded tenants have 1 shard). pub fn count(&self) -> u8 { - if self.0 > 0 { - self.0 - } else { - 1 - } + if self.0 > 0 { self.0 } else { 1 } } /// The literal internal value: this is **not** the number of shards in the diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index c37e9aea58..f2be1957c4 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -1,7 +1,7 @@ +pub use signal_hook::consts::TERM_SIGNALS; +pub use signal_hook::consts::signal::*; use signal_hook::iterator::Signals; -pub use signal_hook::consts::{signal::*, TERM_SIGNALS}; - pub enum Signal { Quit, Interrupt, diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index 6700f86e4a..fabdf9df46 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -44,8 +44,7 @@ #![warn(missing_docs)] use std::ops::Deref; -use std::sync::{Arc, Weak}; -use std::sync::{RwLock, RwLockWriteGuard}; +use std::sync::{Arc, RwLock, RwLockWriteGuard, Weak}; use tokio::sync::watch; @@ -219,10 +218,11 @@ impl RcuWaitList { #[cfg(test)] mod tests { - use super::*; use std::sync::Mutex; use std::time::Duration; + use super::*; + #[tokio::test] async fn two_writers() { let rcu = Rcu::new(1); diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 0a1ed81621..93460785bf 100644 
--- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -1,10 +1,6 @@ -use std::{ - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; /// Gates are a concurrency helper, primarily used for implementing safe shutdown. /// diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 66c2065554..8f8401b35d 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -1,7 +1,6 @@ -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Mutex, MutexGuard, -}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex, MutexGuard}; + use tokio::sync::Semaphore; /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of @@ -301,14 +300,13 @@ impl Drop for InitPermit { #[cfg(test)] mod tests { + use std::convert::Infallible; + use std::pin::{Pin, pin}; + use std::time::Duration; + use futures::Future; use super::*; - use std::{ - convert::Infallible, - pin::{pin, Pin}, - time::Duration, - }; #[tokio::test] async fn many_initializers() { diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs index 0cab291d51..7dfbf40411 100644 --- a/libs/utils/src/sync/spsc_fold.rs +++ b/libs/utils/src/sync/spsc_fold.rs @@ -1,4 +1,5 @@ -use core::{future::poll_fn, task::Poll}; +use core::future::poll_fn; +use core::task::Poll; use std::sync::{Arc, Mutex}; use diatomic_waker::DiatomicWaker; diff --git a/libs/utils/src/tcp_listener.rs b/libs/utils/src/tcp_listener.rs index 6b35d3d63a..6a4a77127d 100644 --- a/libs/utils/src/tcp_listener.rs +++ b/libs/utils/src/tcp_listener.rs @@ -1,9 +1,8 @@ -use std::{ - io, - net::{TcpListener, ToSocketAddrs}, -}; +use std::io; +use std::net::{TcpListener, ToSocketAddrs}; -use nix::sys::socket::{setsockopt, sockopt::ReuseAddr}; +use 
nix::sys::socket::setsockopt; +use nix::sys::socket::sockopt::ReuseAddr; /// Bind a [`TcpListener`] to addr with `SO_REUSEADDR` set to true. pub fn bind(addr: A) -> io::Result { diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index add2fa7920..3d15e08400 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -172,16 +172,14 @@ fn tracing_subscriber_configured() -> bool { #[cfg(test)] mod tests { + use std::collections::HashSet; + use std::fmt::{self}; + use std::hash::{Hash, Hasher}; + use tracing_subscriber::prelude::*; use super::*; - use std::{ - collections::HashSet, - fmt::{self}, - hash::{Hash, Hasher}, - }; - struct MemoryIdentity<'a>(&'a dyn Extractor); impl MemoryIdentity<'_> { diff --git a/libs/utils/src/try_rcu.rs b/libs/utils/src/try_rcu.rs index 6b53ab1316..30540c27d0 100644 --- a/libs/utils/src/try_rcu.rs +++ b/libs/utils/src/try_rcu.rs @@ -44,10 +44,12 @@ where #[cfg(test)] mod tests { - use super::*; - use arc_swap::ArcSwap; use std::sync::Arc; + use arc_swap::ArcSwap; + + use super::*; + #[test] fn test_try_rcu_success() { let swap = ArcSwap::from(Arc::new(42)); diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 1fe048c6f0..eded86af3e 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -1,4 +1,6 @@ -use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds}; +use std::alloc::Layout; +use std::cmp::Ordering; +use std::ops::RangeBounds; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum VecMapOrdering { @@ -214,7 +216,8 @@ fn extract_key(entry: &(K, V)) -> &K { #[cfg(test)] mod tests { - use std::{collections::BTreeMap, ops::Bound}; + use std::collections::BTreeMap; + use std::ops::Bound; use super::{VecMap, VecMapOrdering}; diff --git a/libs/utils/src/zstd.rs b/libs/utils/src/zstd.rs index be2dcc00f5..96c2a83951 100644 --- a/libs/utils/src/zstd.rs +++ b/libs/utils/src/zstd.rs @@ -1,19 +1,14 @@ use std::io::SeekFrom; use 
anyhow::{Context, Result}; -use async_compression::{ - tokio::{bufread::ZstdDecoder, write::ZstdEncoder}, - zstd::CParameter, - Level, -}; +use async_compression::Level; +use async_compression::tokio::bufread::ZstdDecoder; +use async_compression::tokio::write::ZstdEncoder; +use async_compression::zstd::CParameter; use camino::Utf8Path; use nix::NixPath; -use tokio::{ - fs::{File, OpenOptions}, - io::AsyncBufRead, - io::AsyncSeekExt, - io::AsyncWriteExt, -}; +use tokio::fs::{File, OpenOptions}; +use tokio::io::{AsyncBufRead, AsyncSeekExt, AsyncWriteExt}; use tokio_tar::{Archive, Builder, HeaderMode}; use walkdir::WalkDir; diff --git a/libs/utils/tests/bin_ser_test.rs b/libs/utils/tests/bin_ser_test.rs index b995b61b78..e0c8cdde00 100644 --- a/libs/utils/tests/bin_ser_test.rs +++ b/libs/utils/tests/bin_ser_test.rs @@ -1,7 +1,8 @@ +use std::io::Read; + use bytes::{Buf, BytesMut}; use hex_literal::hex; use serde::Deserialize; -use std::io::Read; use utils::bin_ser::LeSer; #[derive(Debug, PartialEq, Eq, Deserialize)] diff --git a/libs/wal_decoder/benches/bench_interpret_wal.rs b/libs/wal_decoder/benches/bench_interpret_wal.rs index 846904cf87..ed6ba4d267 100644 --- a/libs/wal_decoder/benches/bench_interpret_wal.rs +++ b/libs/wal_decoder/benches/bench_interpret_wal.rs @@ -1,23 +1,25 @@ -use anyhow::Context; -use criterion::{criterion_group, criterion_main, Criterion}; -use futures::{stream::FuturesUnordered, StreamExt}; -use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; -use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; -use pprof::criterion::{Output, PProfProfiler}; -use serde::Deserialize; -use std::{env, num::NonZeroUsize, sync::Arc}; +use std::env; +use std::num::NonZeroUsize; +use std::sync::Arc; +use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use camino_tempfile::Utf8TempDir; +use criterion::{Criterion, criterion_group, criterion_main}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use 
pageserver_api::shard::{ShardIdentity, ShardStripeSize}; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use pprof::criterion::{Output, PProfProfiler}; use remote_storage::{ DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind, S3Config, }; +use serde::Deserialize; use tokio_util::sync::CancellationToken; -use utils::{ - lsn::Lsn, - shard::{ShardCount, ShardNumber}, -}; +use utils::lsn::Lsn; +use utils::shard::{ShardCount, ShardNumber}; use wal_decoder::models::InterpretedWalRecord; const S3_BUCKET: &str = "neon-github-public-dev"; @@ -31,7 +33,7 @@ const METADATA_FILENAME: &str = "metadata.json"; static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; async fn create_s3_client() -> anyhow::Result> { diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index ebb38ceb52..cb0835e894 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -3,8 +3,6 @@ use std::collections::HashMap; -use crate::models::*; -use crate::serialized_batch::SerializedValueBatch; use bytes::{Buf, Bytes}; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -14,6 +12,9 @@ use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::walrecord::*; use utils::lsn::Lsn; +use crate::models::*; +use crate::serialized_batch::SerializedValueBatch; + impl InterpretedWalRecord { /// Decode and interpreted raw bytes which represent one Postgres WAL record. /// Data blocks which do not match any of the provided shard identities are filtered out. 
diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index d76f75f51f..b451d6d8e0 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -8,20 +8,18 @@ use std::collections::{BTreeSet, HashMap}; use bytes::{Bytes, BytesMut}; -use pageserver_api::key::rel_block_to_key; +use pageserver_api::key::{CompactKey, Key, rel_block_to_key}; use pageserver_api::keyspace::KeySpace; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIdentity; -use pageserver_api::{key::CompactKey, value::Value}; +use pageserver_api::value::Value; use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord}; -use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ}; +use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn, pg_constants}; use serde::{Deserialize, Serialize}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; -use pageserver_api::key::Key; - use crate::models::InterpretedWalRecord; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); @@ -515,10 +513,11 @@ impl SerializedValueBatch { let empty = self.raw.is_empty(); if cfg!(debug_assertions) && empty { - assert!(self - .metadata - .iter() - .all(|meta| matches!(meta, ValueMeta::Observed(_)))); + assert!( + self.metadata + .iter() + .all(|meta| matches!(meta, ValueMeta::Observed(_))) + ); } !empty diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs index 52ed5c70b5..5a28128dd8 100644 --- a/libs/wal_decoder/src/wire_format.rs +++ b/libs/wal_decoder/src/wire_format.rs @@ -7,15 +7,12 @@ use utils::lsn::Lsn; use utils::postgres_client::{Compression, InterpretedFormat}; use crate::models::{ - FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords, MetadataRecord, + FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords, MetadataRecord, proto, }; - use crate::serialized_batch::{ ObservedValueMeta, 
SerializedValueBatch, SerializedValueMeta, ValueMeta, }; -use crate::models::proto; - #[derive(Debug, thiserror::Error)] pub enum ToWireFormatError { #[error("{0}")] @@ -83,8 +80,8 @@ impl ToWireFormat for InterpretedWalRecords { format: InterpretedFormat, compression: Option, ) -> Result { - use async_compression::tokio::write::ZstdEncoder; use async_compression::Level; + use async_compression::tokio::write::ZstdEncoder; let encode_res: Result = match format { InterpretedFormat::Bincode => { diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 8d5b1ade35..530ceb1327 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -1,9 +1,11 @@ //! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h //! to generate Rust bindings for it. -use std::{env, path::PathBuf, process::Command}; +use std::env; +use std::path::PathBuf; +use std::process::Command; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; const WALPROPOSER_PG_VERSION: &str = "v17"; diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 2fbea3fe45..d660602149 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -3,27 +3,14 @@ #![allow(dead_code)] -use std::ffi::CStr; -use std::ffi::CString; +use std::ffi::{CStr, CString}; -use crate::bindings::uint32; -use crate::bindings::walproposer_api; -use crate::bindings::NeonWALReadResult; -use crate::bindings::PGAsyncReadResult; -use crate::bindings::PGAsyncWriteResult; -use crate::bindings::Safekeeper; -use crate::bindings::Size; -use crate::bindings::StringInfoData; -use crate::bindings::TimestampTz; -use crate::bindings::WalProposer; -use crate::bindings::WalProposerConnStatusType; -use crate::bindings::WalProposerConnectPollStatusType; -use crate::bindings::WalProposerExecStatusType; -use crate::bindings::WalproposerShmemState; -use crate::bindings::XLogRecPtr; -use crate::walproposer::ApiImpl; -use 
crate::walproposer::StreamingCallback; -use crate::walproposer::WaitResult; +use crate::bindings::{ + NeonWALReadResult, PGAsyncReadResult, PGAsyncWriteResult, Safekeeper, Size, StringInfoData, + TimestampTz, WalProposer, WalProposerConnStatusType, WalProposerConnectPollStatusType, + WalProposerExecStatusType, WalproposerShmemState, XLogRecPtr, uint32, walproposer_api, +}; +use crate::walproposer::{ApiImpl, StreamingCallback, WaitResult}; extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState { unsafe { diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index ba75171db2..4e50c21fca 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -2,15 +2,15 @@ use std::ffi::CString; -use crate::{ - api_bindings::{create_api, take_vec_u8, Level}, - bindings::{ - NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig, - WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, - }, -}; use postgres_ffi::WAL_SEGMENT_SIZE; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use crate::api_bindings::{Level, create_api, take_vec_u8}; +use crate::bindings::{ + NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig, + WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, +}; /// Rust high-level wrapper for C walproposer API. Many methods are not required /// for simple cases, hence todo!() in default implementations. 
@@ -215,6 +215,7 @@ impl Wrapper { syncSafekeepers: config.sync_safekeepers, systemId: 0, pgTimeline: 1, + proto_version: 3, callback_data, }; let c_config = Box::into_raw(Box::new(c_config)); @@ -274,21 +275,17 @@ impl StreamingCallback { #[cfg(test)] mod tests { use core::panic; - use std::{ - cell::Cell, - sync::{atomic::AtomicUsize, mpsc::sync_channel}, - }; + use std::cell::{Cell, UnsafeCell}; + use std::ffi::CString; + use std::sync::atomic::AtomicUsize; + use std::sync::mpsc::sync_channel; - use std::cell::UnsafeCell; use utils::id::TenantTimelineId; - use crate::{ - api_bindings::Level, - bindings::{NeonWALReadResult, PG_VERSION_NUM}, - walproposer::Wrapper, - }; - use super::ApiImpl; + use crate::api_bindings::Level; + use crate::bindings::{NeonWALReadResult, PG_VERSION_NUM}; + use crate::walproposer::Wrapper; #[derive(Clone, Copy, Debug)] struct WaitEventsData { @@ -496,57 +493,64 @@ mod tests { // Messages definitions are at walproposer.h // xxx: it would be better to extract them from safekeeper crate and // use serialization/deserialization here. 
- let greeting_tag = (b'g' as u64).to_ne_bytes(); - let proto_version = 2_u32.to_ne_bytes(); - let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes(); - let proposer_id = [0; 16]; - let system_id = 0_u64.to_ne_bytes(); - let tenant_id = ttid.tenant_id.as_arr(); - let timeline_id = ttid.timeline_id.as_arr(); - let pg_tli = 1_u32.to_ne_bytes(); - let wal_seg_size = 16777216_u32.to_ne_bytes(); + let greeting_tag = (b'g').to_be_bytes(); + let tenant_id = CString::new(ttid.tenant_id.to_string()) + .unwrap() + .into_bytes_with_nul(); + let timeline_id = CString::new(ttid.timeline_id.to_string()) + .unwrap() + .into_bytes_with_nul(); + let mconf_gen = 0_u32.to_be_bytes(); + let mconf_members_len = 0_u32.to_be_bytes(); + let mconf_members_new_len = 0_u32.to_be_bytes(); + let pg_version: [u8; 4] = PG_VERSION_NUM.to_be_bytes(); + let system_id = 0_u64.to_be_bytes(); + let wal_seg_size = 16777216_u32.to_be_bytes(); + let proposer_greeting = [ greeting_tag.as_slice(), - proto_version.as_slice(), - pg_version.as_slice(), - proposer_id.as_slice(), - system_id.as_slice(), tenant_id.as_slice(), timeline_id.as_slice(), - pg_tli.as_slice(), + mconf_gen.as_slice(), + mconf_members_len.as_slice(), + mconf_members_new_len.as_slice(), + pg_version.as_slice(), + system_id.as_slice(), wal_seg_size.as_slice(), ] .concat(); - let voting_tag = (b'v' as u64).to_ne_bytes(); - let vote_request_term = 3_u64.to_ne_bytes(); - let proposer_id = [0; 16]; + let voting_tag = (b'v').to_be_bytes(); + let vote_request_term = 3_u64.to_be_bytes(); let vote_request = [ voting_tag.as_slice(), + mconf_gen.as_slice(), vote_request_term.as_slice(), - proposer_id.as_slice(), ] .concat(); - let acceptor_greeting_term = 2_u64.to_ne_bytes(); - let acceptor_greeting_node_id = 1_u64.to_ne_bytes(); + let acceptor_greeting_term = 2_u64.to_be_bytes(); + let acceptor_greeting_node_id = 1_u64.to_be_bytes(); let acceptor_greeting = [ greeting_tag.as_slice(), - acceptor_greeting_term.as_slice(), 
acceptor_greeting_node_id.as_slice(), + mconf_gen.as_slice(), + mconf_members_len.as_slice(), + mconf_members_new_len.as_slice(), + acceptor_greeting_term.as_slice(), ] .concat(); - let vote_response_term = 3_u64.to_ne_bytes(); - let vote_given = 1_u64.to_ne_bytes(); - let flush_lsn = 0x539_u64.to_ne_bytes(); - let truncate_lsn = 0x539_u64.to_ne_bytes(); - let th_len = 1_u32.to_ne_bytes(); - let th_term = 2_u64.to_ne_bytes(); - let th_lsn = 0x539_u64.to_ne_bytes(); - let timeline_start_lsn = 0x539_u64.to_ne_bytes(); + let vote_response_term = 3_u64.to_be_bytes(); + let vote_given = 1_u8.to_be_bytes(); + let flush_lsn = 0x539_u64.to_be_bytes(); + let truncate_lsn = 0x539_u64.to_be_bytes(); + let th_len = 1_u32.to_be_bytes(); + let th_term = 2_u64.to_be_bytes(); + let th_lsn = 0x539_u64.to_be_bytes(); let vote_response = [ voting_tag.as_slice(), + mconf_gen.as_slice(), vote_response_term.as_slice(), vote_given.as_slice(), flush_lsn.as_slice(), @@ -554,7 +558,6 @@ mod tests { th_len.as_slice(), th_term.as_slice(), th_lsn.as_slice(), - timeline_start_lsn.as_slice(), ] .concat(); diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9d4463d595..56d97bf8a9 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "pageserver" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] @@ -48,6 +48,7 @@ pprof.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true +rustls.workspace = true scopeguard.workspace = true send-future.workspace = true serde.workspace = true @@ -62,10 +63,12 @@ tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util" tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-rustls.workspace = true tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } 
tracing.workspace = true +tracing-utils.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true @@ -98,6 +101,7 @@ criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } indoc.workspace = true +uuid.workspace = true [[bench]] name = "bench_layer_map" @@ -115,6 +119,10 @@ harness = false name = "upload_queue" harness = false +[[bench]] +name = "bench_metrics" +harness = false + [[bin]] name = "test_helper_slow_client_reads" required-features = [ "testing" ] diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index b67a9cc479..b1103948d6 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -1,22 +1,20 @@ -use std::{env, num::NonZeroUsize}; +use std::env; +use std::num::NonZeroUsize; use bytes::Bytes; use camino::Utf8PathBuf; -use criterion::{criterion_group, criterion_main, Criterion}; -use pageserver::{ - config::PageServerConf, - context::{DownloadBehavior, RequestContext}, - l0_flush::{L0FlushConfig, L0FlushGlobalState}, - page_cache, - task_mgr::TaskKind, - tenant::storage_layer::InMemoryLayer, - virtual_file, -}; -use pageserver_api::{key::Key, shard::TenantShardId, value::Value}; -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, -}; +use criterion::{Criterion, criterion_group, criterion_main}; +use pageserver::config::PageServerConf; +use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState}; +use pageserver::task_mgr::TaskKind; +use pageserver::tenant::storage_layer::InMemoryLayer; +use pageserver::{page_cache, virtual_file}; +use pageserver_api::key::Key; +use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; use wal_decoder::serialized_batch::SerializedValueBatch; // A very cheap hash 
for generating non-sequential keys. diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5c5b52db44..e1444778b8 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,23 +1,20 @@ -use criterion::measurement::WallTime; -use pageserver::keyspace::{KeyPartitioning, KeySpace}; -use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::storage_layer::PersistentLayerDesc; -use pageserver_api::key::Key; -use pageserver_api::shard::TenantShardId; -use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::str::FromStr; use std::time::Instant; + +use criterion::measurement::WallTime; +use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main}; +use pageserver::tenant::layer_map::LayerMap; +use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc}; +use pageserver_api::key::Key; +use pageserver_api::shard::TenantShardId; +use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use utils::id::{TenantId, TimelineId}; - use utils::lsn::Lsn; -use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion}; - fn fixture_path(relative: &str) -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) } @@ -74,41 +71,6 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { .collect() } -// Construct a partitioning for testing get_difficulty map when we -// don't have an exact result of `collect_keyspace` to work with. -fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { - let mut parts = Vec::new(); - - // We add a partition boundary at the start of each image layer, - // no matter what lsn range it covers. This is just the easiest - // thing to do. 
A better thing to do would be to get a real - // partitioning from some database. Even better, remove the need - // for key partitions by deciding where to create image layers - // directly based on a coverage-based difficulty map. - let mut keys: Vec<_> = layer_map - .iter_historic_layers() - .filter_map(|l| { - if l.is_incremental() { - None - } else { - let kr = l.get_key_range(); - Some(kr.start.next()) - } - }) - .collect(); - keys.sort(); - - let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap(); - for key in keys { - parts.push(KeySpace { - ranges: vec![current_key..key], - }); - current_key = key; - } - - KeyPartitioning { parts } -} - // Benchmark using metadata extracted from our performance test environment, from // a project where we have run pgbench many timmes. The pgbench database was initialized // between each test run. @@ -150,41 +112,6 @@ fn bench_from_real_project(c: &mut Criterion) { // Choose uniformly distributed queries let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); - // Choose inputs for get_difficulty_map - let latest_lsn = layer_map - .iter_historic_layers() - .map(|l| l.get_lsn_range().end) - .max() - .unwrap(); - let partitioning = uniform_key_partitioning(&layer_map, latest_lsn); - - // Check correctness of get_difficulty_map - // TODO put this in a dedicated test outside of this mod - { - println!("running correctness check"); - - let now = Instant::now(); - let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning); - assert!(result_bruteforce.len() == partitioning.parts.len()); - println!("Finished bruteforce in {:?}", now.elapsed()); - - let now = Instant::now(); - let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None); - assert!(result_fast.len() == partitioning.parts.len()); - println!("Finished fast in {:?}", now.elapsed()); - - // Assert results are equal. Manually iterate for easier debugging. 
- let zip = std::iter::zip( - &partitioning.parts, - std::iter::zip(result_bruteforce, result_fast), - ); - for (_part, (bruteforce, fast)) in zip { - assert_eq!(bruteforce, fast); - } - - println!("No issues found"); - } - // Define and name the benchmark function let mut group = c.benchmark_group("real_map"); group.bench_function("uniform_queries", |b| { @@ -194,11 +121,6 @@ fn bench_from_real_project(c: &mut Criterion) { } }); }); - group.bench_function("get_difficulty_map", |b| { - b.iter(|| { - layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3)); - }); - }); group.finish(); } diff --git a/pageserver/benches/bench_metrics.rs b/pageserver/benches/bench_metrics.rs new file mode 100644 index 0000000000..38025124e1 --- /dev/null +++ b/pageserver/benches/bench_metrics.rs @@ -0,0 +1,366 @@ +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use utils::id::{TenantId, TimelineId}; + +// +// Demonstrates that repeat label values lookup is a multicore scalability bottleneck +// that is worth avoiding. 
+// +criterion_group!( + label_values, + label_values::bench_naive_usage, + label_values::bench_cache_label_values_lookup +); +mod label_values { + use super::*; + + pub fn bench_naive_usage(c: &mut Criterion) { + let mut g = c.benchmark_group("label_values__naive_usage"); + + for ntimelines in [1, 4, 8] { + g.bench_with_input( + BenchmarkId::new("ntimelines", ntimelines), + &ntimelines, + |b, ntimelines| { + b.iter_custom(|iters| { + let barrier = std::sync::Barrier::new(*ntimelines + 1); + + let timelines = (0..*ntimelines) + .map(|_| { + ( + TenantId::generate().to_string(), + "0000".to_string(), + TimelineId::generate().to_string(), + ) + }) + .collect::>(); + + let metric_vec = metrics::UIntGaugeVec::new( + metrics::opts!("testmetric", "testhelp"), + &["tenant_id", "shard_id", "timeline_id"], + ) + .unwrap(); + + std::thread::scope(|s| { + for (tenant_id, shard_id, timeline_id) in &timelines { + s.spawn(|| { + barrier.wait(); + for _ in 0..iters { + metric_vec + .with_label_values(&[tenant_id, shard_id, timeline_id]) + .inc(); + } + barrier.wait(); + }); + } + barrier.wait(); + let start = std::time::Instant::now(); + barrier.wait(); + start.elapsed() + }) + }) + }, + ); + } + g.finish(); + } + + pub fn bench_cache_label_values_lookup(c: &mut Criterion) { + let mut g = c.benchmark_group("label_values__cache_label_values_lookup"); + + for ntimelines in [1, 4, 8] { + g.bench_with_input( + BenchmarkId::new("ntimelines", ntimelines), + &ntimelines, + |b, ntimelines| { + b.iter_custom(|iters| { + let barrier = std::sync::Barrier::new(*ntimelines + 1); + + let timelines = (0..*ntimelines) + .map(|_| { + ( + TenantId::generate().to_string(), + "0000".to_string(), + TimelineId::generate().to_string(), + ) + }) + .collect::>(); + + let metric_vec = metrics::UIntGaugeVec::new( + metrics::opts!("testmetric", "testhelp"), + &["tenant_id", "shard_id", "timeline_id"], + ) + .unwrap(); + + std::thread::scope(|s| { + for (tenant_id, shard_id, timeline_id) in &timelines { + 
s.spawn(|| { + let metric = metric_vec.with_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); + barrier.wait(); + for _ in 0..iters { + metric.inc(); + } + barrier.wait(); + }); + } + barrier.wait(); + let start = std::time::Instant::now(); + barrier.wait(); + start.elapsed() + }) + }) + }, + ); + } + g.finish(); + } +} + +// +// Demonstrates that even a single metric can be a scalability bottleneck +// if multiple threads in it concurrently but there's nothing we can do +// about it without changing the metrics framework to use e.g. sharded counte atomics. +// +criterion_group!( + single_metric_multicore_scalability, + single_metric_multicore_scalability::bench, +); +mod single_metric_multicore_scalability { + use super::*; + + pub fn bench(c: &mut Criterion) { + let mut g = c.benchmark_group("single_metric_multicore_scalability"); + + for nthreads in [1, 4, 8] { + g.bench_with_input( + BenchmarkId::new("nthreads", nthreads), + &nthreads, + |b, nthreads| { + b.iter_custom(|iters| { + let barrier = std::sync::Barrier::new(*nthreads + 1); + + let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap(); + + std::thread::scope(|s| { + for _ in 0..*nthreads { + s.spawn(|| { + barrier.wait(); + for _ in 0..iters { + metric.inc(); + } + barrier.wait(); + }); + } + barrier.wait(); + let start = std::time::Instant::now(); + barrier.wait(); + start.elapsed() + }) + }) + }, + ); + } + g.finish(); + } +} + +// +// Demonstrates that even if we cache label value, the propagation of such a cached metric value +// by Clone'ing it is a scalability bottleneck. +// The reason is that it's an Arc internally and thus there's contention on the reference count atomics. +// +// We can avoid that by having long-lived references per thread (= indirection). 
+// +criterion_group!( + propagation_of_cached_label_value, + propagation_of_cached_label_value::bench_naive, + propagation_of_cached_label_value::bench_long_lived_reference_per_thread, +); +mod propagation_of_cached_label_value { + use std::sync::Arc; + + use super::*; + + pub fn bench_naive(c: &mut Criterion) { + let mut g = c.benchmark_group("propagation_of_cached_label_value__naive"); + + for nthreads in [1, 4, 8] { + g.bench_with_input( + BenchmarkId::new("nthreads", nthreads), + &nthreads, + |b, nthreads| { + b.iter_custom(|iters| { + let barrier = std::sync::Barrier::new(*nthreads + 1); + + let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap(); + + std::thread::scope(|s| { + for _ in 0..*nthreads { + s.spawn(|| { + barrier.wait(); + for _ in 0..iters { + // propagating the metric means we'd clone it into the child RequestContext + let propagated = metric.clone(); + // simulate some work + criterion::black_box(propagated); + } + barrier.wait(); + }); + } + barrier.wait(); + let start = std::time::Instant::now(); + barrier.wait(); + start.elapsed() + }) + }) + }, + ); + } + g.finish(); + } + + pub fn bench_long_lived_reference_per_thread(c: &mut Criterion) { + let mut g = + c.benchmark_group("propagation_of_cached_label_value__long_lived_reference_per_thread"); + + for nthreads in [1, 4, 8] { + g.bench_with_input( + BenchmarkId::new("nthreads", nthreads), + &nthreads, + |b, nthreads| { + b.iter_custom(|iters| { + let barrier = std::sync::Barrier::new(*nthreads + 1); + + let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap(); + + std::thread::scope(|s| { + for _ in 0..*nthreads { + s.spawn(|| { + // This is the technique. + let this_threads_metric_reference = Arc::new(metric.clone()); + + barrier.wait(); + for _ in 0..iters { + // propagating the metric means we'd clone it into the child RequestContext + let propagated = Arc::clone(&this_threads_metric_reference); + // simulate some work (include the pointer chase!) 
+ criterion::black_box(&*propagated); + } + barrier.wait(); + }); + } + barrier.wait(); + let start = std::time::Instant::now(); + barrier.wait(); + start.elapsed() + }) + }) + }, + ); + } + } +} + +criterion_main!( + label_values, + single_metric_multicore_scalability, + propagation_of_cached_label_value +); + +/* +RUST_BACKTRACE=full cargo bench --bench bench_metrics -- --discard-baseline --noplot + +Results on an im4gn.2xlarge instance + +label_values__naive_usage/ntimelines/1 time: [178.71 ns 178.74 ns 178.76 ns] +label_values__naive_usage/ntimelines/4 time: [532.94 ns 539.59 ns 546.31 ns] +label_values__naive_usage/ntimelines/8 time: [1.1082 µs 1.1109 µs 1.1135 µs] +label_values__cache_label_values_lookup/ntimelines/1 time: [6.4116 ns 6.4119 ns 6.4123 ns] +label_values__cache_label_values_lookup/ntimelines/4 time: [6.3482 ns 6.3819 ns 6.4079 ns] +label_values__cache_label_values_lookup/ntimelines/8 time: [6.4213 ns 6.5279 ns 6.6293 ns] +single_metric_multicore_scalability/nthreads/1 time: [6.0102 ns 6.0104 ns 6.0106 ns] +single_metric_multicore_scalability/nthreads/4 time: [38.127 ns 38.275 ns 38.416 ns] +single_metric_multicore_scalability/nthreads/8 time: [73.698 ns 74.882 ns 75.864 ns] +propagation_of_cached_label_value__naive/nthreads/1 time: [14.424 ns 14.425 ns 14.426 ns] +propagation_of_cached_label_value__naive/nthreads/4 time: [100.71 ns 102.53 ns 104.35 ns] +propagation_of_cached_label_value__naive/nthreads/8 time: [211.50 ns 214.44 ns 216.87 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.135 ns 14.147 ns 14.160 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.243 ns 14.255 ns 14.268 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [14.470 ns 14.682 ns 14.895 ns] + +Results on an i3en.3xlarge instance + +label_values__naive_usage/ntimelines/1 time: [117.32 ns 117.53 ns 117.74 ns] +label_values__naive_usage/ntimelines/4 
time: [736.58 ns 741.12 ns 745.61 ns] +label_values__naive_usage/ntimelines/8 time: [1.4513 µs 1.4596 µs 1.4665 µs] +label_values__cache_label_values_lookup/ntimelines/1 time: [8.0964 ns 8.0979 ns 8.0995 ns] +label_values__cache_label_values_lookup/ntimelines/4 time: [8.1620 ns 8.2912 ns 8.4491 ns] +label_values__cache_label_values_lookup/ntimelines/8 time: [14.148 ns 14.237 ns 14.324 ns] +single_metric_multicore_scalability/nthreads/1 time: [8.0993 ns 8.1013 ns 8.1046 ns] +single_metric_multicore_scalability/nthreads/4 time: [80.039 ns 80.672 ns 81.297 ns] +single_metric_multicore_scalability/nthreads/8 time: [153.58 ns 154.23 ns 154.90 ns] +propagation_of_cached_label_value__naive/nthreads/1 time: [13.924 ns 13.926 ns 13.928 ns] +propagation_of_cached_label_value__naive/nthreads/4 time: [143.66 ns 145.27 ns 146.59 ns] +propagation_of_cached_label_value__naive/nthreads/8 time: [296.51 ns 297.90 ns 299.30 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.013 ns 14.149 ns 14.308 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.311 ns 14.625 ns 14.984 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [25.981 ns 26.227 ns 26.476 ns] + +Results on an Standard L16s v3 (16 vcpus, 128 GiB memory) Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz + +label_values__naive_usage/ntimelines/1 time: [101.63 ns 101.84 ns 102.06 ns] +label_values__naive_usage/ntimelines/4 time: [417.55 ns 424.73 ns 432.63 ns] +label_values__naive_usage/ntimelines/8 time: [874.91 ns 889.51 ns 904.25 ns] +label_values__cache_label_values_lookup/ntimelines/1 time: [5.7724 ns 5.7760 ns 5.7804 ns] +label_values__cache_label_values_lookup/ntimelines/4 time: [7.8878 ns 7.9401 ns 8.0034 ns] +label_values__cache_label_values_lookup/ntimelines/8 time: [7.2621 ns 7.6354 ns 8.0337 ns] +single_metric_multicore_scalability/nthreads/1 time: [5.7710 ns 5.7744 ns 5.7785 ns] 
+single_metric_multicore_scalability/nthreads/4 time: [66.629 ns 66.994 ns 67.336 ns] +single_metric_multicore_scalability/nthreads/8 time: [130.85 ns 131.98 ns 132.91 ns] +propagation_of_cached_label_value__naive/nthreads/1 time: [11.540 ns 11.546 ns 11.553 ns] +propagation_of_cached_label_value__naive/nthreads/4 time: [131.22 ns 131.90 ns 132.56 ns] +propagation_of_cached_label_value__naive/nthreads/8 time: [260.99 ns 262.75 ns 264.26 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [11.544 ns 11.550 ns 11.557 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [11.568 ns 11.642 ns 11.763 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [13.416 ns 14.121 ns 14.886 ns + +Results on an M4 MAX MacBook Pro Total Number of Cores: 14 (10 performance and 4 efficiency) + +label_values__naive_usage/ntimelines/1 time: [52.711 ns 53.026 ns 53.381 ns] +label_values__naive_usage/ntimelines/4 time: [323.99 ns 330.40 ns 337.53 ns] +label_values__naive_usage/ntimelines/8 time: [1.1615 µs 1.1998 µs 1.2399 µs] +label_values__cache_label_values_lookup/ntimelines/1 time: [1.6635 ns 1.6715 ns 1.6809 ns] +label_values__cache_label_values_lookup/ntimelines/4 time: [1.7786 ns 1.7876 ns 1.8028 ns] +label_values__cache_label_values_lookup/ntimelines/8 time: [1.8195 ns 1.8371 ns 1.8665 ns] +single_metric_multicore_scalability/nthreads/1 time: [1.7764 ns 1.7909 ns 1.8079 ns] +single_metric_multicore_scalability/nthreads/4 time: [33.875 ns 34.868 ns 35.923 ns] +single_metric_multicore_scalability/nthreads/8 time: [226.85 ns 235.30 ns 244.18 ns] +propagation_of_cached_label_value__naive/nthreads/1 time: [3.4337 ns 3.4491 ns 3.4660 ns] +propagation_of_cached_label_value__naive/nthreads/4 time: [69.486 ns 71.937 ns 74.472 ns] +propagation_of_cached_label_value__naive/nthreads/8 time: [434.87 ns 456.47 ns 477.84 ns] 
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [3.3767 ns 3.3974 ns 3.4220 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [3.6105 ns 4.2355 ns 5.1463 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [4.0889 ns 4.9714 ns 6.0779 ns] + +Results on a Hetzner AX102 AMD Ryzen 9 7950X3D 16-Core Processor + +label_values__naive_usage/ntimelines/1 time: [64.510 ns 64.559 ns 64.610 ns] +label_values__naive_usage/ntimelines/4 time: [309.71 ns 326.09 ns 342.32 ns] +label_values__naive_usage/ntimelines/8 time: [776.92 ns 819.35 ns 856.93 ns] +label_values__cache_label_values_lookup/ntimelines/1 time: [1.2855 ns 1.2943 ns 1.3021 ns] +label_values__cache_label_values_lookup/ntimelines/4 time: [1.3865 ns 1.4139 ns 1.4441 ns] +label_values__cache_label_values_lookup/ntimelines/8 time: [1.5311 ns 1.5669 ns 1.6046 ns] +single_metric_multicore_scalability/nthreads/1 time: [1.1927 ns 1.1981 ns 1.2049 ns] +single_metric_multicore_scalability/nthreads/4 time: [24.346 ns 25.439 ns 26.634 ns] +single_metric_multicore_scalability/nthreads/8 time: [58.666 ns 60.137 ns 61.486 ns] +propagation_of_cached_label_value__naive/nthreads/1 time: [2.7067 ns 2.7238 ns 2.7402 ns] +propagation_of_cached_label_value__naive/nthreads/4 time: [62.723 ns 66.214 ns 69.787 ns] +propagation_of_cached_label_value__naive/nthreads/8 time: [164.24 ns 170.10 ns 175.68 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [2.2915 ns 2.2960 ns 2.3012 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [2.5726 ns 2.6158 ns 2.6624 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [2.7068 ns 2.8243 ns 2.9824 ns] + +*/ diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index d3551b56e1..77b3f90b3e 100644 --- a/pageserver/benches/bench_walredo.rs +++ 
b/pageserver/benches/bench_walredo.rs @@ -56,20 +56,23 @@ //! medium/128 time: [10.412 ms 10.574 ms 10.718 ms] //! ``` +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; -use pageserver::{config::PageServerConf, walredo::PostgresRedoManager}; +use pageserver::config::PageServerConf; +use pageserver::walredo::PostgresRedoManager; +use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; -use pageserver_api::{key::Key, shard::TenantShardId}; -use std::{ - future::Future, - sync::Arc, - time::{Duration, Instant}, -}; -use tokio::{sync::Barrier, task::JoinSet}; -use utils::{id::TenantId, lsn::Lsn}; +use pageserver_api::shard::TenantShardId; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use utils::id::TenantId; +use utils::lsn::Lsn; fn bench(c: &mut Criterion) { macro_rules! bench_group { diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs index ed5daa8ae1..8de06a6c25 100644 --- a/pageserver/benches/upload_queue.rs +++ b/pageserver/benches/upload_queue.rs @@ -1,15 +1,15 @@ //! Upload queue benchmarks. 
use std::str::FromStr as _; -use std::sync::atomic::AtomicU32; use std::sync::Arc; +use std::sync::atomic::AtomicU32; -use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use criterion::{Bencher, Criterion, criterion_group, criterion_main}; +use pageserver::tenant::IndexPart; use pageserver::tenant::metadata::TimelineMetadata; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask}; -use pageserver::tenant::IndexPart; use pprof::criterion::{Output, PProfProfiler}; use utils::generation::Generation; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index bb0f64ca32..508dac231e 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,17 +1,15 @@ -use std::{collections::HashMap, error::Error as _}; +use std::collections::HashMap; +use std::error::Error as _; use bytes::Bytes; -use reqwest::{IntoUrl, Method, StatusCode}; - use detach_ancestor::AncestorDetached; use http_utils::error::HttpErrorBody; -use pageserver_api::{models::*, shard::TenantShardId}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - +use pageserver_api::models::*; +use pageserver_api::shard::TenantShardId; pub use reqwest::Body as ReqwestBody; +use reqwest::{Certificate, IntoUrl, Method, StatusCode, Url}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::BlockUnblock; @@ -40,6 +38,9 @@ pub enum Error { #[error("Cancelled")] Cancelled, + + #[error("create client: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] + CreateClient(reqwest::Error), } pub type Result = std::result::Result; @@ -71,8 +72,17 @@ pub enum ForceAwaitLogicalSize { } impl Client { - pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { - 
Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt) + pub fn new( + mgmt_api_endpoint: String, + jwt: Option<&str>, + ssl_ca_cert: Option, + ) -> Result { + let mut http_client = reqwest::Client::builder(); + if let Some(ssl_ca_cert) = ssl_ca_cert { + http_client = http_client.add_root_certificate(ssl_ca_cert); + } + let http_client = http_client.build().map_err(Error::CreateClient)?; + Ok(Self::from_client(http_client, mgmt_api_endpoint, jwt)) } pub fn from_client( @@ -103,12 +113,10 @@ impl Client { debug_assert!(path.starts_with('/')); let uri = format!("{}{}", self.mgmt_api_endpoint, path); - let req = self.client.request(Method::GET, uri); - let req = if let Some(value) = &self.authorization_header { - req.header(reqwest::header::AUTHORIZATION, value) - } else { - req - }; + let mut req = self.client.request(Method::GET, uri); + if let Some(value) = &self.authorization_header { + req = req.header(reqwest::header::AUTHORIZATION, value); + } req.send().await.map_err(Error::ReceiveBody) } @@ -450,13 +458,21 @@ impl Client { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + behavior: Option, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor", self.mgmt_api_endpoint ); + let mut uri = Url::parse(&uri) + .map_err(|e| Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")))?; - self.request(Method::PUT, &uri, ()) + if let Some(behavior) = behavior { + uri.query_pairs_mut() + .append_pair("detach_behavior", &behavior.to_string()); + } + + self.request(Method::PUT, uri, ()) .await? 
.json() .await @@ -482,6 +498,7 @@ impl Client { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<()> { let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers", @@ -489,6 +506,9 @@ impl Client { )) .expect("Cannot build URL"); + path.query_pairs_mut() + .append_pair("recurse", &format!("{}", recurse)); + if let Some(concurrency) = concurrency { path.query_pairs_mut() .append_pair("concurrency", &format!("{}", concurrency)); diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 47da83b0eb..ef35ac2f48 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -1,23 +1,16 @@ use std::sync::{Arc, Mutex}; -use futures::{ - stream::{SplitSink, SplitStream}, - SinkExt, StreamExt, -}; -use pageserver_api::{ - models::{ - PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, - PagestreamGetPageResponse, - }, - reltag::RelTag, +use futures::stream::{SplitSink, SplitStream}; +use futures::{SinkExt, StreamExt}; +use pageserver_api::models::{ + PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, }; +use pageserver_api::reltag::RelTag; use tokio::task::JoinHandle; use tokio_postgres::CopyOutStream; use tokio_util::sync::CancellationToken; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; pub struct Client { client: tokio_postgres::Client, diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs index c308694ae1..dd35417333 100644 --- a/pageserver/compaction/src/bin/compaction-simulator.rs +++ b/pageserver/compaction/src/bin/compaction-simulator.rs @@ -1,11 +1,11 @@ -use clap::{Parser, Subcommand}; -use pageserver_compaction::helpers::PAGE_SZ; -use pageserver_compaction::simulator::MockTimeline; -use 
rand::Rng; use std::io::Write; use std::path::{Path, PathBuf}; use std::sync::OnceLock; +use clap::{Parser, Subcommand}; +use pageserver_compaction::helpers::PAGE_SZ; +use pageserver_compaction::simulator::MockTimeline; +use rand::Rng; use utils::project_git_version; project_git_version!(GIT_VERSION); @@ -157,8 +157,9 @@ async fn run_suite() -> anyhow::Result<()> { use std::fs::File; use std::io::Stdout; use std::sync::Mutex; -use tracing_subscriber::fmt::writer::EitherWriter; + use tracing_subscriber::fmt::MakeWriter; +use tracing_subscriber::fmt::writer::EitherWriter; static LOG_FILE: OnceLock>> = OnceLock::new(); fn get_log_output() -> &'static Mutex> { diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 7779ffaf8b..75f43d7ff7 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -17,20 +17,19 @@ //! distance of image layers in LSN dimension is roughly equal to the logical //! database size. For example, if the logical database size is 10 GB, we would //! generate new image layers every 10 GB of WAL. -use futures::StreamExt; -use pageserver_api::shard::ShardIdentity; -use tracing::{debug, info}; - use std::collections::{HashSet, VecDeque}; use std::ops::Range; -use crate::helpers::{ - accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ, -}; -use crate::interface::*; +use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; +use tracing::{debug, info}; use utils::lsn::Lsn; +use crate::helpers::{ + PAGE_SZ, accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, +}; use crate::identify_levels::identify_level; +use crate::interface::*; /// Main entry point to compaction. /// @@ -307,7 +306,7 @@ where let mut layer_ids: Vec = Vec::new(); for layer_id in &job.input_layers { let layer = &self.layers[layer_id.0].layer; - if let Some(dl) = self.executor.downcast_delta_layer(layer).await? 
{ + if let Some(dl) = self.executor.downcast_delta_layer(layer, ctx).await? { deltas.push(dl.clone()); layer_ids.push(*layer_id); } @@ -536,15 +535,16 @@ where let mut deltas: Vec = Vec::new(); for layer_id in &job.input_layers { let l = &self.layers[layer_id.0]; - if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? { + if let Some(dl) = self.executor.downcast_delta_layer(&l.layer, ctx).await? { deltas.push(dl.clone()); } } // Open stream - let key_value_stream = - std::pin::pin!(merge_delta_keys_buffered::(deltas.as_slice(), ctx) + let key_value_stream = std::pin::pin!( + merge_delta_keys_buffered::(deltas.as_slice(), ctx) .await? - .map(Result::<_, anyhow::Error>::Ok)); + .map(Result::<_, anyhow::Error>::Ok) + ); let mut new_jobs = Vec::new(); // Slide a window through the keyspace diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 6b739d85a7..421802eef3 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -1,21 +1,21 @@ //! This file contains generic utility functions over the interface types, //! which could be handy for any compaction implementation. -use crate::interface::*; +use std::collections::{BinaryHeap, VecDeque}; +use std::fmt::Display; +use std::future::Future; +use std::ops::{DerefMut, Range}; +use std::pin::Pin; +use std::task::{Poll, ready}; use futures::future::BoxFuture; use futures::{Stream, StreamExt}; use itertools::Itertools; use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; -use std::collections::BinaryHeap; -use std::collections::VecDeque; -use std::fmt::Display; -use std::future::Future; -use std::ops::{DerefMut, Range}; -use std::pin::Pin; -use std::task::{ready, Poll}; use utils::lsn::Lsn; +use crate::interface::*; + pub const PAGE_SZ: u64 = 8192; pub fn keyspace_total_size( @@ -221,12 +221,12 @@ where // performed implicitly when `top` is dropped). 
if let Some(mut top) = this.heap.peek_mut() { match top.deref_mut() { - LazyLoadLayer::Unloaded(ref mut l) => { + LazyLoadLayer::Unloaded(l) => { let fut = l.load_keys(this.ctx); this.load_future.set(Some(Box::pin(fut))); continue; } - LazyLoadLayer::Loaded(ref mut entries) => { + LazyLoadLayer::Loaded(entries) => { let result = entries.pop_front().unwrap(); if entries.is_empty() { std::collections::binary_heap::PeekMut::pop(top); diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index e04bd15396..61575e3992 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -26,15 +26,15 @@ //! file size, the file will still be considered to be part of L0 at the next //! iteration. -use anyhow::bail; use std::collections::BTreeSet; use std::ops::Range; + +use anyhow::bail; +use tracing::{info, trace}; use utils::lsn::Lsn; use crate::interface::*; -use tracing::{info, trace}; - pub struct Level { pub lsn_range: Range, pub layers: Vec, @@ -60,7 +60,11 @@ where if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn { // shouldn't happen. Indicates that the caller passed a bogus // end_lsn. - bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id()); + bail!( + "identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", + end_lsn, + l.short_id() + ); } // include image layers sitting exacty at `end_lsn`. 
let is_image = !l.is_delta(); @@ -246,9 +250,10 @@ impl Level { #[cfg(test)] mod tests { + use std::sync::{Arc, Mutex}; + use super::*; use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer}; - use std::sync::{Arc, Mutex}; fn delta(key_range: Range, lsn_range: Range) -> MockLayer { MockLayer::Delta(Arc::new(MockDeltaLayer { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 8ed393a645..63fbc565cc 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -3,9 +3,12 @@ //! //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. -use futures::Future; -use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity}; use std::ops::Range; + +use futures::Future; +use pageserver_api::key::Key; +use pageserver_api::keyspace::ShardedRange; +use pageserver_api::shard::ShardIdentity; use utils::lsn::Lsn; /// Public interface. 
This is the main thing that the implementor needs to provide @@ -55,6 +58,7 @@ pub trait CompactionJobExecutor { fn downcast_delta_layer( &self, layer: &Self::Layer, + ctx: &Self::RequestContext, ) -> impl Future>> + Send; // ---- diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 673b80c313..bf9f6f2658 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -1,22 +1,17 @@ mod draw; -use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; +use std::fmt::Write; +use std::ops::Range; +use std::sync::{Arc, Mutex}; +use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; use futures::StreamExt; use pageserver_api::shard::ShardIdentity; use rand::Rng; use tracing::info; - use utils::lsn::Lsn; -use std::fmt::Write; -use std::ops::Range; -use std::sync::Arc; -use std::sync::Mutex; - -use crate::helpers::PAGE_SZ; -use crate::helpers::{merge_delta_keys, overlaps_with}; - +use crate::helpers::{PAGE_SZ, merge_delta_keys, overlaps_with}; use crate::interface; use crate::interface::CompactionLayer; @@ -487,6 +482,7 @@ impl interface::CompactionJobExecutor for MockTimeline { async fn downcast_delta_layer( &self, layer: &MockLayer, + _ctx: &MockRequestContext, ) -> anyhow::Result>> { Ok(match layer { MockLayer::Delta(l) => Some(l.clone()), diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs index 4559db09f1..3d35d1b91e 100644 --- a/pageserver/compaction/src/simulator/draw.rs +++ b/pageserver/compaction/src/simulator/draw.rs @@ -1,14 +1,14 @@ -use super::Key; -use anyhow::Result; use std::cmp::Ordering; -use std::{ - collections::{BTreeMap, BTreeSet, HashSet}, - fmt::Write, - ops::Range, -}; -use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style}; +use std::collections::{BTreeMap, BTreeSet, HashSet}; +use std::fmt::Write; +use std::ops::Range; + +use anyhow::Result; +use svg_fmt::{BeginSvg, EndSvg, Fill, Stroke, Style, rgb}; use 
utils::lsn::Lsn; +use super::Key; + // Map values to their compressed coordinate - the index the value // would have in a sorted and deduplicated list of all values. struct CoordinateMap { diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs index bd8b54a286..565f66ce1a 100644 --- a/pageserver/compaction/tests/tests.rs +++ b/pageserver/compaction/tests/tests.rs @@ -12,7 +12,7 @@ pub(crate) fn setup_logging() { logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, logging::Output::Stdout, ) - .expect("Failed to init test logging") + .expect("Failed to init test logging"); }); } diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 177e65ef79..80ca414543 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -50,18 +50,18 @@ //! ``` //! -use anyhow::{Context, Result}; -use pageserver_api::key::Key; use std::cmp::Ordering; +use std::collections::{BTreeMap, BTreeSet}; use std::io::{self, BufRead}; +use std::ops::Range; use std::path::PathBuf; use std::str::FromStr; -use std::{ - collections::{BTreeMap, BTreeSet}, - ops::Range, -}; -use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke}; -use utils::{lsn::Lsn, project_git_version}; + +use anyhow::{Context, Result}; +use pageserver_api::key::Key; +use svg_fmt::{BeginSvg, EndSvg, Fill, Stroke, rectangle, rgb}; +use utils::lsn::Lsn; +use utils::project_git_version; project_git_version!(GIT_VERSION); diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index c7f0719c41..600f7c412e 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -1,11 +1,10 @@ +use std::str::FromStr; + use anyhow::Context; use clap::Parser; -use pageserver_api::{ - key::Key, - reltag::{BlockNumber, RelTag, SlruKind}, - shard::{ShardCount, ShardStripeSize}, -}; -use std::str::FromStr; +use pageserver_api::key::Key; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; 
+use pageserver_api::shard::{ShardCount, ShardStripeSize}; #[derive(Parser)] pub(super) struct DescribeKeyCommand { @@ -394,7 +393,10 @@ mod tests { fn single_positional_spanalike_is_key_material() { // why is this needed? if you are checking many, then copypaste starts to appeal let strings = [ - (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"), + ( + line!(), + "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0", + ), (line!(), "rel=1663/208101/2620_fsm blkno=2"), (line!(), "rel=1663/208101/2620.1 blkno=2"), ]; @@ -420,7 +422,15 @@ mod tests { #[test] fn multiple_spanlike_args() { let strings = [ - (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]), + ( + line!(), + &[ + "process_query{tenant_id=C", + "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", + "blkno=2", + "req_lsn=0/238D98C8}", + ][..], + ), (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]), (line!(), &["1663/208101/2620_fsm", "2"][..]), ]; diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 2c350d6d86..b426f977cf 100644 --- 
a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -2,27 +2,27 @@ //! //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data. -use anyhow::{anyhow, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use pageserver::context::{DownloadBehavior, RequestContext}; -use pageserver::task_mgr::TaskKind; -use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use pageserver::virtual_file::api::IoMode; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; use std::str::FromStr; use std::{fs, str}; +use anyhow::{Result, anyhow}; +use camino::{Utf8Path, Utf8PathBuf}; +use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::page_cache::{self, PAGE_SZ}; +use pageserver::task_mgr::TaskKind; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; -use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; -use pageserver::tenant::storage_layer::{range_overlaps, LayerName}; +use pageserver::tenant::storage_layer::delta_layer::{DELTA_KEY_SIZE, Summary}; +use pageserver::tenant::storage_layer::{LayerName, range_overlaps}; +use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; +use pageserver::virtual_file::api::IoMode; use pageserver::virtual_file::{self, VirtualFile}; -use pageserver_api::key::{Key, KEY_SIZE}; - -use utils::{bin_ser::BeSer, lsn::Lsn}; +use pageserver_api::key::{KEY_SIZE, Key}; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; use crate::AnalyzeLayerMapCmd; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 4c2c3ab30e..05fb35ff09 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -1,3 +1,4 @@ +use std::fs::{self, File}; use std::path::{Path, 
PathBuf}; use anyhow::Result; @@ -5,12 +6,10 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; -use pageserver::tenant::storage_layer::{delta_layer, image_layer}; -use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer}; +use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, image_layer}; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; -use std::fs::{self, File}; use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::parse_filename; diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 353b4bd2f9..72a120a69b 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -11,33 +11,29 @@ mod layer_map_analyzer; mod layers; mod page_trace; -use page_trace::PageTraceCmd; -use std::{ - str::FromStr, - time::{Duration, SystemTime}, -}; +use std::str::FromStr; +use std::time::{Duration, SystemTime}; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; use layers::LayerCmd; -use pageserver::{ - context::{DownloadBehavior, RequestContext}, - page_cache, - task_mgr::TaskKind, - tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, - virtual_file::{self, api::IoMode}, -}; +use page_trace::PageTraceCmd; +use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::page_cache; +use pageserver::task_mgr::TaskKind; +use pageserver::tenant::dump_layerfile_from_path; +use pageserver::tenant::metadata::TimelineMetadata; +use pageserver::virtual_file::api::IoMode; +use pageserver::virtual_file::{self}; use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; use remote_storage::{RemotePath, RemoteStorageConfig}; use tokio_util::sync::CancellationToken; -use utils::{ - id::TimelineId, - 
logging::{self, LogFormat, TracingErrorLayerEnablement}, - lsn::Lsn, - project_git_version, -}; +use utils::id::TimelineId; +use utils::logging::{self, LogFormat, TracingErrorLayerEnablement}; +use utils::lsn::Lsn; +use utils::project_git_version; project_git_version!(GIT_VERSION); @@ -355,7 +351,9 @@ mod tests { assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); - assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_valid( + "pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683", + ); assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); } } diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml index 245d293e4f..5b5ed09a2b 100644 --- a/pageserver/pagebench/Cargo.toml +++ b/pageserver/pagebench/Cargo.toml @@ -15,6 +15,7 @@ hdrhistogram.workspace = true humantime.workspace = true humantime-serde.workspace = true rand.workspace = true +reqwest.workspace=true serde.workspace = true serde_json.workspace = true tracing.workspace = true diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index b869a0c6c7..394a954c30 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -1,12 +1,12 @@ +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Instant; + use pageserver_api::models::{TenantConfig, TenantConfigRequest}; use pageserver_api::shard::TenantShardId; use utils::id::TenantTimelineId; use utils::lsn::Lsn; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::Instant; - /// Ingest aux files into the pageserver. 
#[derive(clap::Parser)] pub(crate) struct Args { @@ -36,7 +36,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - )); + None, // TODO: support ssl_ca_file for https APIs in pagebench. + )?); // discover targets let timelines: Vec = crate::util::cli::targets::discover( diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 3ae6d99aa7..d3013ded70 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -1,16 +1,3 @@ -use anyhow::Context; -use pageserver_api::shard::TenantShardId; -use pageserver_client::mgmt_api::ForceAwaitLogicalSize; -use pageserver_client::page_service::BasebackupRequest; - -use utils::id::TenantTimelineId; -use utils::lsn::Lsn; - -use rand::prelude::*; -use tokio::sync::Barrier; -use tokio::task::JoinSet; -use tracing::{info, instrument}; - use std::collections::HashMap; use std::num::NonZeroUsize; use std::ops::Range; @@ -18,6 +5,17 @@ use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Instant; +use anyhow::Context; +use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api::ForceAwaitLogicalSize; +use pageserver_client::page_service::BasebackupRequest; +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{info, instrument}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + use crate::util::tokio_thread_local_stats::AllThreadLocalStats; use crate::util::{request_stats, tokio_thread_local_stats}; @@ -79,7 +77,8 @@ async fn main_impl( let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - )); + None, // TODO: support ssl_ca_file for https APIs in pagebench. 
+ )?); // discover targets let timelines: Vec = crate::util::cli::targets::discover( diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index a60efc7567..969cf24b93 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,18 +1,3 @@ -use anyhow::Context; -use camino::Utf8PathBuf; -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpaceAccum; -use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; - -use pageserver_api::shard::TenantShardId; -use tokio_util::sync::CancellationToken; -use utils::id::TenantTimelineId; -use utils::lsn::Lsn; - -use rand::prelude::*; -use tokio::task::JoinSet; -use tracing::info; - use std::collections::{HashSet, VecDeque}; use std::future::Future; use std::num::NonZeroUsize; @@ -21,6 +6,19 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; +use anyhow::Context; +use camino::Utf8PathBuf; +use pageserver_api::key::Key; +use pageserver_api::keyspace::KeySpaceAccum; +use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; +use pageserver_api::shard::TenantShardId; +use rand::prelude::*; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::info; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + use crate::util::tokio_thread_local_stats::AllThreadLocalStats; use crate::util::{request_stats, tokio_thread_local_stats}; @@ -127,7 +125,8 @@ async fn main_impl( let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - )); + None, // TODO: support ssl_ca_file for https APIs in pagebench. 
+ )?); if let Some(engine_str) = &args.set_io_engine { mgmt_api_client.put_io_engine(engine_str).await?; diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs index 1bb71b9353..a77d3000cc 100644 --- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -1,23 +1,19 @@ -use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId}; +use std::f64; +use std::num::NonZeroUsize; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::{Duration, Instant}; +use pageserver_api::models::HistoricLayerInfo; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use rand::seq::SliceRandom; +use tokio::sync::{OwnedSemaphorePermit, mpsc}; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{debug, info}; use utils::id::{TenantTimelineId, TimelineId}; -use std::{f64, sync::Arc}; -use tokio::{ - sync::{mpsc, OwnedSemaphorePermit}, - task::JoinSet, -}; - -use std::{ - num::NonZeroUsize, - sync::atomic::{AtomicU64, Ordering}, - time::{Duration, Instant}, -}; - /// Evict & on-demand download random layers. #[derive(clap::Parser)] pub(crate) struct Args { @@ -87,7 +83,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - )); + None, // TODO: support ssl_ca_file for https APIs in pagebench. 
+ )?); if let Some(engine_str) = &args.set_io_engine { mgmt_api_client.put_io_engine(engine_str).await?; diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs index f07beeecfd..2f919ec652 100644 --- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -2,11 +2,10 @@ use std::sync::Arc; use humantime::Duration; use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use tokio::task::JoinSet; use utils::id::TenantTimelineId; -use pageserver_client::mgmt_api::ForceAwaitLogicalSize; - #[derive(clap::Parser)] pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] @@ -41,7 +40,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - )); + None, // TODO: support ssl_ca_file for https APIs in pagebench. 
+ )?); // discover targets let timelines: Vec = crate::util::cli::targets::discover( diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs index 4aa6950782..ebe7bc031d 100644 --- a/pageserver/pagebench/src/util/request_stats.rs +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -40,9 +40,7 @@ impl Stats { } } pub(crate) fn add(&mut self, other: &Self) { - let Self { - ref mut latency_histo, - } = self; + let Self { latency_histo } = self; latency_histo.add(&other.latency_histo).unwrap(); } } diff --git a/pageserver/src/assert_u64_eq_usize.rs b/pageserver/src/assert_u64_eq_usize.rs index 66ca7fd057..c4b8d9acba 100644 --- a/pageserver/src/assert_u64_eq_usize.rs +++ b/pageserver/src/assert_u64_eq_usize.rs @@ -2,7 +2,9 @@ pub(crate) const _ASSERT_U64_EQ_USIZE: () = { if std::mem::size_of::() != std::mem::size_of::() { - panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information"); + panic!( + "the traits defined in this module assume that usize and u64 can be converted to each other without loss of information" + ); } }; diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 5cc20a70b2..b76c0e045f 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use ::metrics::IntGauge; use bytes::{Buf, BufMut, Bytes}; -use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use pageserver_api::key::{AUX_KEY_PREFIX, Key, METADATA_KEY_SIZE}; use tracing::warn; // BEGIN Copyright (c) 2017 Servo Contributors diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 99b0775316..de527e307b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,33 +10,32 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! 
-use anyhow::{anyhow, Context}; -use bytes::{BufMut, Bytes, BytesMut}; -use fail::fail_point; -use pageserver_api::key::{rel_block_to_key, Key}; -use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::{Instant, SystemTime}; + +use anyhow::{Context, anyhow}; +use bytes::{BufMut, Bytes, BytesMut}; +use fail::fail_point; +use pageserver_api::key::{Key, rel_block_to_key}; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::pg_constants::{ + DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID, PG_HBA, PGDATA_SPECIAL_FILES, +}; +use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; +use postgres_ffi::{ + BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants, +}; use tokio::io; use tokio::io::AsyncWrite; -use tracing::*; - use tokio_tar::{Builder, EntryType, Header}; +use tracing::*; +use utils::lsn::Lsn; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; use crate::tenant::storage_layer::IoConcurrency; -use crate::tenant::Timeline; -use pageserver_api::reltag::{RelTag, SlruKind}; - -use postgres_ffi::dispatch_pgversion; -use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; -use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA}; -use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; -use postgres_ffi::XLogFileName; -use postgres_ffi::PG_TLI; -use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; -use utils::lsn::Lsn; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::{PageReconstructError, Timeline}; #[derive(Debug, thiserror::Error)] pub enum BasebackupError { @@ -44,6 +43,26 @@ pub enum BasebackupError { Server(#[from] anyhow::Error), #[error("basebackup client error {0:#} when {1}")] Client(#[source] io::Error, &'static str), + #[error("basebackup during shutdown")] + Shutdown, +} + +impl From for BasebackupError { + fn from(value: PageReconstructError) -> Self { + match value { + 
PageReconstructError::Cancelled => BasebackupError::Shutdown, + err => BasebackupError::Server(err.into()), + } + } +} + +impl From for BasebackupError { + fn from(value: GetVectoredError) -> Self { + match value { + GetVectoredError::Cancelled => BasebackupError::Shutdown, + err => BasebackupError::Server(err.into()), + } + } } /// Create basebackup with non-rel data in it. @@ -129,7 +148,7 @@ where timeline .gate .enter() - .map_err(|e| BasebackupError::Server(e.into()))?, + .map_err(|_| BasebackupError::Shutdown)?, ), }; basebackup @@ -325,8 +344,7 @@ where let slru_partitions = self .timeline .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? .partition( self.timeline.get_shard_identity(), Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, @@ -338,11 +356,10 @@ where let blocks = self .timeline .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; for (key, block) in blocks { - let block = block.map_err(|e| BasebackupError::Server(e.into()))?; + let block = block?; slru_builder.add_block(&key, block).await?; } } @@ -351,11 +368,8 @@ where let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self - .timeline - .list_dbdirs(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + for ((spcnode, dbnode), has_relmap_file) in + self.timeline.list_dbdirs(self.lsn, self.ctx).await? { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; @@ -364,8 +378,7 @@ where let rels = self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. 
Postgres copies it in @@ -393,8 +406,7 @@ where let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone()) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let aux_scan_time = start_time.elapsed(); let aux_estimated_size = aux_files .values() @@ -453,16 +465,14 @@ where for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? { self.add_twophase_file(xid).await?; } let repl_origins = self .timeline .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone()) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let n_origins = repl_origins.len(); if n_origins != 0 { // @@ -507,8 +517,7 @@ where let nblocks = self .timeline .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; // If the relation is empty, create an empty file if nblocks == 0 { @@ -534,8 +543,7 @@ where // TODO: investigate using get_vectored for the entire startblk..endblk range. // But this code path is not on the critical path for most basebackups (?). .get(rel_block_to_key(src, blknum), self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; segment_data.extend_from_slice(&img[..]); } @@ -569,8 +577,7 @@ where let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; if img.len() != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) @@ -624,8 +631,7 @@ where && self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? 
.is_empty() { return Ok(()); @@ -676,8 +682,7 @@ where let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index e2b9a7f073..3ab6d79546 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -3,49 +3,42 @@ //! Main entry point for the Page Server executable. use std::env; -use std::env::{var, VarError}; +use std::env::{VarError, var}; use std::io::Read; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; - -use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; -use pageserver::config::PageserverIdentity; +use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; +use metrics::set_build_info_metric; +use nix::sys::socket::{setsockopt, sockopt}; +use pageserver::config::{PageServerConf, PageserverIdentity}; use pageserver::controller_upcall_client::ControllerUpcallClient; +use pageserver::deletion_queue::DeletionQueue; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; -use pageserver::tenant::{secondary, TenantSharedResources}; -use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener}; +use pageserver::task_mgr::{ + BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, +}; +use pageserver::tenant::{TenantSharedResources, mgr, secondary}; +use pageserver::{ + CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http, + page_cache, page_service, task_mgr, 
virtual_file, +}; +use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; - -use metrics::set_build_info_metric; -use pageserver::{ - config::PageServerConf, - deletion_queue::DeletionQueue, - http, page_cache, page_service, task_mgr, - task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME}, - tenant::mgr, - virtual_file, -}; -use postgres_backend::AuthType; +use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::crashsafe::syncfs; -use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; -use utils::{ - auth::{JwtAuth, SwappableJwtAuth}, - logging, project_build_tag, project_git_version, - sentry_init::init_sentry, - tcp_listener, -}; +use utils::sentry_init::init_sentry; +use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -57,7 +50,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. 
#[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "pageserver.pid"; @@ -85,6 +78,9 @@ fn main() -> anyhow::Result<()> { return Ok(()); } + // Initialize up failpoints support + let scenario = failpoint_support::init(); + let workdir = arg_matches .get_one::("workdir") .map(Utf8Path::new) @@ -114,6 +110,7 @@ fn main() -> anyhow::Result<()> { } else { TracingErrorLayerEnablement::Disabled }; + logging::init( conf.log_format, tracing_error_layer_enablement, @@ -178,9 +175,6 @@ fn main() -> anyhow::Result<()> { } } - // Initialize up failpoints support - let scenario = failpoint_support::init(); - // Basic initialization of things that don't change after startup tracing::info!("Initializing virtual_file..."); virtual_file::init( @@ -217,7 +211,9 @@ fn initialize_config( Ok(mut f) => { let md = f.metadata().context("stat config file")?; if !md.is_file() { - anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..."); + anyhow::bail!( + "Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..." + ); } let mut s = String::new(); @@ -225,7 +221,9 @@ fn initialize_config( toml_edit::de::from_str::(&s)? } Err(e) => { - anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..."); + anyhow::bail!( + "Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..." + ); } }; @@ -346,11 +344,25 @@ fn start_pageserver( info!("Starting pageserver http handler on {http_addr}"); let http_listener = tcp_listener::bind(http_addr)?; - let pg_addr = &conf.listen_pg_addr; + let https_listener = match conf.listen_https_addr.as_ref() { + Some(https_addr) => { + info!("Starting pageserver https handler on {https_addr}"); + Some(tcp_listener::bind(https_addr)?) 
+ } + None => None, + }; + let pg_addr = &conf.listen_pg_addr; info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; + // Enable SO_KEEPALIVE on the socket, to detect dead connections faster. + // These are configured via net.ipv4.tcp_keepalive_* sysctls. + // + // TODO: also set this on the walreceiver socket, but tokio-postgres doesn't + // support enabling keepalives while using the default OS sysctls. + setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?; + // Launch broker client // The storage_broker::connect call needs to happen inside a tokio runtime thread. let broker_client = WALRECEIVER_RUNTIME @@ -401,11 +413,9 @@ fn start_pageserver( Err(VarError::NotPresent) => { info!("No JWT token for authentication with Safekeeper detected"); } - Err(e) => { - return Err(e).with_context(|| { - "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable" - }) - } + Err(e) => return Err(e).with_context( + || "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable", + ), }; // Top-level cancellation token for the process @@ -573,9 +583,8 @@ fn start_pageserver( // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. - let http_endpoint_listener = { + let (http_endpoint_listener, https_endpoint_listener) = { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper - let cancel = CancellationToken::new(); let router_state = Arc::new( http::routes::State::new( @@ -590,22 +599,51 @@ fn start_pageserver( ) .context("Failed to initialize router state")?, ); + let router = http::make_router(router_state, launch_ts, http_auth.clone())? .build() .map_err(|err| anyhow!(err))?; - let service = http_utils::RouterService::new(router).unwrap(); - let server = hyper0::Server::from_tcp(http_listener)? 
- .serve(service) - .with_graceful_shutdown({ - let cancel = cancel.clone(); - async move { cancel.clone().cancelled().await } - }); - let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( - "http endpoint listener", - server, - )); - HttpEndpointListener(CancellableTask { task, cancel }) + let service = + Arc::new(http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow!(err))?); + + let http_task = { + let server = + http_utils::server::Server::new(Arc::clone(&service), http_listener, None)?; + let cancel = CancellationToken::new(); + + let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "http endpoint listener", + server.serve(cancel.clone()), + )); + HttpEndpointListener(CancellableTask { task, cancel }) + }; + + let https_task = match https_listener { + Some(https_listener) => { + let certs = http_utils::tls_certs::load_cert_chain(&conf.ssl_cert_file)?; + let key = http_utils::tls_certs::load_private_key(&conf.ssl_key_file)?; + + let server_config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_single_cert(certs, key)?; + + let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + + let server = + http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?; + let cancel = CancellationToken::new(); + + let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "https endpoint listener", + server.serve(cancel.clone()), + )); + Some(HttpsEndpointListener(CancellableTask { task, cancel })) + } + None => None, + }; + + (http_task, https_task) }; let consumption_metrics_tasks = { @@ -681,6 +719,7 @@ fn start_pageserver( shutdown_pageserver.cancel(); pageserver::shutdown_pageserver( http_endpoint_listener, + https_endpoint_listener, page_service, consumption_metrics_tasks, disk_usage_eviction_task, @@ -711,7 +750,9 @@ async fn create_remote_storage_client( // wrapper that simulates failures. 
if conf.test_remote_failures > 0 { if !cfg!(feature = "testing") { - anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"); + anyhow::bail!( + "test_remote_failures option is not available because pageserver was compiled without the 'testing' feature" + ); } info!( "Simulating remote failures for first {} attempts of each op", diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs index c1ce332b6c..0215dd06fb 100644 --- a/pageserver/src/bin/test_helper_slow_client_reads.rs +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -1,14 +1,10 @@ -use std::{ - io::{stdin, stdout, Read, Write}, - time::Duration, -}; +use std::io::{Read, Write, stdin, stdout}; +use std::time::Duration; use clap::Parser; use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; #[derive(clap::Parser)] struct Args { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 09d9444dd5..562a16a14e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,36 +4,29 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. 
-use anyhow::{bail, ensure, Context}; -use pageserver_api::models::ImageCompressionAlgorithm; -use pageserver_api::{ - config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}, - shard::TenantShardId, -}; -use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; -use storage_broker::Uri; -use utils::logging::SecretString; -use utils::postgres_client::PostgresClientProtocol; - -use once_cell::sync::OnceCell; -use reqwest::Url; use std::num::NonZeroUsize; use std::sync::Arc; use std::time::Duration; +use anyhow::{Context, bail, ensure}; use camino::{Utf8Path, Utf8PathBuf}; +use once_cell::sync::OnceCell; +use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::shard::TenantShardId; use postgres_backend::AuthType; -use utils::{ - id::{NodeId, TimelineId}, - logging::LogFormat, -}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use reqwest::Url; +use storage_broker::Uri; +use utils::id::{NodeId, TimelineId}; +use utils::logging::{LogFormat, SecretString}; +use utils::postgres_client::PostgresClientProtocol; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use crate::virtual_file; use crate::virtual_file::io_engine; -use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME}; +use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, virtual_file}; /// Global state of pageserver. /// @@ -60,6 +53,11 @@ pub struct PageServerConf { pub listen_pg_addr: String, /// Example (default): 127.0.0.1:9898 pub listen_http_addr: String, + /// Example: 127.0.0.1:9899 + pub listen_https_addr: Option, + + pub ssl_key_file: Utf8PathBuf, + pub ssl_cert_file: Utf8PathBuf, /// Current availability zone. Used for traffic metrics. 
pub availability_zone: Option, @@ -201,6 +199,13 @@ pub struct PageServerConf { /// Interpreted protocol feature: if enabled, validate that the logical WAL received from /// safekeepers does not have gaps. pub validate_wal_contiguity: bool, + + /// When set, the previously written to disk heatmap is loaded on tenant attach and used + /// to avoid clobbering the heatmap from new, cold, attached locations. + pub load_previous_heatmap: bool, + + /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline. + pub generate_unarchival_heatmap: bool, } /// Token for authentication to safekeepers @@ -317,6 +322,9 @@ impl PageServerConf { let pageserver_api::config::ConfigToml { listen_pg_addr, listen_http_addr, + listen_https_addr, + ssl_key_file, + ssl_cert_file, availability_zone, wait_lsn_timeout, wal_redo_timeout, @@ -365,6 +373,8 @@ impl PageServerConf { get_vectored_concurrent_io, enable_read_path_debugging, validate_wal_contiguity, + load_previous_heatmap, + generate_unarchival_heatmap, } = config_toml; let mut conf = PageServerConf { @@ -373,6 +383,9 @@ impl PageServerConf { // ------------------------------------------------------------ listen_pg_addr, listen_http_addr, + listen_https_addr, + ssl_key_file, + ssl_cert_file, availability_zone, wait_lsn_timeout, wal_redo_timeout, @@ -440,7 +453,9 @@ impl PageServerConf { io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise io_engine::FeatureTestResult::Worse { engine, remark } => { // TODO: bubble this up to the caller so we can tracing::warn! it. 
- eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + eprintln!( + "auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}" + ); engine } }, @@ -452,6 +467,8 @@ impl PageServerConf { no_sync: no_sync.unwrap_or(false), enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false), + load_previous_heatmap: load_previous_heatmap.unwrap_or(true), + generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true), }; // ------------------------------------------------------------ @@ -485,7 +502,9 @@ impl PageServerConf { #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf { let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into()); - Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}")) + + let test_id = uuid::Uuid::new_v4(); + Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}")) } pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { @@ -498,6 +517,8 @@ impl PageServerConf { metric_collection_interval: Duration::from_secs(60), synthetic_size_calculation_interval: Duration::from_secs(60), background_task_maximum_delay: Duration::ZERO, + load_previous_heatmap: Some(true), + generate_unarchival_heatmap: Some(true), ..Default::default() }; PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 7e8c00c293..0231190e69 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,13 +1,9 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. 
-use crate::config::PageServerConf; -use crate::consumption_metrics::metrics::MetricsKey; -use crate::consumption_metrics::upload::KeyGen as _; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::size::CalculateSyntheticSizeError; -use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; + use camino::Utf8PathBuf; use consumption_metrics::EventType; use itertools::Itertools as _; @@ -15,14 +11,21 @@ use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::{Duration, SystemTime}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::NodeId; +use crate::config::PageServerConf; +use crate::consumption_metrics::metrics::MetricsKey; +use crate::consumption_metrics::upload::KeyGen as _; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; +use crate::tenant::mgr::TenantManager; +use crate::tenant::size::CalculateSyntheticSizeError; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::{LogicalSizeCalculationCause, Tenant}; + mod disk_cache; mod metrics; mod upload; diff --git a/pageserver/src/consumption_metrics/disk_cache.rs b/pageserver/src/consumption_metrics/disk_cache.rs index 54a505a134..f1dad8793d 100644 --- a/pageserver/src/consumption_metrics/disk_cache.rs +++ b/pageserver/src/consumption_metrics/disk_cache.rs @@ -1,10 +1,10 @@ -use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; use std::sync::Arc; -use crate::consumption_metrics::NewMetricsRefRoot; +use anyhow::Context; +use camino::{Utf8Path, Utf8PathBuf}; use 
super::{NewMetricsRoot, NewRawMetric, RawMetric}; +use crate::consumption_metrics::NewMetricsRefRoot; pub(super) fn read_metrics_from_serde_value( json_value: serde_json::Value, diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 07fac09f6f..71910011ea 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -1,15 +1,16 @@ -use crate::tenant::mgr::TenantManager; -use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize}; +use std::sync::Arc; +use std::time::SystemTime; + use chrono::{DateTime, Utc}; use consumption_metrics::EventType; use futures::stream::StreamExt; -use std::{sync::Arc, time::SystemTime}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use super::{Cache, NewRawMetric}; +use crate::context::RequestContext; +use crate::tenant::mgr::TenantManager; +use crate::tenant::timeline::logical_size::CurrentLogicalSize; /// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events` /// instead of static str. 
diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 3ed7b44123..52b4fb8680 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,7 +1,7 @@ -use crate::consumption_metrics::RawMetric; +use std::collections::HashMap; use super::*; -use std::collections::HashMap; +use crate::consumption_metrics::RawMetric; #[test] fn startup_collected_timeline_metrics_before_advancing() { diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 448bf47525..59e0145a5b 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -2,15 +2,16 @@ use std::error::Error as _; use std::time::SystemTime; use chrono::{DateTime, Utc}; -use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; +use consumption_metrics::{CHUNK_SIZE, Event, EventChunk, IdempotencyKey}; use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; - -use super::{metrics::Name, Cache, MetricsKey, NewRawMetric, RawMetric}; use utils::id::{TenantId, TimelineId}; +use super::metrics::Name; +use super::{Cache, MetricsKey, NewRawMetric, RawMetric}; + /// How the metrics from pageserver are identified. 
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)] struct Ids { @@ -438,14 +439,13 @@ async fn upload( #[cfg(test)] mod tests { - use crate::consumption_metrics::{ - disk_cache::read_metrics_from_serde_value, NewMetricsRefRoot, - }; - - use super::*; use chrono::{DateTime, Utc}; use once_cell::sync::Lazy; + use super::*; + use crate::consumption_metrics::NewMetricsRefRoot; + use crate::consumption_metrics::disk_cache::read_metrics_from_serde_value; + #[test] fn chunked_serialization() { let examples = metric_samples(); diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index da9c095a15..e2a84d0c24 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -89,16 +89,112 @@ //! [`RequestContext`] argument. Functions in the middle of the call chain //! only need to pass it on. -use crate::task_mgr::TaskKind; +use std::sync::Arc; + +use once_cell::sync::Lazy; +use tracing::warn; +use utils::{id::TimelineId, shard::TenantShardId}; + +use crate::{ + metrics::{StorageIoSizeMetrics, TimelineMetrics}, + task_mgr::TaskKind, + tenant::Timeline, +}; // The main structure of this module, see module-level comment. -#[derive(Debug)] pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, read_path_debug: bool, + scope: Scope, +} + +#[derive(Clone)] +pub(crate) enum Scope { + Global { + io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics, + }, + SecondaryTenant { + io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics, + }, + SecondaryTimeline { + io_size_metrics: crate::metrics::StorageIoSizeMetrics, + }, + Timeline { + // We wrap the `Arc`s inside another Arc to avoid child + // context creation contending for the ref counters of the Arc, + // which are shared among all tasks that operate on the timeline, especially + // concurrent page_service connections. 
+ #[allow(clippy::redundant_allocation)] + arc_arc: Arc>, + }, + #[cfg(test)] + UnitTest { + io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics, + }, +} + +static GLOBAL_IO_SIZE_METRICS: Lazy = + Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*")); + +impl Scope { + pub(crate) fn new_global() -> Self { + Scope::Global { + io_size_metrics: &GLOBAL_IO_SIZE_METRICS, + } + } + /// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start + /// of a compaction iteration. + pub(crate) fn new_timeline(timeline: &Timeline) -> Self { + Scope::Timeline { + arc_arc: Arc::new(Arc::clone(&timeline.metrics)), + } + } + pub(crate) fn new_page_service_pagestream( + timeline_handle: &crate::tenant::timeline::handle::Handle< + crate::page_service::TenantManagerTypes, + >, + ) -> Self { + Scope::Timeline { + arc_arc: Arc::clone(&timeline_handle.metrics), + } + } + pub(crate) fn new_secondary_timeline( + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Self { + // TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle. + + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = tenant_shard_id.shard_slug().to_string(); + let timeline_id = timeline_id.to_string(); + + let io_size_metrics = + crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id); + Scope::SecondaryTimeline { io_size_metrics } + } + pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self { + // Before propagating metrics via RequestContext, the labels were inferred from file path. + // The only user of VirtualFile at tenant scope is the heatmap download & read. + // The inferred labels for the path of the heatmap file on local disk were that of the global metric (*,*,*). + // Thus, we do the same here, and extend that for anything secondary-tenant scoped. 
+ // + // If we want to have (tenant_id, shard_id, '*') labels for secondary tenants in the future, + // we will need to think about the metric lifecycle, i.e., remove them during secondary tenant shutdown, + // like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile + // at this point, so, we were able to completely side-step tenant-scoped stuff there). + Scope::SecondaryTenant { + io_size_metrics: &GLOBAL_IO_SIZE_METRICS, + } + } + #[cfg(test)] + pub(crate) fn new_unit_test() -> Self { + Scope::UnitTest { + io_size_metrics: &GLOBAL_IO_SIZE_METRICS, + } + } } /// The kind of access to the page cache. @@ -157,6 +253,7 @@ impl RequestContextBuilder { access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, read_path_debug: false, + scope: Scope::new_global(), }, } } @@ -171,10 +268,16 @@ impl RequestContextBuilder { access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, read_path_debug: original.read_path_debug, + scope: original.scope.clone(), }, } } + pub fn task_kind(mut self, k: TaskKind) -> Self { + self.inner.task_kind = k; + self + } + /// Configure the DownloadBehavior of the context: whether to /// download missing layers, and/or warn on the download. 
pub fn download_behavior(mut self, b: DownloadBehavior) -> Self { @@ -199,6 +302,11 @@ impl RequestContextBuilder { self } + pub(crate) fn scope(mut self, s: Scope) -> Self { + self.inner.scope = s; + self + } + pub fn build(self) -> RequestContext { self.inner } @@ -281,7 +389,50 @@ impl RequestContext { } fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - Self::new(task_kind, download_behavior) + RequestContextBuilder::extend(self) + .task_kind(task_kind) + .download_behavior(download_behavior) + .build() + } + + pub fn with_scope_timeline(&self, timeline: &Arc) -> Self { + RequestContextBuilder::extend(self) + .scope(Scope::new_timeline(timeline)) + .build() + } + + pub(crate) fn with_scope_page_service_pagestream( + &self, + timeline_handle: &crate::tenant::timeline::handle::Handle< + crate::page_service::TenantManagerTypes, + >, + ) -> Self { + RequestContextBuilder::extend(self) + .scope(Scope::new_page_service_pagestream(timeline_handle)) + .build() + } + + pub fn with_scope_secondary_timeline( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Self { + RequestContextBuilder::extend(self) + .scope(Scope::new_secondary_timeline(tenant_shard_id, timeline_id)) + .build() + } + + pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self { + RequestContextBuilder::extend(self) + .scope(Scope::new_secondary_tenant(tenant_shard_id)) + .build() + } + + #[cfg(test)] + pub fn with_scope_unit_test(&self) -> Self { + RequestContextBuilder::new(TaskKind::UnitTest) + .scope(Scope::new_unit_test()) + .build() } pub fn task_kind(&self) -> TaskKind { @@ -303,4 +454,38 @@ impl RequestContext { pub(crate) fn read_path_debug(&self) -> bool { self.read_path_debug } + + pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics { + match &self.scope { + Scope::Global { io_size_metrics } => { + let is_unit_test = cfg!(test); + let is_regress_test_build = cfg!(feature = "testing"); + if 
is_unit_test || is_regress_test_build { + panic!("all VirtualFile instances are timeline-scoped"); + } else { + use once_cell::sync::Lazy; + use std::sync::Mutex; + use std::time::Duration; + use utils::rate_limit::RateLimit; + static LIMIT: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1)))); + let mut guard = LIMIT.lock().unwrap(); + guard.call2(|rate_limit_stats| { + warn!( + %rate_limit_stats, + backtrace=%std::backtrace::Backtrace::force_capture(), + "all VirtualFile instances are timeline-scoped", + ); + }); + + io_size_metrics + } + } + Scope::Timeline { arc_arc } => &arc_arc.storage_io_size, + Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics, + Scope::SecondaryTenant { io_size_metrics } => io_size_metrics, + #[cfg(test)] + Scope::UnitTest { io_size_metrics } => io_size_metrics, + } + } } diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 4990f17b40..745d04cf62 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -1,21 +1,23 @@ use std::collections::HashMap; use futures::Future; -use pageserver_api::{ - controller_api::{AvailabilityZone, NodeRegisterRequest}, - shard::TenantShardId, - upcall_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateRequestTenant, ValidateResponse, - }, +use pageserver_api::config::NodeMetadata; +use pageserver_api::controller_api::{AvailabilityZone, NodeRegisterRequest}; +use pageserver_api::shard::TenantShardId; +use pageserver_api::upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateRequestTenant, ValidateResponse, }; -use serde::{de::DeserializeOwned, Serialize}; +use serde::Serialize; +use serde::de::DeserializeOwned; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; +use utils::generation::Generation; +use 
utils::id::NodeId; +use utils::{backoff, failpoint_support}; -use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; -use pageserver_api::config::NodeMetadata; +use crate::config::PageServerConf; +use crate::virtual_file::on_fatal_io_error; /// The Pageserver's client for using the storage controller upcall API: this is a small API /// for dealing with generations (see docs/rfcs/025-generation-numbers.md). @@ -82,6 +84,7 @@ impl ControllerUpcallClient { }) } + #[tracing::instrument(skip_all)] async fn retry_http_forever( &self, url: &url::Url, @@ -106,7 +109,7 @@ impl ControllerUpcallClient { |_| false, 3, u32::MAX, - "calling control plane generation validation API", + "storage controller upcall", &self.cancel, ) .await @@ -123,11 +126,12 @@ impl ControllerUpcallClient { impl ControlPlaneGenerationsApi for ControllerUpcallClient { /// Block until we get a successful response, or error out if we are shut down + #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context async fn re_attach( &self, conf: &PageServerConf, ) -> Result, RetryForeverError> { - let re_attach_path = self + let url = self .base_url .join("re-attach") .expect("Failed to build re-attach path"); @@ -157,14 +161,18 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { match az_id_from_metadata { Some(az_id) => Some(AvailabilityZone(az_id)), None => { - tracing::warn!("metadata.json does not contain an 'availability_zone_id' field"); + tracing::warn!( + "metadata.json does not contain an 'availability_zone_id' field" + ); conf.availability_zone.clone().map(AvailabilityZone) } } }; if az_id.is_none() { - panic!("Availablity zone id could not be inferred from metadata.json or pageserver config"); + panic!( + "Availablity zone id could not be inferred from metadata.json or pageserver config" + ); } Some(NodeRegisterRequest { @@ -173,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { listen_pg_port: m.postgres_port, 
listen_http_addr: m.http_host, listen_http_port: m.http_port, - listen_https_port: None, // TODO: Support https. + listen_https_port: m.https_port, availability_zone_id: az_id.expect("Checked above"), }) } @@ -199,7 +207,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { register: register.clone(), }; - let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; + let response: ReAttachResponse = self.retry_http_forever(&url, request).await?; tracing::info!( "Received re-attach response with {} tenants (node {}, register: {:?})", response.tenants.len(), @@ -217,11 +225,12 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { } /// Block until we get a successful response, or error out if we are shut down + #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context async fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, ) -> Result, RetryForeverError> { - let re_attach_path = self + let url = self .base_url .join("validate") .expect("Failed to build validate path"); @@ -236,7 +245,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { .iter() .map(|(id, generation)| ValidateRequestTenant { id: *id, - gen: (*generation).into().expect( + r#gen: (*generation).into().expect( "Generation should always be valid for a Tenant doing deletions", ), }) @@ -251,8 +260,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { return Err(RetryForeverError::ShuttingDown); } - let response: ValidateResponse = - self.retry_http_forever(&re_attach_path, request).await?; + let response: ValidateResponse = self.retry_http_forever(&url, request).await?; for rt in response.tenants { result.insert(rt.id, rt.valid); } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index a2395b0dca..8118f66252 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -6,38 +6,31 @@ use std::collections::HashMap; use 
std::sync::Arc; use std::time::Duration; -use crate::controller_upcall_client::ControlPlaneGenerationsApi; -use crate::metrics; -use crate::tenant::remote_timeline_client::remote_timeline_path; -use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::virtual_file::MaybeFatalIo; -use crate::virtual_file::VirtualFile; use anyhow::Context; use camino::Utf8PathBuf; +use deleter::DeleterMessage; +use list_writer::ListWriterQueueMessage; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, RemotePath}; -use serde::Deserialize; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio_util::sync::CancellationToken; -use tracing::Instrument; -use tracing::{debug, error}; +use tracing::{Instrument, debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; use utils::id::TimelineId; -use utils::lsn::AtomicLsn; -use utils::lsn::Lsn; - -use self::deleter::Deleter; -use self::list_writer::DeletionOp; -use self::list_writer::ListWriter; -use self::list_writer::RecoverOp; -use self::validator::Validator; -use deleter::DeleterMessage; -use list_writer::ListWriterQueueMessage; +use utils::lsn::{AtomicLsn, Lsn}; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; +use self::deleter::Deleter; +use self::list_writer::{DeletionOp, ListWriter, RecoverOp}; +use self::validator::Validator; +use crate::config::PageServerConf; +use crate::controller_upcall_client::ControlPlaneGenerationsApi; +use crate::metrics; +use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path}; +use crate::tenant::storage_layer::LayerName; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; // TODO: configurable for how long to wait before executing deletions @@ -664,21 +657,22 @@ impl DeletionQueue { #[cfg(test)] mod test { + use std::io::ErrorKind; + use std::time::Duration; + use camino::Utf8Path; 
use hex_literal::hex; - use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant}; - use std::{io::ErrorKind, time::Duration}; - use tracing::info; - + use pageserver_api::key::Key; + use pageserver_api::shard::ShardIndex; + use pageserver_api::upcall_api::ReAttachResponseTenant; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use tokio::task::JoinHandle; - - use crate::{ - controller_upcall_client::RetryForeverError, - tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, - }; + use tracing::info; use super::*; + use crate::controller_upcall_client::RetryForeverError; + use crate::tenant::harness::TenantHarness; + use crate::tenant::storage_layer::DeltaLayerName; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -724,26 +718,26 @@ mod test { .expect("Failed to join workers for previous deletion queue"); } - fn set_latest_generation(&self, gen: Generation) { + fn set_latest_generation(&self, gen_: Generation) { let tenant_shard_id = self.harness.tenant_shard_id; self.mock_control_plane .latest_generation .lock() .unwrap() - .insert(tenant_shard_id, gen); + .insert(tenant_shard_id, gen_); } /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, file_name: LayerName, - gen: Generation, + gen_: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path()); std::fs::create_dir_all(&remote_timeline_path)?; - let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix()); + let remote_layer_file_name = format!("{}{}", file_name, gen_.get_suffix()); let content: Vec = format!("placeholder contents of {file_name}").into(); @@ -1098,11 +1092,12 @@ mod test { /// or coalescing, and doesn't actually execute any deletions 
unless you call pump() to kick it. #[cfg(test)] pub(crate) mod mock { + use std::sync::atomic::{AtomicUsize, Ordering}; + use tracing::info; use super::*; use crate::tenant::remote_timeline_client::remote_layer_path; - use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { rx: tokio::sync::mpsc::UnboundedReceiver, diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index ef1dfbac19..691ba75cc7 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -6,21 +6,16 @@ //! number of full-sized DeleteObjects requests, rather than a larger number of //! smaller requests. -use remote_storage::GenericRemoteStorage; -use remote_storage::RemotePath; -use remote_storage::TimeoutOrCancel; use std::time::Duration; + +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio_util::sync::CancellationToken; -use tracing::info; -use tracing::warn; -use utils::backoff; -use utils::pausable_failpoint; +use tracing::{info, warn}; +use utils::{backoff, pausable_failpoint}; +use super::{DeletionQueueError, FlushOp}; use crate::metrics; -use super::DeletionQueueError; -use super::FlushOp; - const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); pub(super) enum DeleterMessage { diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index ae3b2c9180..a385e35a02 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -10,11 +10,6 @@ //! //! DeletionLists are passed onwards to the Validator. 
-use super::DeletionHeader; -use super::DeletionList; -use super::FlushOp; -use super::ValidatorQueueMessage; - use std::collections::HashMap; use std::fs::create_dir_all; use std::time::Duration; @@ -23,20 +18,17 @@ use pageserver_api::shard::TenantShardId; use regex::Regex; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; -use tracing::debug; -use tracing::info; -use tracing::warn; +use tracing::{debug, info, warn}; use utils::generation::Generation; use utils::id::TimelineId; +use super::{DeletionHeader, DeletionList, FlushOp, ValidatorQueueMessage}; use crate::config::PageServerConf; use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; -use crate::tenant::remote_timeline_client::remote_layer_path; -use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_layer_path}; use crate::tenant::storage_layer::LayerName; -use crate::virtual_file::on_fatal_io_error; -use crate::virtual_file::MaybeFatalIo; +use crate::virtual_file::{MaybeFatalIo, on_fatal_io_error}; // The number of keys in a DeletionList before we will proactively persist it // (without reaching a flush deadline). 
This aims to deliver objects of the order diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index 1d55581ebd..b0ce2b80b4 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -20,22 +20,14 @@ use std::time::Duration; use camino::Utf8PathBuf; use tokio_util::sync::CancellationToken; -use tracing::debug; -use tracing::info; -use tracing::warn; - -use crate::config::PageServerConf; -use crate::controller_upcall_client::ControlPlaneGenerationsApi; -use crate::controller_upcall_client::RetryForeverError; -use crate::metrics; -use crate::virtual_file::MaybeFatalIo; +use tracing::{debug, info, warn}; use super::deleter::DeleterMessage; -use super::DeletionHeader; -use super::DeletionList; -use super::DeletionQueueError; -use super::FlushOp; -use super::VisibleLsnUpdates; +use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates}; +use crate::config::PageServerConf; +use crate::controller_upcall_client::{ControlPlaneGenerationsApi, RetryForeverError}; +use crate::metrics; +use crate::virtual_file::MaybeFatalIo; // After this length of time, do any validation work that is pending, // even if we haven't accumulated many keys to delete. 
@@ -190,7 +182,10 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!( + "Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", + tenant_lsn_state.generation + ); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 738a783813..13252037e5 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -41,30 +41,31 @@ // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId}; +use pageserver_api::config::DiskUsageEvictionTaskConfig; +use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::{completion, id::TimelineId}; +use tracing::{Instrument, debug, error, info, instrument, warn}; +use utils::completion; +use utils::id::TimelineId; -use crate::{ - config::PageServerConf, - metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, BACKGROUND_RUNTIME}, - tenant::{ - mgr::TenantManager, - remote_timeline_client::LayerFileMetadata, - secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, - tasks::sleep_random, - }, - CancellableTask, DiskUsageEvictionTask, +use crate::config::PageServerConf; +use 
crate::metrics::disk_usage_based_eviction::METRICS; +use crate::task_mgr::{self, BACKGROUND_RUNTIME}; +use crate::tenant::mgr::TenantManager; +use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::secondary::SecondaryTenant; +use crate::tenant::storage_layer::{ + AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint, }; +use crate::tenant::tasks::sleep_random; +use crate::{CancellableTask, DiskUsageEvictionTask}; /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. @@ -1007,10 +1008,14 @@ async fn collect_eviction_candidates( } } - debug_assert!(EvictionPartition::Above < EvictionPartition::Below, - "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above, - "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + debug_assert!( + EvictionPartition::Above < EvictionPartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first" + ); + debug_assert!( + EvictionPartition::EvictNow < EvictionPartition::Above, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first" + ); eviction_order.sort(&mut candidates); @@ -1157,9 +1162,8 @@ mod filesystem_level_usage { use anyhow::Context; use camino::Utf8Path; - use crate::statvfs::Statvfs; - use super::DiskUsageEvictionTaskConfig; + use crate::statvfs::Statvfs; #[derive(Debug, Clone, Copy)] pub struct Usage<'a> { @@ -1224,10 +1228,12 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { - use super::Usage as _; use std::time::Duration; + use utils::serde_percent::Percent; + use super::Usage as _; + let mut usage = Usage { config: &DiskUsageEvictionTaskConfig { max_usage_pct: 
Percent::new(85).unwrap(), diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 12252739fd..e799efcce3 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -842,6 +842,12 @@ paths: required: false schema: type: integer + - name: recurse + description: When set, will recurse with the downloads into ancestor timelines + in: query + required: false + schema: + type: boolean post: description: | Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter @@ -1073,7 +1079,6 @@ components: - last_record_lsn - disk_consistent_lsn - state - - latest_gc_cutoff_lsn properties: timeline_id: type: string @@ -1117,9 +1122,6 @@ components: min_readable_lsn: type: string format: hex - latest_gc_cutoff_lsn: - type: string - format: hex applied_gc_cutoff_lsn: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 56a84a98a8..e8a32ca1ef 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2,125 +2,86 @@ //! Management HTTP API //! 
use std::cmp::Reverse; -use std::collections::BinaryHeap; -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use enumset::EnumSet; use futures::future::join_all; -use futures::StreamExt; -use futures::TryFutureExt; +use futures::{StreamExt, TryFutureExt}; use http_utils::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, + self, attach_openapi_ui, auth_middleware, check_permission_with, profile_cpu_handler, + profile_heap_handler, prometheus_metrics_handler, request_span, }; +use http_utils::error::{ApiError, HttpErrorBody}; use http_utils::failpoints::failpoints_handler; -use http_utils::request::must_parse_query_param; -use http_utils::request::{get_request_param, must_get_query_param, parse_query_param}; +use http_utils::json::{json_request, json_request_maybe, json_response}; +use http_utils::request::{ + get_request_param, must_get_query_param, must_parse_query_param, parse_query_param, + parse_request_param, +}; +use http_utils::{RequestExt, RouterBuilder}; use humantime::format_rfc3339; -use hyper::header; -use hyper::StatusCode; -use hyper::{Body, Request, Response, Uri}; +use hyper::{Body, Request, Response, StatusCode, Uri, header}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; -use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; -use pageserver_api::models::IngestAuxFilesRequest; -use pageserver_api::models::ListAuxFilesRequest; -use pageserver_api::models::LocationConfig; -use pageserver_api::models::LocationConfigListResponse; -use pageserver_api::models::LocationConfigMode; -use pageserver_api::models::LsnLease; -use pageserver_api::models::LsnLeaseRequest; -use pageserver_api::models::OffloadedTimelineInfo; -use pageserver_api::models::PageTraceEvent; -use 
pageserver_api::models::ShardParameters; -use pageserver_api::models::TenantConfigPatchRequest; -use pageserver_api::models::TenantDetails; -use pageserver_api::models::TenantLocationConfigRequest; -use pageserver_api::models::TenantLocationConfigResponse; -use pageserver_api::models::TenantScanRemoteStorageResponse; -use pageserver_api::models::TenantScanRemoteStorageShard; -use pageserver_api::models::TenantShardLocation; -use pageserver_api::models::TenantShardSplitRequest; -use pageserver_api::models::TenantShardSplitResponse; -use pageserver_api::models::TenantSorting; -use pageserver_api::models::TenantState; -use pageserver_api::models::TenantWaitLsnRequest; -use pageserver_api::models::TimelineArchivalConfigRequest; -use pageserver_api::models::TimelineCreateRequestMode; -use pageserver_api::models::TimelineCreateRequestModeImportPgdata; -use pageserver_api::models::TimelinesInfoAndOffloaded; -use pageserver_api::models::TopTenantShardItem; -use pageserver_api::models::TopTenantShardsRequest; -use pageserver_api::models::TopTenantShardsResponse; -use pageserver_api::shard::ShardCount; -use pageserver_api::shard::TenantShardId; -use remote_storage::DownloadError; -use remote_storage::GenericRemoteStorage; -use remote_storage::TimeTravelError; +use pageserver_api::models::{ + DetachBehavior, DownloadRemoteLayersTaskSpawnRequest, IngestAuxFilesRequest, + ListAuxFilesRequest, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, + LsnLeaseRequest, OffloadedTimelineInfo, PageTraceEvent, ShardParameters, StatusResponse, + TenantConfigPatchRequest, TenantConfigRequest, TenantDetails, TenantInfo, + TenantLocationConfigRequest, TenantLocationConfigResponse, TenantScanRemoteStorageResponse, + TenantScanRemoteStorageShard, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode, + 
TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo, + TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem, + TopTenantShardsRequest, TopTenantShardsResponse, +}; +use pageserver_api::shard::{ShardCount, TenantShardId}; +use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; use scopeguard::defer; -use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tenant_size_model::svg::SvgBranchKind; +use tenant_size_model::{SizeResult, StorageModel}; use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::auth::SwappableJwtAuth; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::config::PageServerConf; -use crate::context::RequestContextBuilder; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; -use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ - GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, - TenantSlotUpsertError, TenantStateError, + GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, + TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError, +}; +use crate::tenant::remote_timeline_client::index::GcCompactionState; +use crate::tenant::remote_timeline_client::{ + download_index_part, list_remote_tenant_shards, list_remote_timelines, }; -use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; -use crate::tenant::remote_timeline_client; -use crate::tenant::remote_timeline_client::download_index_part; -use 
crate::tenant::remote_timeline_client::list_remote_tenant_shards; -use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; -use crate::tenant::storage_layer::IoConcurrency; -use crate::tenant::storage_layer::LayerAccessStatsReset; -use crate::tenant::storage_layer::LayerName; -use crate::tenant::timeline::import_pgdata; -use crate::tenant::timeline::offload::offload_timeline; -use crate::tenant::timeline::offload::OffloadError; -use crate::tenant::timeline::CompactFlags; -use crate::tenant::timeline::CompactOptions; -use crate::tenant::timeline::CompactRequest; -use crate::tenant::timeline::CompactionError; -use crate::tenant::timeline::Timeline; -use crate::tenant::timeline::WaitLsnTimeout; -use crate::tenant::timeline::WaitLsnWaiter; -use crate::tenant::GetTimelineError; -use crate::tenant::OffloadedTimeline; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; -use crate::DEFAULT_PG_VERSION; -use crate::{disk_usage_eviction_task, tenant}; -use http_utils::{ - endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, - error::{ApiError, HttpErrorBody}, - json::{json_request, json_request_maybe, json_response}, - request::parse_request_param, - RequestExt, RouterBuilder, +use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; +use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; +use crate::tenant::timeline::{ + CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout, + WaitLsnWaiter, import_pgdata, }; -use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, - TimelineInfo, -}; -use utils::{ - auth::SwappableJwtAuth, - generation::Generation, - id::{TenantId, TimelineId}, - lsn::Lsn, +use crate::tenant::{ + GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, 
PageReconstructError, + remote_timeline_client, }; +use crate::{DEFAULT_PG_VERSION, disk_usage_eviction_task, tenant}; // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of @@ -499,10 +460,7 @@ async fn build_timeline_info_common( initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), - // Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally - // we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we - // actually trimmed data to), which can pass each other when PITR is changed. - latest_gc_cutoff_lsn: min_readable_lsn, + _unused: Default::default(), // Unused, for legacy decode only min_readable_lsn, applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(), current_logical_size: current_logical_size.size_dont_care_about_accuracy(), @@ -523,6 +481,7 @@ async fn build_timeline_info_common( state, is_archived: Some(is_archived), + rel_size_migration: Some(timeline.get_rel_size_v2_status()), walreceiver_status, }; @@ -899,6 +858,75 @@ async fn timeline_archival_config_handler( json_response(StatusCode::OK, ()) } +/// This API is used to patch the index part of a timeline. You must ensure such patches are safe to apply. Use this API as an emergency +/// measure only. +/// +/// Some examples of safe patches: +/// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case of a bug that didn't bump the cutoff and cause read errors. +/// - Force set the index part to use reldir v2 (migrating/migrated). +/// +/// Some examples of unsafe patches: +/// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace and cause +/// errors. +/// - Decrease the gc_cutoff without validating the data really exists. 
It will cause read errors in the background. +async fn timeline_patch_index_part_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?; + check_permission(&request, None)?; // require global permission for this request + let state = get_state(&request); + + async { + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + if let Some(rel_size_migration) = request_data.rel_size_migration { + timeline + .update_rel_size_v2_status(rel_size_migration) + .map_err(ApiError::InternalServerError)?; + } + + if let Some(gc_compaction_last_completed_lsn) = + request_data.gc_compaction_last_completed_lsn + { + timeline + .update_gc_compaction_state(GcCompactionState { + last_completed_lsn: gc_compaction_last_completed_lsn, + }) + .map_err(ApiError::InternalServerError)?; + } + + if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn { + { + let guard = timeline.applied_gc_cutoff_lsn.lock_for_write(); + guard.store_and_unlock(applied_gc_cutoff_lsn); + } + } + + if request_data.force_index_update { + timeline + .remote_client + .force_schedule_index_upload() + .context("force schedule index upload") + .map_err(ApiError::InternalServerError)?; + } + + Ok::<_, ApiError>(()) + } + .instrument(info_span!("timeline_patch_index_part", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + %timeline_id)) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, @@ -923,12 +951,13 @@ async fn timeline_detail_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let timeline = 
tenant.get_timeline(timeline_id, false)?; + let ctx = &ctx.with_scope_timeline(&timeline); let timeline_info = build_timeline_info( &timeline, include_non_incremental_logical_size.unwrap_or(false), force_await_initial_logical_size.unwrap_or(false), - &ctx, + ctx, ) .await .context("get local timeline info") @@ -969,11 +998,11 @@ async fn get_lsn_by_timestamp_handler( let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; @@ -1042,10 +1071,11 @@ async fn get_timestamp_of_lsn_handler( .with_context(|| format!("Invalid LSN: {lsn_str:?}")) .map_err(ApiError::BadRequest)?; - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?; match result { @@ -1128,12 +1158,12 @@ async fn tenant_list_handler( ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into()) })? 
.iter() - .map(|(id, state, gen)| TenantInfo { + .map(|(id, state, gen_)| TenantInfo { id: *id, state: state.clone(), current_physical_size: None, attachment_status: state.attachment_status(), - generation: (*gen) + generation: (*gen_) .into() .expect("Tenants are always attached with a generation"), gc_blocking: None, @@ -1400,7 +1430,8 @@ async fn timeline_layer_scan_disposable_keys( active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let guard = timeline.layers.read().await; let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else { @@ -1410,7 +1441,7 @@ async fn timeline_layer_scan_disposable_keys( }; let resident_layer = layer - .download_and_keep_resident() + .download_and_keep_resident(&ctx) .await .map_err(|err| match err { tenant::storage_layer::layer::DownloadError::TimelineShutdown @@ -1478,6 +1509,7 @@ async fn timeline_download_heatmap_layers_handler( let desired_concurrency = parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY); + let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false); check_permission(&request, Some(tenant_shard_id.tenant_id))?; @@ -1485,6 +1517,8 @@ async fn timeline_download_heatmap_layers_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let max_concurrency = get_config(&request) .remote_storage_config @@ -1493,7 +1527,7 @@ async fn timeline_download_heatmap_layers_handler( .unwrap_or(DEFAULT_MAX_CONCURRENCY); let concurrency = std::cmp::min(max_concurrency, desired_concurrency); - timeline.start_heatmap_layers_download(concurrency).await?; + 
timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?; json_response(StatusCode::ACCEPTED, ()) } @@ -1532,8 +1566,10 @@ async fn layer_download_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let downloaded = timeline - .download_layer(&layer_name) + .download_layer(&layer_name, &ctx) .await .map_err(|e| match e { tenant::storage_layer::layer::DownloadError::TimelineShutdown @@ -1670,9 +1706,8 @@ async fn block_or_unblock_gc( request: Request, block: bool, ) -> Result, ApiError> { - use crate::tenant::{ - remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized, - }; + use crate::tenant::remote_timeline_client::WaitCompletionError; + use crate::tenant::upload_queue::NotInitialized; let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -2058,7 +2093,9 @@ async fn tenant_time_travel_remote_storage_handler( ))); } - tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); + tracing::info!( + "Issuing time travel request internally. 
timestamp={timestamp_raw}, done_if_after={done_if_after_raw}" + ); remote_timeline_client::upload::time_travel_recover_tenant( &state.remote_storage, @@ -2266,8 +2303,8 @@ async fn timeline_compact_handler( .unwrap_or(false); async { - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline); if scheduled { let tenant = state .tenant_manager @@ -2354,6 +2391,7 @@ async fn timeline_checkpoint_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); + flags |= CompactFlags::NoYield; // run compaction to completion if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { flags |= CompactFlags::ForceL0Compaction; } @@ -2374,8 +2412,8 @@ async fn timeline_checkpoint_handler( parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); async { - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline); if wait_until_flushed { timeline.freeze_and_flush().await } else { @@ -2396,7 +2434,8 @@ async fn timeline_checkpoint_handler( CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), - CompactionError::Other(e) => ApiError::InternalServerError(e) + CompactionError::Other(e) => ApiError::InternalServerError(e), + CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)), } )?; } @@ -2429,7 +2468,9 @@ async fn 
timeline_download_remote_layers_handler_post( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - match timeline.spawn_download_all_remote_layers(body).await { + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); + match timeline.spawn_download_all_remote_layers(body, &ctx).await { Ok(st) => json_response(StatusCode::ACCEPTED, st), Err(st) => json_response(StatusCode::CONFLICT, st), } @@ -2458,12 +2499,16 @@ async fn timeline_detach_ancestor_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - use crate::tenant::timeline::detach_ancestor; use pageserver_api::models::detach_ancestor::AncestorDetached; + use crate::tenant::timeline::detach_ancestor; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let behavior: Option = parse_query_param(&request, "detach_behavior")?; + + let behavior = behavior.unwrap_or_default(); let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); @@ -2510,9 +2555,10 @@ async fn timeline_detach_ancestor_handler( tracing::info!("all timeline upload queues are drained"); let timeline = tenant.get_timeline(timeline_id, true)?; + let ctx = &ctx.with_scope_timeline(&timeline); let progress = timeline - .prepare_to_detach_from_ancestor(&tenant, options, ctx) + .prepare_to_detach_from_ancestor(&tenant, options, behavior, ctx) .await?; // uncomment to allow early as possible Tenant::drop @@ -2527,6 +2573,7 @@ async fn timeline_detach_ancestor_handler( tenant_shard_id, timeline_id, prepared, + behavior, attempt, ctx, ) @@ -2616,8 +2663,9 @@ async fn getpage_at_lsn_handler_inner( async { let ctx = 
RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); // Enable read path debugging - let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build(); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true) + .scope(context::Scope::new_timeline(&timeline)).build(); // Use last_record_lsn if no lsn is provided let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); @@ -2651,8 +2699,8 @@ async fn timeline_collect_keyspace( let at_lsn: Option = parse_query_param(&request, "at_lsn")?; async { - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline); let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let (dense_ks, sparse_ks) = timeline .collect_keyspace(at_lsn, &ctx) @@ -2805,14 +2853,19 @@ async fn tenant_scan_remote_handler( .await { Ok((index_part, index_generation, _index_mtime)) => { - tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", - index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); + tracing::info!( + "Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + index_part.layer_metadata.len(), + index_part.metadata.disk_consistent_lsn() + ); generation = std::cmp::max(generation, index_generation); } Err(DownloadError::NotFound) => { // This is normal for tenants that were created with multiple shards: they have an unsharded path // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. 
- tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + tracing::info!( + "Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping" + ); continue; } Err(e) => { @@ -3172,6 +3225,7 @@ async fn post_top_tenants( match order_by { TenantSorting::ResidentSize => sizes.resident_size, TenantSorting::MaxLogicalSize => sizes.max_logical_size, + TenantSorting::MaxLogicalSizePerShard => sizes.max_logical_size_per_shard, } } @@ -3284,7 +3338,7 @@ async fn put_tenant_timeline_import_basebackup( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - let timeline = tenant + let (timeline, timeline_ctx) = tenant .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) .map_err(ApiError::InternalServerError) .await?; @@ -3303,7 +3357,13 @@ async fn put_tenant_timeline_import_basebackup( info!("importing basebackup"); timeline - .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx) + .import_basebackup_from_tar( + tenant.clone(), + &mut body, + base_lsn, + broker_client, + &timeline_ctx, + ) .await .map_err(ApiError::InternalServerError)?; @@ -3343,6 +3403,7 @@ async fn put_tenant_timeline_import_wal( let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?; + let ctx = RequestContextBuilder::extend(&ctx).scope(context::Scope::new_timeline(&timeline)).build(); let mut body = StreamReader::new(request.into_body().map(|res| { res.map_err(|error| { @@ -3431,7 +3492,9 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow anyhow::bail!("unexpected non-zero bytes after the tar archive"); } if trailing_bytes % 512 != 0 { - anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); + anyhow::bail!( + "unexpected number of zeros 
({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive" + ); } Ok(()) } @@ -3657,6 +3720,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part", + |r| api_handler(r, timeline_patch_index_part_handler), + ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease", |r| api_handler(r, lsn_lease_handler), diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index a73fa5cec8..6dd005de50 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -4,14 +4,22 @@ //! use std::path::{Path, PathBuf}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{Context, Result, bail, ensure}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; use pageserver_api::key::rel_block_to_key; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::relfile_utils::*; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{ + BLCKSZ, ControlFileData, DBState_DB_SHUTDOWNED, Oid, WAL_SEGMENT_SIZE, XLogFileName, + pg_constants, +}; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; use tracing::*; +use utils::lsn::Lsn; use wal_decoder::models::InterpretedWalRecord; use walkdir::WalkDir; @@ -20,16 +28,6 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; -use pageserver_api::reltag::{RelTag, SlruKind}; -use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::*; -use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::ControlFileData; -use postgres_ffi::DBState_DB_SHUTDOWNED; -use postgres_ffi::Oid; -use postgres_ffi::XLogFileName; -use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; -use utils::lsn::Lsn; // Returns checkpoint LSN from controlfile pub fn 
get_lsn_from_controlfile(path: &Utf8Path) -> Result { diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 491c9fb96c..6cfecef0cf 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,4 +1,5 @@ -use std::{num::NonZeroUsize, sync::Arc}; +use std::num::NonZeroUsize; +use std::sync::Arc; #[derive(Debug, PartialEq, Eq, Clone)] pub enum L0FlushConfig { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f43cd08cf7..8373d0bd87 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -15,7 +15,8 @@ pub mod l0_flush; extern crate hyper0 as hyper; -use futures::{stream::FuturesUnordered, StreamExt}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; mod assert_u64_eq_usize; @@ -35,10 +36,8 @@ pub mod walredo; use camino::Utf8Path; use deletion_queue::DeletionQueue; -use tenant::{ - mgr::{BackgroundPurges, TenantManager}, - secondary, -}; +use tenant::mgr::{BackgroundPurges, TenantManager}; +use tenant::secondary; use tracing::{info, info_span}; /// Current storage format version @@ -65,6 +64,7 @@ pub struct CancellableTask { pub cancel: CancellationToken, } pub struct HttpEndpointListener(pub CancellableTask); +pub struct HttpsEndpointListener(pub CancellableTask); pub struct ConsumptionMetricsTasks(pub CancellableTask); pub struct DiskUsageEvictionTask(pub CancellableTask); impl CancellableTask { @@ -78,6 +78,7 @@ impl CancellableTask { #[allow(clippy::too_many_arguments)] pub async fn shutdown_pageserver( http_listener: HttpEndpointListener, + https_listener: Option, page_service: page_service::Listener, consumption_metrics_worker: ConsumptionMetricsTasks, disk_usage_eviction_task: Option, @@ -214,6 +215,15 @@ pub async fn shutdown_pageserver( ) .await; + if let Some(https_listener) = https_listener { + timed( + https_listener.0.shutdown(), + "shutdown https", + Duration::from_secs(1), + ) + .await; + } + // Shut 
down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. @@ -350,9 +360,10 @@ async fn timed_after_cancellation( #[cfg(test)] mod timed_tests { - use super::timed; use std::time::Duration; + use super::timed; + #[tokio::test] async fn timed_completes_when_inner_future_completes() { // A future that completes on time should have its result returned diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e1c26b0684..f7afaae068 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -10,11 +10,11 @@ use std::time::{Duration, Instant}; use enum_map::{Enum as _, EnumMap}; use futures::Future; use metrics::{ + Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, - IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::config::{ @@ -24,9 +24,8 @@ use pageserver_api::config::{ use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; use pin_project_lite::pin_project; -use postgres_backend::{is_expected_io_error, QueryError}; +use postgres_backend::{QueryError, is_expected_io_error}; use pq_proto::framed::ConnectionError; - use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use utils::id::TimelineId; @@ -35,12 +34,12 @@ use crate::config::PageServerConf; use 
crate::context::{PageContentKind, RequestContext}; use crate::pgdatadir_mapping::DatadirModificationStats; use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; use crate::tenant::layer_map::LayerMap; use crate::tenant::mgr::TenantSlot; use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::throttle::ThrottleResult; -use crate::tenant::Timeline; /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. In other words, operations that directly affect that latency of user @@ -144,6 +143,29 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_per_read_batch_global", + "Layers visited to serve a single read batch (read amplification), regardless of number of reads.", + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], + ) + .expect("failed to define a metric") +}); + +pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_per_read_amortized_global", + "Layers visited to serve a single read (read amplification). Amortized across a batch: \ + all visited layers are divided by number of reads.", + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], + ) + .expect("failed to define a metric") +}); + pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { // We expect this to be low because of Postgres checkpoints. Let's see if that holds. 
register_histogram!( @@ -363,7 +385,7 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy = pub(crate) mod page_cache_eviction_metrics { use std::num::NonZeroUsize; - use metrics::{register_int_counter_vec, IntCounter, IntCounterVec}; + use metrics::{IntCounter, IntCounterVec, register_int_counter_vec}; use once_cell::sync::Lazy; #[derive(Clone, Copy)] @@ -443,12 +465,40 @@ pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) { pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wait_lsn_seconds", - "Time spent waiting for WAL to arrive", + "Time spent waiting for WAL to arrive. Updated on completion of the wait_lsn operation.", CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); +pub(crate) static WAIT_LSN_START_FINISH_COUNTERPAIR: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_wait_lsn_started_count", + "Number of wait_lsn operations started.", + "pageserver_wait_lsn_finished_count", + "Number of wait_lsn operations finished.", + &["tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub(crate) static WAIT_LSN_IN_PROGRESS_MICROS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_wait_lsn_in_progress_micros", + "Time spent waiting for WAL to arrive, by timeline_id. Updated periodically while waiting.", + &["tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_wait_lsn_in_progress_micros_global", + "Time spent waiting for WAL to arrive, globally. Updated periodically while waiting." 
+ ) + .expect("failed to define a metric") +}); + static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { register_gauge_vec!( "pageserver_flush_wait_upload_seconds", @@ -722,7 +772,7 @@ pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy = Lazy::new(|| { }); pub(crate) mod initial_logical_size { - use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; + use metrics::{IntCounter, IntCounterVec, register_int_counter, register_int_counter_vec}; use once_cell::sync::Lazy; pub(crate) struct StartCalculation(IntCounterVec); @@ -1105,12 +1155,17 @@ impl EvictionsWithLowResidenceDuration { // - future "drop panick => abort" // // so just nag: (the error has the labels) - tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}"); + tracing::warn!( + "failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}" + ); } Ok(()) => { // to help identify cases where we double-remove the same values, let's log all // deletions? 
- tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source); + tracing::info!( + "removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", + self.data_source + ); } } } @@ -1200,11 +1255,24 @@ impl StorageIoTime { pub(crate) static STORAGE_IO_TIME_METRIC: Lazy = Lazy::new(StorageIoTime::new); -const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; +#[derive(Clone, Copy)] +#[repr(usize)] +enum StorageIoSizeOperation { + Read, + Write, +} + +impl StorageIoSizeOperation { + const VARIANTS: &'static [&'static str] = &["read", "write"]; + + fn as_str(&self) -> &'static str { + Self::VARIANTS[*self as usize] + } +} // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1 -pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { - register_int_gauge_vec!( +static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( "pageserver_io_operations_bytes_total", "Total amount of bytes read/written in IO operations", &["operation", "tenant_id", "shard_id", "timeline_id"] @@ -1212,6 +1280,34 @@ pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[derive(Clone, Debug)] +pub(crate) struct StorageIoSizeMetrics { + pub read: UIntGauge, + pub write: UIntGauge, +} + +impl StorageIoSizeMetrics { + pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self { + let read = STORAGE_IO_SIZE + .get_metric_with_label_values(&[ + StorageIoSizeOperation::Read.as_str(), + tenant_id, + shard_id, + timeline_id, + ]) + .unwrap(); + let write = STORAGE_IO_SIZE + .get_metric_with_label_values(&[ + StorageIoSizeOperation::Write.as_str(), + tenant_id, + shard_id, + timeline_id, + ]) + .unwrap(); + Self { read, write } + } +} + #[cfg(not(test))] pub(crate) mod virtual_file_descriptor_cache { use super::*; @@ -2762,7 +2858,6 @@ impl StorageTimeMetrics { } } -#[derive(Debug)] 
pub(crate) struct TimelineMetrics { tenant_id: String, shard_id: String, @@ -2794,6 +2889,9 @@ pub(crate) struct TimelineMetrics { /// Number of valid LSN leases. pub valid_lsn_lease_count_gauge: UIntGauge, pub wal_records_received: IntCounter, + pub storage_io_size: StorageIoSizeMetrics, + pub wait_lsn_in_progress_micros: GlobalAndPerTenantIntCounter, + pub wait_lsn_start_finish_counterpair: IntCounterPair, shutdown: std::sync::atomic::AtomicBool, } @@ -2929,6 +3027,19 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id); + + let wait_lsn_in_progress_micros = GlobalAndPerTenantIntCounter { + global: WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS.clone(), + per_tenant: WAIT_LSN_IN_PROGRESS_MICROS + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(), + }; + + let wait_lsn_start_finish_counterpair = WAIT_LSN_START_FINISH_COUNTERPAIR + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + TimelineMetrics { tenant_id, shard_id, @@ -2958,8 +3069,11 @@ impl TimelineMetrics { evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), + storage_io_size, valid_lsn_lease_count_gauge, wal_records_received, + wait_lsn_in_progress_micros, + wait_lsn_start_finish_counterpair, shutdown: std::sync::atomic::AtomicBool::default(), } } @@ -3148,10 +3262,19 @@ impl TimelineMetrics { ]); } - for op in STORAGE_IO_SIZE_OPERATIONS { + for op in StorageIoSizeOperation::VARIANTS { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } + let _ = + WAIT_LSN_IN_PROGRESS_MICROS.remove_label_values(&[tenant_id, shard_id, timeline_id]); + + { + let mut res = [Ok(()), Ok(())]; + WAIT_LSN_START_FINISH_COUNTERPAIR + .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id]); + } + let _ = 
SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), tenant_id, @@ -3574,12 +3697,10 @@ impl>, O, E> Future for MeasuredRemoteOp { } pub mod tokio_epoll_uring { - use std::{ - collections::HashMap, - sync::{Arc, Mutex}, - }; + use std::collections::HashMap; + use std::sync::{Arc, Mutex}; - use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge}; + use metrics::{Histogram, LocalHistogram, UIntGauge, register_histogram, register_int_counter}; use once_cell::sync::Lazy; /// Shared storage for tokio-epoll-uring thread local metrics. @@ -3588,7 +3709,9 @@ pub mod tokio_epoll_uring { let slots_submission_queue_depth = register_histogram!( "pageserver_tokio_epoll_uring_slots_submission_queue_depth", "The slots waiters queue depth of each tokio_epoll_uring system", - vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], ) .expect("failed to define a metric"); ThreadLocalMetricsStorage { @@ -3764,27 +3887,29 @@ pub mod tokio_epoll_uring { }); } +pub(crate) struct GlobalAndPerTenantIntCounter { + global: IntCounter, + per_tenant: IntCounter, +} + +impl GlobalAndPerTenantIntCounter { + #[inline(always)] + pub(crate) fn inc(&self) { + self.inc_by(1) + } + #[inline(always)] + pub(crate) fn inc_by(&self, n: u64) { + self.global.inc_by(n); + self.per_tenant.inc_by(n); + } +} + pub(crate) mod tenant_throttling { - use metrics::{register_int_counter_vec, IntCounter}; + use metrics::register_int_counter_vec; use once_cell::sync::Lazy; use utils::shard::TenantShardId; - pub(crate) struct GlobalAndPerTenantIntCounter { - global: IntCounter, - per_tenant: IntCounter, - } - - impl GlobalAndPerTenantIntCounter { - #[inline(always)] - pub(crate) fn inc(&self) { - self.inc_by(1) - } - #[inline(always)] - pub(crate) fn inc_by(&self, n: u64) { - self.global.inc_by(n); - self.per_tenant.inc_by(n); - } - } + use 
super::GlobalAndPerTenantIntCounter; pub(crate) struct Metrics { pub(super) count_accounted_start: GlobalAndPerTenantIntCounter, @@ -4030,6 +4155,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { &CIRCUIT_BREAKERS_BROKEN, &CIRCUIT_BREAKERS_UNBROKEN, &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL, + &WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS, ] .into_iter() .for_each(|c| { @@ -4070,6 +4196,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ &LAYERS_PER_READ_GLOBAL, + &LAYERS_PER_READ_BATCH_GLOBAL, + &LAYERS_PER_READ_AMORTIZED_GLOBAL, &DELTAS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 45bf02362a..984dd125a9 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -67,23 +67,18 @@ //! mapping is automatically removed and the slot is marked free. //! -use std::{ - collections::{hash_map::Entry, HashMap}, - sync::{ - atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, - Arc, Weak, - }, - time::Duration, -}; +use std::collections::HashMap; +use std::collections::hash_map::Entry; +use std::sync::atomic::{AtomicU8, AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::time::Duration; use anyhow::Context; use once_cell::sync::OnceCell; -use crate::{ - context::RequestContext, - metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, - virtual_file::{IoBufferMut, IoPageSlice}, -}; +use crate::context::RequestContext; +use crate::metrics::{PageCacheSizeMetrics, page_cache_eviction_metrics}; +use crate::virtual_file::{IoBufferMut, IoPageSlice}; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; @@ -168,11 +163,7 @@ impl Slot { let count_res = self.usage_count .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| { - if val == 0 { - None - } else { - Some(val - 1) - } + if val == 0 { None } else { Some(val - 1) } }); match count_res { diff --git 
a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b9b8e32753..94571cbaaa 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,7 +1,15 @@ //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. -use anyhow::{bail, Context}; +use std::borrow::Cow; +use std::num::NonZeroUsize; +use std::os::fd::AsRawFd; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime}; +use std::{io, str}; + +use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; use futures::FutureExt; @@ -11,71 +19,58 @@ use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedExecutionStrategy, }; -use pageserver_api::models::{self, TenantState}; +use pageserver_api::key::rel_block_to_key; use pageserver_api::models::{ - PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, + self, PageTraceEvent, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, - PagestreamProtocolVersion, PagestreamRequest, + PagestreamProtocolVersion, PagestreamRequest, TenantState, }; +use pageserver_api::reltag::SlruKind; use pageserver_api::shard::TenantShardId; use postgres_backend::{ - is_expected_io_error, AuthType, PostgresBackend, PostgresBackendReader, QueryError, + AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error, }; +use postgres_ffi::BLCKSZ; +use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use pq_proto::framed::ConnectionError; -use pq_proto::FeStartupPacket; -use pq_proto::{BeMessage, FeMessage, RowDescriptor}; -use std::borrow::Cow; -use std::io; -use 
std::num::NonZeroUsize; -use std::str; -use std::str::FromStr; -use std::sync::Arc; -use std::time::SystemTime; -use std::time::{Duration, Instant}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor}; use strum_macros::IntoStaticStr; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::io::{AsyncWriteExt, BufWriter}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::logging::warn_slow; +use utils::auth::{Claims, Scope, SwappableJwtAuth}; +use utils::failpoint_support; +use utils::id::{TenantId, TimelineId}; +use utils::logging::log_slow; +use utils::lsn::Lsn; +use utils::simple_rcu::RcuReadGuard; use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; -use utils::{ - auth::{Claims, Scope, SwappableJwtAuth}, - id::{TenantId, TimelineId}, - lsn::Lsn, - simple_rcu::RcuReadGuard, -}; use crate::auth::check_permission; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self, SmgrOpTimer}; -use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; +use crate::metrics::{ + self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer, + TimelineMetrics, +}; use crate::pgdatadir_mapping::Version; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::task_mgr::TaskKind; -use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; -use crate::tenant::mgr::ShardSelector; -use crate::tenant::mgr::TenantManager; -use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::span::{ + debug_assert_current_span_has_tenant_and_timeline_id, + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, +}; +use crate::task_mgr::{self, 
COMPUTE_REQUEST_RUNTIME, TaskKind}; +use crate::tenant::mgr::{ + GetActiveTenantError, GetTenantError, ShardResolveResult, ShardSelector, TenantManager, +}; use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::{self, WaitLsnError}; -use crate::tenant::GetTimelineError; -use crate::tenant::PageReconstructError; -use crate::tenant::Timeline; +use crate::tenant::{GetTimelineError, PageReconstructError, Timeline}; use crate::{basebackup, timed_after_cancellation}; -use pageserver_api::key::rel_block_to_key; -use pageserver_api::models::PageTraceEvent; -use pageserver_api::reltag::SlruKind; -use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which /// is not yet in state [`TenantState::Active`]. @@ -83,8 +78,8 @@ use std::os::fd::AsRawFd; /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); -/// Threshold at which to log a warning about slow GetPage requests. -const WARN_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); +/// Threshold at which to log slow GetPage requests. 
+const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); /////////////////////////////////////////////////////////////////////////////// @@ -398,10 +393,6 @@ impl TimelineHandles { .await .map_err(|e| match e { timeline::handle::GetError::TenantManager(e) => e, - timeline::handle::GetError::TimelineGateClosed => { - trace!("timeline gate closed"); - GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) - } timeline::handle::GetError::PerTimelineStateShutDown => { trace!("per-timeline state shut down"); GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) @@ -428,24 +419,36 @@ pub(crate) struct TenantManagerTypes; impl timeline::handle::Types for TenantManagerTypes { type TenantManagerError = GetActiveTimelineError; type TenantManager = TenantManagerWrapper; - type Timeline = Arc; + type Timeline = TenantManagerCacheItem; } -impl timeline::handle::ArcTimeline for Arc { - fn gate(&self) -> &utils::sync::gate::Gate { - &self.gate - } +pub(crate) struct TenantManagerCacheItem { + pub(crate) timeline: Arc, + // allow() for cheap propagation through RequestContext inside a task + #[allow(clippy::redundant_allocation)] + pub(crate) metrics: Arc>, + #[allow(dead_code)] // we store it to keep the gate open + pub(crate) gate_guard: GateGuard, +} +impl std::ops::Deref for TenantManagerCacheItem { + type Target = Arc; + fn deref(&self) -> &Self::Target { + &self.timeline + } +} + +impl timeline::handle::Timeline for TenantManagerCacheItem { fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId { - Timeline::shard_timeline_id(self) + Timeline::shard_timeline_id(&self.timeline) } fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState { - &self.handles + &self.timeline.handles } fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity { - Timeline::get_shard_identity(self) + Timeline::get_shard_identity(&self.timeline) } } @@ -454,7 +457,7 @@ impl timeline::handle::TenantManager for TenantManagerWrappe 
&self, timeline_id: TimelineId, shard_selector: ShardSelector, - ) -> Result, GetActiveTimelineError> { + ) -> Result { let tenant_id = self.tenant_id.get().expect("we set this in get()"); let timeout = ACTIVE_TENANT_TIMEOUT; let wait_start = Instant::now(); @@ -497,7 +500,23 @@ impl timeline::handle::TenantManager for TenantManagerWrappe let timeline = tenant_shard .get_timeline(timeline_id, true) .map_err(GetActiveTimelineError::Timeline)?; - Ok(timeline) + + let gate_guard = match timeline.gate.enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetActiveTimelineError::Timeline( + GetTimelineError::ShuttingDown, + )); + } + }; + + let metrics = Arc::new(Arc::clone(&timeline.metrics)); + + Ok(TenantManagerCacheItem { + timeline, + metrics, + gate_guard, + }) } } @@ -985,7 +1004,7 @@ impl PageServerHandler { Ok(BatchedFeMessage::GetPage { span: _, shard: accum_shard, - pages: ref mut accum_pages, + pages: accum_pages, effective_request_lsn: accum_lsn, }), BatchedFeMessage::GetPage { @@ -1086,132 +1105,19 @@ impl PageServerHandler { batch }; - // invoke handler function - let (mut handler_results, span): ( - Vec>, - _, - ) = match batch { - BatchedFeMessage::Exists { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::exists"); - ( - vec![self - .handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - BatchedFeMessage::Nblocks { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::nblocks"); - ( - vec![self - .handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - BatchedFeMessage::GetPage { - span, - shard, - effective_request_lsn, - pages, - } => { - 
fail::fail_point!("ps::handle-pagerequest-message::getpage"); - ( - { - let npages = pages.len(); - trace!(npages, "handling getpage request"); - let res = self - .handle_get_page_at_lsn_request_batched( - &*shard.upgrade()?, - effective_request_lsn, - pages, - io_concurrency, - ctx, - ) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::DbSize { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::dbsize"); - ( - vec![self - .handle_db_size_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - BatchedFeMessage::GetSlruSegment { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); - ( - vec![self - .handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - #[cfg(feature = "testing")] - BatchedFeMessage::Test { - span, - shard, - requests, - } => { - fail::fail_point!("ps::handle-pagerequest-message::test"); - ( - { - let npages = requests.len(); - trace!(npages, "handling getpage request"); - let res = self - .handle_test_request_batch(&*shard.upgrade()?, requests, ctx) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::RespondError { span, error } => { - // We've already decided to respond with an error, so we don't need to - // call the handler. - (vec![Err(error)], span) - } + // Dispatch the batch to the appropriate request handler. + let log_slow_name = batch.as_static_str(); + let (mut handler_results, span) = { + // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and + // won't fit on the stack. 
+ let mut boxpinned = + Box::pin(self.pagestream_dispatch_batched_message(batch, io_concurrency, ctx)); + log_slow( + log_slow_name, + LOG_SLOW_GETPAGE_THRESHOLD, + boxpinned.as_mut(), + ) + .await? }; // We purposefully don't count flush time into the smgr operation timer. @@ -1298,6 +1204,8 @@ impl PageServerHandler { &response_msg.serialize(protocol_version), ))?; + failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel); + // what we want to do let socket_fd = pgb_writer.socket_fd; let flush_fut = pgb_writer.flush(); @@ -1330,6 +1238,163 @@ impl PageServerHandler { Ok(()) } + /// Helper which dispatches a batched message to the appropriate handler. + /// Returns a vec of results, along with the extracted trace span. + async fn pagestream_dispatch_batched_message( + &mut self, + batch: BatchedFeMessage, + io_concurrency: IoConcurrency, + ctx: &RequestContext, + ) -> Result< + ( + Vec>, + Span, + ), + QueryError, + > { + macro_rules! upgrade_handle_and_set_context { + ($shard:ident) => {{ + let weak_handle = &$shard; + let handle = weak_handle.upgrade()?; + let ctx = ctx.with_scope_page_service_pagestream(&handle); + (handle, ctx) + }}; + } + Ok(match batch { + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); + ( + vec![ + self.handle_get_rel_exists_request(&shard, &req, &ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); + ( + vec![ + self.handle_get_nblocks_request(&shard, &req, &ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: 
req.hdr }), + ], + span, + ) + } + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages, + } => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); + ( + { + let npages = pages.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_get_page_at_lsn_request_batched( + &shard, + effective_request_lsn, + pages, + io_concurrency, + &ctx, + ) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); + ( + vec![ + self.handle_db_size_request(&shard, &req, &ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); + ( + vec![ + self.handle_get_slru_segment_request(&shard, &req, &ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { + span, + shard, + requests, + } => { + fail::fail_point!("ps::handle-pagerequest-message::test"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); + ( + { + let npages = requests.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_test_request_batch(&shard, requests, &ctx) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::RespondError { span, error } => { + // We've already decided to respond with an error, so we don't need to + // 
call the handler. + (vec![Err(error)], span) + } + }) + } + /// Pagestream sub-protocol handler. /// /// It is a simple request-response protocol inside a COPYBOTH session. @@ -1473,19 +1538,16 @@ impl PageServerHandler { } }; - let result = warn_slow( - msg.as_static_str(), - WARN_SLOW_GETPAGE_THRESHOLD, - self.pagesteam_handle_batched_message( + let result = self + .pagesteam_handle_batched_message( pgb_writer, msg, io_concurrency.clone(), &cancel, protocol_version, ctx, - ), - ) - .await; + ) + .await; match result { Ok(()) => {} Err(e) => break e, @@ -1649,17 +1711,13 @@ impl PageServerHandler { return Err(e); } }; - warn_slow( - batch.as_static_str(), - WARN_SLOW_GETPAGE_THRESHOLD, - self.pagesteam_handle_batched_message( - pgb_writer, - batch, - io_concurrency.clone(), - &cancel, - protocol_version, - &ctx, - ), + self.pagesteam_handle_batched_message( + pgb_writer, + batch, + io_concurrency.clone(), + &cancel, + protocol_version, + &ctx, ) .await?; } @@ -2083,6 +2141,7 @@ impl PageServerHandler { // TODO: passthrough the error site to the final error message? BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)), BasebackupError::Server(e) => QueryError::Other(e), + BasebackupError::Shutdown => QueryError::Shutdown, } } @@ -2095,9 +2154,12 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; set_tracing_field_shard_id(&timeline); + let ctx = ctx.with_scope_timeline(&timeline); if timeline.is_archived() == Some(true) { - tracing::info!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it."); + tracing::info!( + "timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it." 
+ ); return Err(QueryError::NotFound("timeline is archived".into())); } @@ -2110,7 +2172,7 @@ impl PageServerHandler { lsn, crate::tenant::timeline::WaitLsnWaiter::PageService, crate::tenant::timeline::WaitLsnTimeout::Default, - ctx, + &ctx, ) .await?; timeline @@ -2136,7 +2198,7 @@ impl PageServerHandler { prev_lsn, full_backup, replica, - ctx, + &ctx, ) .await .map_err(map_basebackup_error)?; @@ -2159,7 +2221,7 @@ impl PageServerHandler { prev_lsn, full_backup, replica, - ctx, + &ctx, ) .await .map_err(map_basebackup_error)?; @@ -2176,7 +2238,7 @@ impl PageServerHandler { prev_lsn, full_backup, replica, - ctx, + &ctx, ) .await .map_err(map_basebackup_error)?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d0e2dab042..4685f9383b 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,6 +6,37 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! +use std::collections::{BTreeMap, HashMap, HashSet, hash_map}; +use std::ops::{ControlFlow, Range}; + +use anyhow::{Context, ensure}; +use bytes::{Buf, Bytes, BytesMut}; +use enum_map::Enum; +use itertools::Itertools; +use pageserver_api::key::{ + AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists, + TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, + rel_size_to_key, rel_tag_sparse_key, rel_tag_sparse_key_range, relmap_file_key, + repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, + slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, +}; +use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::models::RelSizeMigration; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::value::Value; +use 
postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId}; +use serde::{Deserialize, Serialize}; +use strum::IntoEnumIterator; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, trace, warn}; +use utils::bin_ser::{BeSer, DeserializeError}; +use utils::lsn::Lsn; +use utils::pausable_failpoint; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; + use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; use crate::context::RequestContext; @@ -19,37 +50,6 @@ use crate::span::{ }; use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::GetVectoredError; -use anyhow::{ensure, Context}; -use bytes::{Buf, Bytes, BytesMut}; -use enum_map::Enum; -use itertools::Itertools; -use pageserver_api::key::{ - dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, - rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range, - slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, - twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY, - CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, -}; -use pageserver_api::key::{rel_tag_sparse_key, Key}; -use pageserver_api::keyspace::SparseKeySpace; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::value::Value; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::BLCKSZ; -use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; -use serde::{Deserialize, Serialize}; -use std::collections::{hash_map, BTreeMap, HashMap, HashSet}; -use std::ops::ControlFlow; -use std::ops::Range; -use strum::IntoEnumIterator; -use tokio_util::sync::CancellationToken; -use tracing::{debug, info, trace, warn}; -use 
utils::bin_ser::DeserializeError; -use utils::pausable_failpoint; -use utils::{bin_ser::BeSer, lsn::Lsn}; -use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -327,16 +327,16 @@ impl Timeline { let clone = match &res { Ok(buf) => Ok(buf.clone()), Err(err) => Err(match err { - PageReconstructError::Cancelled => { - PageReconstructError::Cancelled - } + PageReconstructError::Cancelled => PageReconstructError::Cancelled, - x @ PageReconstructError::Other(_) | - x @ PageReconstructError::AncestorLsnTimeout(_) | - x @ PageReconstructError::WalRedo(_) | - x @ PageReconstructError::MissingKey(_) => { - PageReconstructError::Other(anyhow::anyhow!("there was more than one request for this key in the batch, error logged once: {x:?}")) - }, + x @ PageReconstructError::Other(_) + | x @ PageReconstructError::AncestorLsnTimeout(_) + | x @ PageReconstructError::WalRedo(_) + | x @ PageReconstructError::MissingKey(_) => { + PageReconstructError::Other(anyhow::anyhow!( + "there was more than one request for this key in the batch, error logged once: {x:?}" + )) + } }), }; @@ -355,23 +355,23 @@ impl Timeline { // this whole `match` is a lot like `From for PageReconstructError` // but without taking ownership of the GetVectoredError let err = match &err { - GetVectoredError::Cancelled => { - Err(PageReconstructError::Cancelled) - } + GetVectoredError::Cancelled => Err(PageReconstructError::Cancelled), // TODO: restructure get_vectored API to make this error per-key GetVectoredError::MissingKey(err) => { - Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more of the requested keys were missing: {err:?}"))) + Err(PageReconstructError::Other(anyhow::anyhow!( + "whole vectored get request failed because one or more of the requested keys were 
missing: {err:?}" + ))) } // TODO: restructure get_vectored API to make this error per-key GetVectoredError::GetReadyAncestorError(err) => { - Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"))) + Err(PageReconstructError::Other(anyhow::anyhow!( + "whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}" + ))) } // TODO: restructure get_vectored API to make this error per-key - GetVectoredError::Other(err) => { - Err(PageReconstructError::Other( - anyhow::anyhow!("whole vectored get request failed: {err:?}"), - )) - } + GetVectoredError::Other(err) => Err(PageReconstructError::Other( + anyhow::anyhow!("whole vectored get request failed: {err:?}"), + )), // TODO: we can prevent this error class by moving this check into the type system GetVectoredError::InvalidLsn(e) => { Err(anyhow::anyhow!("invalid LSN: {e:?}").into()) @@ -379,10 +379,7 @@ impl Timeline { // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS // TODO: we can prevent this error class by moving this check into the type system GetVectoredError::Oversized(err) => { - Err(anyhow::anyhow!( - "batching oversized: {err:?}" - ) - .into()) + Err(anyhow::anyhow!("batching oversized: {err:?}").into()) } }; @@ -496,7 +493,9 @@ impl Timeline { // Otherwise, read the old reldir keyspace. // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2. - if self.get_rel_size_v2_enabled() { + if let RelSizeMigration::Migrated | RelSizeMigration::Migrating = + self.get_rel_size_v2_status() + { // fetch directory listing (new) let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum); let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?) 
@@ -548,7 +547,7 @@ impl Timeline { forknum: *forknum, })); - if !self.get_rel_size_v2_enabled() { + if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() { return Ok(rels_v1); } @@ -603,28 +602,36 @@ impl Timeline { let n_blocks = self .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) .await?; - let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); - for blkno in 0..n_blocks { - let block = self - .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx) - .await?; - segment.extend_from_slice(&block[..BLCKSZ as usize]); - } - Ok(segment.freeze()) - } - /// Look up given SLRU page version. - pub(crate) async fn get_slru_page_at_lsn( - &self, - kind: SlruKind, - segno: u32, - blknum: BlockNumber, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result { - assert!(self.tenant_shard_id.is_shard_zero()); - let key = slru_block_to_key(kind, segno, blknum); - self.get(key, lsn, ctx).await + let keyspace = KeySpace::single( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks), + ); + + let batches = keyspace.partition( + self.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); + + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + + let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); + for batch in batches.parts { + let blocks = self + .get_vectored(batch, lsn, io_concurrency.clone(), ctx) + .await?; + + for (_key, block) in blocks { + let block = block?; + segment.extend_from_slice(&block[..BLCKSZ as usize]); + } + } + + Ok(segment.freeze()) } /// Get size of an SLRU segment @@ -715,7 +722,10 @@ impl Timeline { { Ok(res) => res, Err(PageReconstructError::MissingKey(e)) => { - warn!("Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. 
Last error: {:#}", e); + warn!( + "Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", + e + ); // Return that we didn't find any requests smaller than the LSN, and logging the error. return Ok(LsnForTimestamp::Past(min_lsn)); } @@ -830,19 +840,41 @@ impl Timeline { let nblocks = self .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx) .await?; - for blknum in (0..nblocks).rev() { - let clog_page = self - .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx) + + let keyspace = KeySpace::single( + slru_block_to_key(SlruKind::Clog, segno, 0) + ..slru_block_to_key(SlruKind::Clog, segno, nblocks), + ); + + let batches = keyspace.partition( + self.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); + + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + + for batch in batches.parts.into_iter().rev() { + let blocks = self + .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx) .await?; - if clog_page.len() == BLCKSZ as usize + 8 { - let mut timestamp_bytes = [0u8; 8]; - timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); - let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); + for (_key, clog_page) in blocks.into_iter().rev() { + let clog_page = clog_page?; - match f(timestamp) { - ControlFlow::Break(b) => return Ok(b), - ControlFlow::Continue(()) => (), + if clog_page.len() == BLCKSZ as usize + 8 { + let mut timestamp_bytes = [0u8; 8]; + timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); + let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); + + match f(timestamp) { + ControlFlow::Break(b) => return Ok(b), + ControlFlow::Continue(()) => (), + } } } } @@ -1053,6 +1085,8 @@ impl Timeline { ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + 
fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) }); + // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; let dbdir = DbDirectory::des(&buf)?; @@ -1719,6 +1753,35 @@ impl DatadirModification<'_> { Ok(()) } + /// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that + /// we enable it, we also need to persist it in `index_part.json`. + pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result { + let status = self.tline.get_rel_size_v2_status(); + let config = self.tline.get_rel_size_v2_enabled(); + match (config, status) { + (false, RelSizeMigration::Legacy) => { + // tenant config didn't enable it and we didn't write any reldir_v2 key yet + Ok(false) + } + (false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => { + // index_part already persisted that the timeline has enabled rel_size_v2 + Ok(true) + } + (true, RelSizeMigration::Legacy) => { + // The first time we enable it, we need to persist it in `index_part.json` + self.tline + .update_rel_size_v2_status(RelSizeMigration::Migrating)?; + tracing::info!("enabled rel_size_v2"); + Ok(true) + } + (true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => { + // index_part already persisted that the timeline has enabled rel_size_v2 + // and we don't need to do anything + Ok(true) + } + } + } + /// Store a relmapper file (pg_filenode.map) in the repository pub async fn put_relmap_file( &mut self, @@ -1727,6 +1790,8 @@ impl DatadirModification<'_> { img: Bytes, ctx: &RequestContext, ) -> anyhow::Result<()> { + let v2_enabled = self.maybe_enable_rel_size_v2()?; + // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; @@ -1747,7 +1812,7 @@ impl DatadirModification<'_> { })?; self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); - if self.tline.get_rel_size_v2_enabled() { + if v2_enabled { 
self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); } @@ -1899,12 +1964,12 @@ impl DatadirModification<'_> { .context("deserialize db")? }; - // Add the new relation to the rel directory entry, and write it back - if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); - } + let v2_enabled = self.maybe_enable_rel_size_v2()?; - if self.tline.get_rel_size_v2_enabled() { + if v2_enabled { + if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) { + return Err(RelationError::AlreadyExists); + } let sparse_rel_dir_key = rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); // check if the rel_dir_key exists in v2 @@ -1939,6 +2004,10 @@ impl DatadirModification<'_> { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Add(1))); } else { + // Add the new relation to the rel directory entry, and write it back + if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { + return Err(RelationError::AlreadyExists); + } if !dbdir_exists { self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Set(0))) @@ -1952,6 +2021,7 @@ impl DatadirModification<'_> { )), ); } + // Put size let size_key = rel_size_to_key(rel); let buf = nblocks.to_le_bytes(); @@ -2030,6 +2100,7 @@ impl DatadirModification<'_> { drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, ) -> anyhow::Result<()> { + let v2_enabled = self.maybe_enable_rel_size_v2()?; for ((spc_node, db_node), rel_tags) in drop_relations { let dir_key = rel_dir_to_key(spc_node, db_node); let buf = self.get(dir_key, ctx).await?; @@ -2042,7 +2113,7 @@ impl DatadirModification<'_> { .push((DirectoryKind::Rel, MetricsUpdate::Sub(1))); dirty = true; true - } else if self.tline.get_rel_size_v2_enabled() { + } else if v2_enabled { // The rel is not found in the old reldir key, so we need to check the new sparse keyspace. 
// Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion // logic). @@ -2073,7 +2144,7 @@ impl DatadirModification<'_> { // Remove entry from relation size cache self.tline.remove_cached_rel_size(&rel_tag); - // Delete size entry, as well as all blocks + // Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage. self.delete(rel_key_range(rel_tag)); } } @@ -2464,10 +2535,12 @@ impl DatadirModification<'_> { // modifications before ingesting DB create operations, which are the only kind that reads // data pages during ingest. if cfg!(debug_assertions) { - assert!(!self - .pending_data_batch - .as_ref() - .is_some_and(|b| b.updates_key(&key))); + assert!( + !self + .pending_data_batch + .as_ref() + .is_some_and(|b| b.updates_key(&key)) + ); } } @@ -2666,15 +2739,14 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); #[cfg(test)] mod tests { use hex_literal::hex; - use pageserver_api::{models::ShardParameters, shard::ShardStripeSize}; - use utils::{ - id::TimelineId, - shard::{ShardCount, ShardNumber}, - }; + use pageserver_api::models::ShardParameters; + use pageserver_api::shard::ShardStripeSize; + use utils::id::TimelineId; + use utils::shard::{ShardCount, ShardNumber}; use super::*; - - use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::TenantHarness; /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline #[tokio::test] @@ -2686,7 +2758,7 @@ mod tests { TimelineId::from_array(hex!("11223344556677881122334455667788")); let (tenant, ctx) = harness.load().await; - let tline = tenant + let (tline, ctx) = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; let tline = tline.raw_timeline().unwrap(); diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 
4e8be58d58..85c2ed8499 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -73,11 +73,10 @@ impl Statvfs { pub mod mock { use camino::Utf8Path; + pub use pageserver_api::config::statvfs::mock::Behavior; use regex::Regex; use tracing::log::info; - pub use pageserver_api::config::statvfs::mock::Behavior; - pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result { info!("running mocked statvfs"); @@ -85,7 +84,7 @@ pub mod mock { Behavior::Success { blocksize, total_blocks, - ref name_filter, + name_filter, } => { let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); @@ -134,7 +133,7 @@ pub mod mock { } Err(e) => { return Err(anyhow::Error::new(e) - .context(format!("get metadata of {:?}", entry.path()))) + .context(format!("get metadata of {:?}", entry.path()))); } }; total += m.len(); diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index cc93a06ccd..0b71b2cf5b 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -40,15 +40,12 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; +use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; - use tracing::{debug, error, info, warn}; - -use once_cell::sync::Lazy; - use utils::env; use utils::id::TimelineId; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 56718f5294..55b5704d67 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,149 +12,101 @@ //! parent timeline, and the last LSN that has been written to disk. //! 
-use anyhow::{bail, Context}; +use std::collections::hash_map::Entry; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::fmt::{Debug, Display}; +use std::fs::File; +use std::future::Future; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::{Arc, Mutex, Weak}; +use std::time::{Duration, Instant, SystemTime}; +use std::{fmt, fs}; + +use anyhow::{Context, bail}; use arc_swap::ArcSwap; -use camino::Utf8Path; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use chrono::NaiveDateTime; use enumset::EnumSet; -use futures::stream::FuturesUnordered; use futures::StreamExt; +use futures::stream::FuturesUnordered; use itertools::Itertools as _; -use pageserver_api::models; -use pageserver_api::models::CompactInfoResponse; -use pageserver_api::models::LsnLease; -use pageserver_api::models::TimelineArchivalState; -use pageserver_api::models::TimelineState; -use pageserver_api::models::TopTenantShardItem; -use pageserver_api::models::WalRedoManagerStatus; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::shard::ShardStripeSize; -use pageserver_api::shard::TenantShardId; -use remote_storage::DownloadError; -use remote_storage::GenericRemoteStorage; -use remote_storage::TimeoutOrCancel; -use remote_timeline_client::manifest::{ - OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION, +use once_cell::sync::Lazy; +pub use pageserver_api::models::TenantState; +use pageserver_api::models::{self, RelSizeMigration}; +use pageserver_api::models::{ + CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem, + WalRedoManagerStatus, }; -use remote_timeline_client::UploadQueueNotReadyError; -use remote_timeline_client::FAILED_REMOTE_OP_RETRIES; -use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD; -use secondary::heatmap::HeatMapTenant; -use secondary::heatmap::HeatMapTimeline; -use std::collections::BTreeMap; -use std::fmt; -use std::future::Future; -use 
std::sync::atomic::AtomicBool; -use std::sync::Weak; -use std::time::SystemTime; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId}; +use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel}; +use remote_timeline_client::index::GcCompactionState; +use remote_timeline_client::manifest::{ + LATEST_TENANT_MANIFEST_VERSION, OffloadedTimelineManifest, TenantManifest, +}; +use remote_timeline_client::{ + FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD, UploadQueueNotReadyError, +}; +use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; -use timeline::compaction::CompactionOutcome; -use timeline::compaction::GcCompactionQueue; -use timeline::import_pgdata; -use timeline::offload::offload_timeline; -use timeline::offload::OffloadError; -use timeline::CompactFlags; -use timeline::CompactOptions; -use timeline::CompactionError; -use timeline::PreviousHeatmap; -use timeline::ShutdownMode; +use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; +use timeline::offload::{OffloadError, offload_timeline}; +use timeline::{ + CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata, +}; use tokio::io::BufReader; -use tokio::sync::watch; -use tokio::sync::Notify; +use tokio::sync::{Notify, Semaphore, watch}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use upload_queue::NotInitialized; -use utils::backoff; use utils::circuit_breaker::CircuitBreaker; -use utils::completion; use utils::crashsafe::path_with_suffix_extension; -use utils::failpoint_support; -use utils::fs_ext; -use utils::pausable_failpoint; -use utils::sync::gate::Gate; -use utils::sync::gate::GateGuard; -use utils::timeout::timeout_cancellable; -use utils::timeout::TimeoutCancellableError; +use utils::sync::gate::{Gate, GateGuard}; +use utils::timeout::{TimeoutCancellableError, timeout_cancellable}; use utils::try_rcu::ArcSwapExt; -use 
utils::zstd::create_zst_tarball; -use utils::zstd::extract_zst_tarball; +use utils::zstd::{create_zst_tarball, extract_zst_tarball}; +use utils::{backoff, completion, failpoint_support, fs_ext, pausable_failpoint}; -use self::config::AttachedLocationConfig; -use self::config::AttachmentMode; -use self::config::LocationConf; -use self::config::TenantConf; +use self::config::{AttachedLocationConfig, AttachmentMode, LocationConf, TenantConf}; use self::metadata::TimelineMetadata; -use self::mgr::GetActiveTenantError; -use self::mgr::GetTenantError; +use self::mgr::{GetActiveTenantError, GetTenantError}; use self::remote_timeline_client::upload::{upload_index_part, upload_tenant_manifest}; use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; -use self::timeline::uninit::TimelineCreateGuard; -use self::timeline::uninit::TimelineExclusionError; -use self::timeline::uninit::UninitializedTimeline; -use self::timeline::EvictionTaskTenantState; -use self::timeline::GcCutoffs; -use self::timeline::TimelineDeleteProgress; -use self::timeline::TimelineResources; -use self::timeline::WaitLsnError; -use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::deletion_queue::DeletionQueueClient; -use crate::deletion_queue::DeletionQueueError; -use crate::import_datadir; -use crate::l0_flush::L0FlushGlobalState; -use crate::metrics::CONCURRENT_INITDBS; -use crate::metrics::INITDB_RUN_TIME; -use crate::metrics::INITDB_SEMAPHORE_ACQUISITION_TIME; -use crate::metrics::TENANT; -use crate::metrics::{ - remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, - TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, +use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, UninitializedTimeline}; +use self::timeline::{ + EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError, +}; +use crate::config::PageServerConf; +use 
crate::context; +use crate::context::RequestContextBuilder; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; +use crate::l0_flush::L0FlushGlobalState; +use crate::metrics::{ + BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS, + INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC, + TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics, }; -use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::config::LocationMode; -use crate::tenant::config::TenantConfOpt; +use crate::tenant::config::{LocationMode, TenantConfOpt}; use crate::tenant::gc_result::GcResult; pub use crate::tenant::remote_timeline_client::index::IndexPart; -use crate::tenant::remote_timeline_client::remote_initdb_archive_path; -use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; -use crate::tenant::remote_timeline_client::INITDB_PATH; -use crate::tenant::storage_layer::DeltaLayer; -use crate::tenant::storage_layer::ImageLayer; -use crate::walingest::WalLagCooldown; -use crate::walredo; -use crate::InitializationOrder; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::collections::HashSet; -use std::fmt::Debug; -use std::fmt::Display; -use std::fs; -use std::fs::File; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Arc; -use std::sync::Mutex; -use std::time::{Duration, Instant}; - -use crate::span; +use crate::tenant::remote_timeline_client::{ + INITDB_PATH, MaybeDeletedIndexPart, remote_initdb_archive_path, +}; +use crate::tenant::storage_layer::{DeltaLayer, ImageLayer}; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; +use crate::walingest::WalLagCooldown; use crate::walredo::PostgresRedoManager; -use crate::TEMP_FILE_SUFFIX; -use once_cell::sync::Lazy; -pub use 
pageserver_api::models::TenantState; -use tokio::sync::Semaphore; +use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo}; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); -use utils::{ - crashsafe, - generation::Generation, - id::TimelineId, - lsn::{Lsn, RecordLsn}, -}; +use utils::crashsafe; +use utils::generation::Generation; +use utils::id::TimelineId; +use utils::lsn::{Lsn, RecordLsn}; pub mod blob_io; pub mod block_io; @@ -183,9 +135,9 @@ mod gc_block; mod gc_result; pub(crate) mod throttle; -pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; +pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; @@ -250,7 +202,9 @@ impl AttachedTenantConf { Ok(Self::new(location_conf.tenant_conf, *attach_conf)) } LocationMode::Secondary(_) => { - anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") + anyhow::bail!( + "Attempted to construct AttachedTenantConf from a LocationConf in secondary mode" + ) } } } @@ -464,7 +418,9 @@ impl WalredoManagerId { static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed); if id == 0 { - panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique"); + panic!( + "WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique" + ); } Self(id) } @@ -1160,7 +1116,7 @@ impl Tenant { } }; - let timeline = self.create_timeline_struct( + let (timeline, timeline_ctx) = self.create_timeline_struct( timeline_id, &metadata, previous_heatmap, @@ -1168,6 +1124,9 @@ impl Tenant { resources, CreateTimelineCause::Load, idempotency.clone(), + index_part.gc_compaction.clone(), + 
index_part.rel_size_migration.clone(), + ctx, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -1194,16 +1153,19 @@ impl Tenant { // a previous heatmap which contains all visible layers in the layer map. // This previous heatmap will be used whenever a fresh heatmap is generated // for the timeline. - if matches!(cause, LoadTimelineCause::Unoffload) { + if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) { let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn())); while let Some((tline, end_lsn)) = tline_ending_at { let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await; - if !tline.is_previous_heatmap_active() { + // Another unearchived timeline might have generated a heatmap for this ancestor. + // If the current branch point greater than the previous one use the the heatmap + // we just generated - it should include more layers. + if !tline.should_keep_previous_heatmap(end_lsn) { tline .previous_heatmap .store(Some(Arc::new(unarchival_heatmap))); } else { - tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.") + tracing::info!("Previous heatmap preferred. Dropping unarchival heatmap.") } match tline.ancestor_timeline() { @@ -1227,7 +1189,9 @@ impl Tenant { match cause { LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), LoadTimelineCause::ImportPgdata { .. 
} => { - unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3") + unreachable!( + "ImportPgdata should not be reloading timeline import is done and persisted as such in s3" + ) } } let mut guard = self.timelines_creating.lock().unwrap(); @@ -1260,8 +1224,8 @@ impl Tenant { // We should never try and load the same timeline twice during startup Entry::Occupied(_) => { unreachable!( - "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" - ); + "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" + ); } Entry::Vacant(v) => { v.insert(Arc::clone(&timeline)); @@ -1296,7 +1260,7 @@ impl Tenant { match activate { ActivateTimelineArgs::Yes { broker_client } => { info!("activating timeline after reload from pgdata import task"); - timeline.activate(self.clone(), broker_client, None, ctx); + timeline.activate(self.clone(), broker_client, None, &timeline_ctx); } ActivateTimelineArgs::No => (), } @@ -1621,6 +1585,10 @@ impl Tenant { } async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> { + if !self.conf.load_previous_heatmap { + return None; + } + let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id); match tokio::fs::read_to_string(on_disk_heatmap_path).await { Ok(heatmap) => match serde_json::from_str::(&heatmap) { @@ -1655,7 +1623,9 @@ impl Tenant { failpoint_support::sleep_millis_async!("before-attaching-tenant"); let Some(preload) = preload else { - anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); + anyhow::bail!( + "local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624" + ); }; let mut offloaded_timeline_ids = HashSet::new(); @@ -1798,6 +1768,7 @@ impl Tenant { import_pgdata, ActivateTimelineArgs::No, guard, + ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), )); } } @@ -1815,6 +1786,7 @@ impl Tenant { 
timeline_id, &index_part.metadata, remote_timeline_client, + ctx, ) .instrument(tracing::info_span!("timeline_delete", %timeline_id)) .await @@ -1980,6 +1952,7 @@ impl Tenant { hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active { heatmap: h, read_at: hs.1, + end_lsn: None, }) }); part_downloads.spawn( @@ -2039,7 +2012,7 @@ impl Tenant { remote_storage: GenericRemoteStorage, previous_heatmap: Option, cancel: CancellationToken, - ) -> impl Future { + ) -> impl Future + use<> { let client = self.build_timeline_client(timeline_id, remote_storage); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -2251,7 +2224,7 @@ impl Tenant { self.clone(), broker_client.clone(), background_jobs_can_start, - &ctx, + &ctx.with_scope_timeline(&timeline), ); } @@ -2448,8 +2421,8 @@ impl Tenant { new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, - _ctx: &RequestContext, - ) -> anyhow::Result { + ctx: &RequestContext, + ) -> anyhow::Result<(UninitializedTimeline, RequestContext)> { anyhow::ensure!( self.is_active(), "Cannot create empty timelines on inactive tenant" @@ -2483,6 +2456,8 @@ impl Tenant { create_guard, initdb_lsn, None, + None, + ctx, ) .await } @@ -2500,7 +2475,7 @@ impl Tenant { pg_version: u32, ctx: &RequestContext, ) -> anyhow::Result> { - let uninit_tl = self + let (uninit_tl, ctx) = self .create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx) .await?; let tline = uninit_tl.raw_timeline().expect("we just created it"); @@ -2512,7 +2487,7 @@ impl Tenant { .init_empty_test_timeline() .context("init_empty_test_timeline")?; modification - .commit(ctx) + .commit(&ctx) .await .context("commit init_empty_test_timeline modification")?; @@ -2538,6 +2513,7 @@ impl Tenant { initdb_lsn: Lsn, pg_version: u32, ctx: &RequestContext, + in_memory_layer_desc: Vec, delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, @@ -2559,6 +2535,11 @@ impl Tenant { 
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx) .await?; } + for in_memory in in_memory_layer_desc { + tline + .force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx) + .await?; + } let layer_names = tline .layers .read() @@ -2724,7 +2705,12 @@ impl Tenant { // doing stuff before the IndexPart is durable in S3, which is done by the previous section. let activated_timeline = match result { CreateTimelineResult::Created(timeline) => { - timeline.activate(self.clone(), broker_client, None, ctx); + timeline.activate( + self.clone(), + broker_client, + None, + &ctx.with_scope_timeline(&timeline), + ); timeline } CreateTimelineResult::Idempotent(timeline) => { @@ -2734,7 +2720,9 @@ impl Tenant { timeline } CreateTimelineResult::ImportSpawned(timeline) => { - info!("import task spawned, timeline will become visible and activated once the import is done"); + info!( + "import task spawned, timeline will become visible and activated once the import is done" + ); timeline } }; @@ -2780,14 +2768,13 @@ impl Tenant { { StartCreatingTimelineResult::CreateGuard(guard) => guard, StartCreatingTimelineResult::Idempotent(timeline) => { - return Ok(CreateTimelineResult::Idempotent(timeline)) + return Ok(CreateTimelineResult::Idempotent(timeline)); } }; - let mut uninit_timeline = { + let (mut uninit_timeline, timeline_ctx) = { let this = &self; let initdb_lsn = Lsn(0); - let _ctx = ctx; async move { let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to @@ -2806,6 +2793,8 @@ impl Tenant { timeline_create_guard, initdb_lsn, None, + None, + ctx, ) .await } @@ -2835,6 +2824,7 @@ impl Tenant { index_part, activate, timeline_create_guard, + timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), )); // NB: the timeline doesn't exist in self.timelines at this point @@ -2848,6 +2838,7 @@ impl Tenant { index_part: import_pgdata::index_part_format::Root, activate: ActivateTimelineArgs, 
timeline_create_guard: TimelineCreateGuard, + ctx: RequestContext, ) { debug_assert_current_span_has_tenant_and_timeline_id(); info!("starting"); @@ -2859,6 +2850,7 @@ impl Tenant { index_part, activate, timeline_create_guard, + ctx, ) .await; if let Err(err) = &res { @@ -2874,9 +2866,8 @@ impl Tenant { index_part: import_pgdata::index_part_format::Root, activate: ActivateTimelineArgs, timeline_create_guard: TimelineCreateGuard, + ctx: RequestContext, ) -> Result<(), anyhow::Error> { - let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn); - info!("importing pgdata"); import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone()) .await @@ -2914,7 +2905,9 @@ impl Tenant { let index_part = match index_part { MaybeDeletedIndexPart::Deleted(_) => { // likely concurrent delete call, cplane should prevent this - anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but") + anyhow::bail!( + "index part says deleted but we are not done creating yet, this should not happen but" + ) } MaybeDeletedIndexPart::IndexPart(p) => p, }; @@ -3083,6 +3076,7 @@ impl Tenant { let mut has_pending_l0 = false; for timeline in compact_l0 { + let ctx = &ctx.with_scope_timeline(&timeline); let outcome = timeline .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx) .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) @@ -3116,6 +3110,7 @@ impl Tenant { if !timeline.is_active() { continue; } + let ctx = &ctx.with_scope_timeline(&timeline); let mut outcome = timeline .compact(cancel, EnumSet::default(), ctx) @@ -3125,20 +3120,19 @@ impl Tenant { // If we're done compacting, check the scheduled GC compaction queue for more work. 
if outcome == CompactionOutcome::Done { - let queue = self - .scheduled_compaction_tasks - .lock() - .unwrap() - .get(&timeline.timeline_id) - .cloned(); - if let Some(queue) = queue { - outcome = queue - .iteration(cancel, ctx, &self.gc_block, &timeline) - .instrument( - info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), - ) - .await?; - } + let queue = { + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard + .entry(timeline.timeline_id) + .or_insert_with(|| Arc::new(GcCompactionQueue::new())) + .clone() + }; + outcome = queue + .iteration(cancel, ctx, &self.gc_block, &timeline) + .instrument( + info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), + ) + .await?; } // If we're done compacting, offload the timeline if requested. @@ -3179,11 +3173,13 @@ impl Tenant { /// Trips the compaction circuit breaker if appropriate. pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) { match err { + err if err.is_cancel() => {} CompactionError::ShuttingDown => (), // Offload failures don't trip the circuit breaker, since they're cheap to retry and // shouldn't block compaction. CompactionError::Offload(_) => {} CompactionError::CollectKeySpaceError(err) => { + // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch. 
self.compaction_circuit_breaker .lock() .unwrap() @@ -3195,6 +3191,7 @@ impl Tenant { .unwrap() .fail(&CIRCUIT_BREAKERS_BROKEN, err); } + CompactionError::AlreadyRunning(_) => {} } } @@ -3339,7 +3336,7 @@ impl Tenant { self.clone(), broker_client.clone(), background_jobs_can_start, - ctx, + &ctx.with_scope_timeline(timeline), ); activated_timelines += 1; } @@ -3845,6 +3842,7 @@ impl Tenant { resident_size: 0, physical_size: 0, max_logical_size: 0, + max_logical_size_per_shard: 0, }; for timeline in self.timelines.lock().unwrap().values() { @@ -3861,6 +3859,10 @@ impl Tenant { ); } + result.max_logical_size_per_shard = result + .max_logical_size + .div_ceil(self.tenant_shard_id.shard_count.count() as u64); + result } } @@ -3905,7 +3907,9 @@ where if !later.is_empty() { for (missing_id, orphan_ids) in later { for (orphan_id, _) in orphan_ids { - error!("could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded"); + error!( + "could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded" + ); } } bail!("could not load tenant because some timelines are missing ancestors"); @@ -4150,7 +4154,10 @@ impl Tenant { resources: TimelineResources, cause: CreateTimelineCause, create_idempotency: CreateTimelineIdempotency, - ) -> anyhow::Result> { + gc_compaction_state: Option, + rel_size_v2_status: Option, + ctx: &RequestContext, + ) -> anyhow::Result<(Arc, RequestContext)> { let state = match cause { CreateTimelineCause::Load => { let ancestor_id = new_metadata.ancestor_timeline(); @@ -4181,10 +4188,16 @@ impl Tenant { state, self.attach_wal_lag_cooldown.clone(), create_idempotency, + gc_compaction_state, + rel_size_v2_status, self.cancel.child_token(), ); - Ok(timeline) + let timeline_ctx = RequestContextBuilder::extend(ctx) + .scope(context::Scope::new_timeline(&timeline)) + .build(); + + Ok((timeline, timeline_ctx)) } /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] 
object @@ -4600,6 +4613,7 @@ impl Tenant { // Ensures all timelines use the same start time when computing the time cutoff. let now_ts_for_pitr_calc = SystemTime::now(); for timeline in timelines.iter() { + let ctx = &ctx.with_scope_timeline(timeline); let cutoff = timeline .get_last_record_lsn() .checked_sub(horizon) @@ -4773,7 +4787,7 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> Result { let src_id = src_timeline.timeline_id; @@ -4823,7 +4837,10 @@ impl Tenant { let gc_info = src_timeline.gc_info.read().unwrap(); let planned_cutoff = gc_info.min_cutoff(); if gc_info.lsn_covered_by_lease(start_lsn) { - tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn); + tracing::info!( + "skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", + *applied_gc_cutoff_lsn + ); } else { src_timeline .check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn) @@ -4873,13 +4890,15 @@ impl Tenant { src_timeline.pg_version, ); - let uninitialized_timeline = self + let (uninitialized_timeline, _timeline_ctx) = self .prepare_new_timeline( dst_id, &metadata, timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), + Some(src_timeline.get_rel_size_v2_status()), + ctx, ) .await?; @@ -4969,7 +4988,9 @@ impl Tenant { } // Idempotent <=> CreateTimelineIdempotency is identical (x, y) if x == y => { - info!("timeline already exists and idempotency matches, succeeding request"); + info!( + "timeline already exists and idempotency matches, succeeding request" + ); // fallthrough } (_, _) => { @@ -5051,7 +5072,7 @@ impl Tenant { { StartCreatingTimelineResult::CreateGuard(guard) => guard, StartCreatingTimelineResult::Idempotent(timeline) => { - return Ok(CreateTimelineResult::Idempotent(timeline)) + return Ok(CreateTimelineResult::Idempotent(timeline)); } 
}; @@ -5144,13 +5165,15 @@ impl Tenant { pgdata_lsn, pg_version, ); - let mut raw_timeline = self + let (mut raw_timeline, timeline_ctx) = self .prepare_new_timeline( timeline_id, &new_metadata, timeline_create_guard, pgdata_lsn, None, + None, + ctx, ) .await?; @@ -5161,7 +5184,7 @@ impl Tenant { &unfinished_timeline, &pgdata_path, pgdata_lsn, - ctx, + &timeline_ctx, ) .await .with_context(|| { @@ -5222,6 +5245,7 @@ impl Tenant { /// An empty layer map is initialized, and new data and WAL can be imported starting /// at 'disk_consistent_lsn'. After any initial data has been imported, call /// `finish_creation` to insert the Timeline into the timelines map. + #[allow(clippy::too_many_arguments)] async fn prepare_new_timeline<'a>( &'a self, new_timeline_id: TimelineId, @@ -5229,15 +5253,17 @@ impl Tenant { create_guard: TimelineCreateGuard, start_lsn: Lsn, ancestor: Option>, - ) -> anyhow::Result> { + rel_size_v2_status: Option, + ctx: &RequestContext, + ) -> anyhow::Result<(UninitializedTimeline<'a>, RequestContext)> { let tenant_shard_id = self.tenant_shard_id; let resources = self.build_timeline_resources(new_timeline_id); resources .remote_client - .init_upload_queue_for_empty_remote(new_metadata)?; + .init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?; - let timeline_struct = self + let (timeline_struct, timeline_ctx) = self .create_timeline_struct( new_timeline_id, new_metadata, @@ -5246,6 +5272,9 @@ impl Tenant { resources, CreateTimelineCause::Load, create_guard.idempotency.clone(), + None, + rel_size_v2_status, + ctx, ) .context("Failed to create timeline data structure")?; @@ -5255,7 +5284,9 @@ impl Tenant { .create_timeline_files(&create_guard.timeline_path) .await { - error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); + error!( + "Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}" + ); 
cleanup_timeline_directory(create_guard); return Err(e); } @@ -5264,10 +5295,13 @@ impl Tenant { "Successfully created initial files for timeline {tenant_shard_id}/{new_timeline_id}" ); - Ok(UninitializedTimeline::new( - self, - new_timeline_id, - Some((timeline_struct, create_guard)), + Ok(( + UninitializedTimeline::new( + self, + new_timeline_id, + Some((timeline_struct, create_guard)), + ), + timeline_ctx, )) } @@ -5620,20 +5654,19 @@ pub async fn dump_layerfile_from_path( #[cfg(test)] pub(crate) mod harness { use bytes::{Bytes, BytesMut}; + use hex_literal::hex; use once_cell::sync::OnceCell; + use pageserver_api::key::Key; use pageserver_api::models::ShardParameters; + use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::ShardIndex; + use utils::id::TenantId; use utils::logging; + use super::*; use crate::deletion_queue::mock::MockDeletionQueue; use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; - use pageserver_api::key::Key; - use pageserver_api::record::NeonWalRecord; - - use super::*; - use hex_literal::hex; - use utils::id::TenantId; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -5721,7 +5754,7 @@ pub(crate) mod harness { logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, logging::Output::Stdout, ) - .expect("Failed to init test logging") + .expect("Failed to init test logging"); }); } @@ -5803,7 +5836,8 @@ pub(crate) mod harness { } pub(crate) async fn load(&self) -> (Arc, RequestContext) { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) + .with_scope_unit_test(); ( self.do_try_load(&ctx) .await @@ -5914,34 +5948,36 @@ pub(crate) mod harness { mod tests { use std::collections::{BTreeMap, BTreeSet}; - use super::*; - use crate::keyspace::KeySpaceAccum; - use crate::tenant::harness::*; - use crate::tenant::timeline::CompactFlags; - use 
crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; + #[cfg(feature = "testing")] + use models::CompactLsnRange; + use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; + #[cfg(feature = "testing")] + use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; use pageserver_compaction::helpers::overlaps_with; - use rand::{thread_rng, Rng}; + use rand::{Rng, thread_rng}; use storage_layer::{IoConcurrency, PersistentLayerKey}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + #[cfg(feature = "testing")] + use timeline::GcInfo; + #[cfg(feature = "testing")] + use timeline::InMemoryLayerTestDesc; + #[cfg(feature = "testing")] + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; - #[cfg(feature = "testing")] - use models::CompactLsnRange; - #[cfg(feature = "testing")] - use pageserver_api::record::NeonWalRecord; - #[cfg(feature = "testing")] - use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; - #[cfg(feature = "testing")] - use timeline::GcInfo; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::keyspace::KeySpaceAccum; + use crate::tenant::harness::*; + use crate::tenant::timeline::CompactFlags; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -6191,11 +6227,12 @@ mod tests { panic!("wrong error type") }; assert!(err.to_string().contains("invalid branch start lsn")); - assert!(err - .source() - .unwrap() - .to_string() - .contains("we might've already garbage collected needed data")) + assert!( + err.source() + .unwrap() + 
.to_string() + .contains("we might've already garbage collected needed data") + ) } } @@ -6224,11 +6261,12 @@ mod tests { panic!("wrong error type"); }; assert!(&err.to_string().contains("invalid branch start lsn")); - assert!(&err - .source() - .unwrap() - .to_string() - .contains("is earlier than latest GC cutoff")); + assert!( + &err.source() + .unwrap() + .to_string() + .contains("is earlier than latest GC cutoff") + ); } } @@ -6521,7 +6559,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; let mut writer = tline.writer().await; @@ -6538,7 +6580,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; let mut writer = tline.writer().await; @@ -6555,7 +6601,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; let mut writer = tline.writer().await; @@ -6572,7 +6622,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; assert_eq!( @@ -6655,7 +6709,9 @@ mod tests { timeline.freeze_and_flush().await?; if compact { // this requires timeline to be &Arc - timeline.compact(&cancel, EnumSet::empty(), ctx).await?; + timeline + .compact(&cancel, CompactFlags::NoYield.into(), ctx) + .await?; } // this doesn't really need to use the timeline_id target, but it is closer to what it @@ -6822,7 +6878,7 @@ mod tests { let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); - let tline = tenant + let (tline, ctx) = 
tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; let tline = tline.raw_timeline().unwrap(); @@ -6982,6 +7038,7 @@ mod tests { child_timeline.freeze_and_flush().await?; let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); child_timeline .compact(&CancellationToken::new(), flags, &ctx) .await?; @@ -7360,7 +7417,9 @@ mod tests { // Perform a cycle of flush, compact, and GC tline.freeze_and_flush().await?; - tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tline + .compact(&cancel, CompactFlags::NoYield.into(), &ctx) + .await?; tenant .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; @@ -7444,7 +7503,7 @@ mod tests { .await; let initdb_lsn = Lsn(0x20); - let utline = tenant + let (utline, ctx) = tenant .create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx) .await?; let tline = utline.raw_timeline().unwrap(); @@ -7511,7 +7570,7 @@ mod tests { let harness = TenantHarness::create(name).await?; { let (tenant, ctx) = harness.load().await; - let tline = tenant + let (tline, _ctx) = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again @@ -7537,10 +7596,12 @@ mod tests { } } - assert!(!harness - .conf - .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) - .exists()); + assert!( + !harness + .conf + .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) + .exists() + ); Ok(()) } @@ -7687,6 +7748,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags } else { EnumSet::empty() @@ -7737,11 +7799,16 @@ mod tests { let before_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); - tline.compact(&cancel, EnumSet::empty(), 
&ctx).await?; + tline + .compact(&cancel, CompactFlags::NoYield.into(), &ctx) + .await?; let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); - assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); + assert!( + after_num_l0_delta_files < before_num_l0_delta_files, + "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}" + ); assert_eq!( tline.get(test_key, lsn, &ctx).await?, @@ -7850,7 +7917,6 @@ mod tests { Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) } - #[allow(clippy::needless_range_loop)] for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = (blknum * STEP) as u32; @@ -7900,6 +7966,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -7908,7 +7975,10 @@ mod tests { let (_, after_delta_file_accessed) = scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) .await?; - assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + assert!( + after_delta_file_accessed < before_delta_file_accessed, + "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}" + ); // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. 
assert!( after_delta_file_accessed <= 2, @@ -7935,6 +8005,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), // delta layers vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN @@ -7962,10 +8033,12 @@ mod tests { get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, Some(test_img("data key 1")) ); - assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) - .await - .unwrap_err() - .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); assert!( get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) .await @@ -8020,6 +8093,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), // delta layers vec![( Lsn(0x20), @@ -8235,6 +8309,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8315,6 +8390,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8353,6 +8429,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -8388,6 +8465,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8419,6 +8497,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -8520,6 +8599,7 @@ mod tests { 
Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), @@ -8713,6 +8793,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x40), delta1, @@ -8769,6 +8850,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), image_layers, end_lsn, @@ -8975,6 +9057,7 @@ mod tests { Lsn(0x08), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x08)..Lsn(0x10), @@ -8993,7 +9076,7 @@ mod tests { delta3, ), ], // delta layers - vec![], // image layers + vec![], // image layers Lsn(0x50), ) .await? @@ -9004,6 +9087,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x48), @@ -9554,6 +9638,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), @@ -9801,6 +9886,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ // delta1 and delta 2 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1), @@ -10036,6 +10122,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![], // delta layers vec![(Lsn(0x18), img_layer)], // image layers Lsn(0x18), @@ -10282,6 +10369,7 @@ mod tests { baseline_image_layer_lsn, DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![DeltaLayerTestDesc::new_with_inferred_key_range( delta_layer_start_lsn..delta_layer_end_lsn, 
delta_layer_spec, @@ -10313,6 +10401,158 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> { + let harness = + TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?; + let (tenant, ctx) = harness.load().await; + + let will_init_keys = [2, 6]; + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let mut expected_key_values = HashMap::new(); + + let baseline_image_layer_lsn = Lsn(0x10); + let mut baseline_img_layer = Vec::new(); + for i in 0..5 { + let key = get_key(i); + let value = format!("value {i}@{baseline_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + assert!(removed.is_none()); + + baseline_img_layer.push((key, Bytes::from(value))); + } + + let nested_image_layer_lsn = Lsn(0x50); + let mut nested_img_layer = Vec::new(); + for i in 5..10 { + let key = get_key(i); + let value = format!("value {i}@{nested_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + assert!(removed.is_none()); + + nested_img_layer.push((key, Bytes::from(value))); + } + + let frozen_layer = { + let lsn_range = Lsn(0x40)..Lsn(0x60); + let mut data = Vec::new(); + for i in 0..10 { + let key = get_key(i); + let key_in_nested = nested_img_layer + .iter() + .any(|(key_with_img, _)| *key_with_img == key); + let lsn = { + if key_in_nested { + Lsn(nested_image_layer_lsn.0 + 5) + } else { + lsn_range.start + } + }; + + let will_init = will_init_keys.contains(&i); + if will_init { + data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init("")))); + + expected_key_values.insert(key, "".to_string()); + } else { + let delta = format!("@{lsn}"); + data.push(( + key, + lsn, + Value::WalRecord(NeonWalRecord::wal_append(&delta)), + )); + + expected_key_values + .get_mut(&key) + .expect("An image exists for 
each key") + .push_str(delta.as_str()); + } + } + + InMemoryLayerTestDesc { + lsn_range, + is_open: false, + data, + } + }; + + let (open_layer, last_record_lsn) = { + let start_lsn = Lsn(0x70); + let mut data = Vec::new(); + let mut end_lsn = Lsn(0); + for i in 0..10 { + let key = get_key(i); + let lsn = Lsn(start_lsn.0 + i as u64); + let delta = format!("@{lsn}"); + data.push(( + key, + lsn, + Value::WalRecord(NeonWalRecord::wal_append(&delta)), + )); + + expected_key_values + .get_mut(&key) + .expect("An image exists for each key") + .push_str(delta.as_str()); + + end_lsn = std::cmp::max(end_lsn, lsn); + } + + ( + InMemoryLayerTestDesc { + lsn_range: start_lsn..Lsn::MAX, + is_open: true, + data, + }, + end_lsn, + ) + }; + + assert!( + nested_image_layer_lsn > frozen_layer.lsn_range.start + && nested_image_layer_lsn < frozen_layer.lsn_range.end + ); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + baseline_image_layer_lsn, + DEFAULT_PG_VERSION, + &ctx, + vec![open_layer, frozen_layer], // in-memory layers + Vec::new(), // delta layers + vec![ + (baseline_image_layer_lsn, baseline_img_layer), + (nested_image_layer_lsn, nested_img_layer), + ], // image layers + last_record_lsn, + ) + .await?; + + let keyspace = KeySpace::single(get_key(0)..get_key(10)); + let results = tline + .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx) + .await + .expect("No vectored errors"); + for (key, res) in results { + let value = res.expect("No key errors"); + let expected_value = expected_key_values.remove(&key).expect("No unknown keys"); + assert_eq!(value, Bytes::from(expected_value.clone())); + + tracing::info!("key={key} value={expected_value}"); + } + + Ok(()) + } + fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering { ( k1.is_delta, @@ -10428,6 +10668,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ 
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), @@ -10812,6 +11053,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ // delta1/2/4 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), @@ -11063,6 +11305,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ // delta1/2/4 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 7b55df52a5..ff9a7e57b6 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -14,6 +14,9 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! +use std::cmp::min; +use std::io::{Error, ErrorKind}; + use async_compression::Level; use bytes::{BufMut, BytesMut}; use pageserver_api::models::ImageCompressionAlgorithm; @@ -24,10 +27,8 @@ use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; -use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::VirtualFile; -use std::cmp::min; -use std::io::{Error, ErrorKind}; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; #[derive(Copy, Clone, Debug)] pub struct CompressionInfo { @@ -414,12 +415,15 @@ impl BlobWriter { #[cfg(test)] pub(crate) mod tests { - use super::*; - use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use rand::{Rng, SeedableRng}; + use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::tenant::block_io::BlockReaderRef; + async 
fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { round_trip_test_compressed::(blobs, false).await } @@ -467,7 +471,8 @@ pub(crate) mod tests { blobs: &[Vec], compression: bool, ) -> Result<(), Error> { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let (_temp_dir, pathbuf, offsets) = write_maybe_compressed::(blobs, compression, &ctx).await?; @@ -486,7 +491,7 @@ pub(crate) mod tests { pub(crate) fn random_array(len: usize) -> Vec { let mut rng = rand::thread_rng(); - (0..len).map(|_| rng.gen()).collect::<_>() + (0..len).map(|_| rng.r#gen()).collect::<_>() } #[tokio::test] @@ -544,9 +549,9 @@ pub(crate) mod tests { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let blobs = (0..1024) .map(|_| { - let mut sz: u16 = rng.gen(); + let mut sz: u16 = rng.r#gen(); // Make 50% of the arrays small - if rng.gen() { + if rng.r#gen() { sz &= 63; } random_array(sz.into()) diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 990211f80a..66c586daff 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -2,14 +2,16 @@ //! Low-level Block-oriented I/O functions //! 
+use std::ops::Deref; + +use bytes::Bytes; + use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, FileId, PAGE_SZ, PageReadGuard, PageWriteGuard, ReadBufResult}; #[cfg(test)] use crate::virtual_file::IoBufferMut; use crate::virtual_file::VirtualFile; -use bytes::Bytes; -use std::ops::Deref; /// This is implemented by anything that can read 8 kB (PAGE_SZ) /// blocks, using the page cache diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index f98356242e..d5b979ab2a 100644 --- a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -63,9 +63,9 @@ pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { && overlaps_with(&layer.key_range, &other_layer.key_range) { let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", - layer, other_layer - ); + "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", + layer, other_layer + ); return Some(err); } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index ab4c4c935d..4308db84e5 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -8,16 +8,17 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! 
+use std::num::NonZeroU64; +use std::time::Duration; + pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; -use pageserver_api::models::CompactionAlgorithmSettings; -use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::{self, TenantConfigPatch}; +use pageserver_api::models::{ + self, CompactionAlgorithmSettings, EvictionPolicy, TenantConfigPatch, +}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; use serde_json::Value; -use std::num::NonZeroU64; -use std::time::Duration; use utils::generation::Generation; use utils::postgres_client::PostgresClientProtocol; @@ -218,7 +219,11 @@ impl LocationConf { }; let shard = if conf.shard_count == 0 { - ShardIdentity::unsharded() + // NB: carry over the persisted stripe size instead of using the default. This doesn't + // matter for most practical purposes, since unsharded tenants don't use the stripe + // size, but can cause inconsistencies between storcon and Pageserver and cause manual + // splits without `new_stripe_size` to use an unintended stripe size. + ShardIdentity::unsharded_with_stripe_size(ShardStripeSize(conf.shard_stripe_size)) } else { ShardIdentity::new( ShardNumber(conf.shard_number), @@ -739,9 +744,10 @@ impl From for models::TenantConfig { #[cfg(test)] mod tests { - use super::*; use models::TenantConfig; + use super::*; + #[test] fn de_serializing_pageserver_config_omits_empty_values() { let small_conf = TenantConfOpt { diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index bb9df020b5..1791e5996c 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -18,27 +18,22 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! 
+use std::cmp::Ordering; +use std::iter::Rev; +use std::ops::{Range, RangeInclusive}; +use std::{io, result}; + use async_stream::try_stream; -use byteorder::{ReadBytesExt, BE}; +use byteorder::{BE, ReadBytesExt}; use bytes::{BufMut, Bytes, BytesMut}; use either::Either; use futures::{Stream, StreamExt}; use hex; -use std::{ - cmp::Ordering, - io, - iter::Rev, - ops::{Range, RangeInclusive}, - result, -}; use thiserror::Error; use tracing::error; -use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - tenant::block_io::{BlockReader, BlockWriter}, -}; +use crate::context::RequestContext; +use crate::tenant::block_io::{BlockReader, BlockWriter}; // The maximum size of a value stored in the B-tree. 5 bytes is enough currently. pub const VALUE_SZ: usize = 5; @@ -481,16 +476,15 @@ where } #[allow(dead_code)] - pub async fn dump(&self) -> Result<()> { + pub async fn dump(&self, ctx: &RequestContext) -> Result<()> { let mut stack = Vec::new(); - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); stack.push((self.root_blk, String::new(), 0, 0, 0)); let block_cursor = self.reader.block_cursor(); while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() { - let blk = block_cursor.read_blk(self.start_blk + blknum, &ctx).await?; + let blk = block_cursor.read_blk(self.start_blk + blknum, ctx).await?; let buf: &[u8] = blk.as_ref(); let node = OnDiskNode::::deparse(buf)?; @@ -833,12 +827,16 @@ impl BuildNode { #[cfg(test)] pub(crate) mod tests { - use super::*; - use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; - use rand::Rng; use std::collections::BTreeMap; use std::sync::atomic::{AtomicUsize, Ordering}; + use rand::Rng; + + use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; + #[derive(Clone, Default)] pub(crate) struct TestDisk { blocks: Vec, @@ -871,7 +869,8 @@ pub(crate) mod 
tests { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let all_keys: Vec<&[u8; 6]> = vec![ b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb", @@ -889,7 +888,7 @@ pub(crate) mod tests { let reader = DiskBtreeReader::new(0, root_offset, disk); - reader.dump().await?; + reader.dump(&ctx).await?; // Test the `get` function on all the keys. for (key, val) in all_data.iter() { @@ -981,7 +980,8 @@ pub(crate) mod tests { async fn lots_of_keys() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); const NUM_KEYS: u64 = 1000; @@ -999,7 +999,7 @@ pub(crate) mod tests { let reader = DiskBtreeReader::new(0, root_offset, disk); - reader.dump().await?; + reader.dump(&ctx).await?; use std::sync::Mutex; @@ -1115,7 +1115,7 @@ pub(crate) mod tests { // Test get() operations on random keys, most of which will not exist for _ in 0..100000 { - let key_int = rand::thread_rng().gen::(); + let key_int = rand::thread_rng().r#gen::(); let search_key = u128::to_be_bytes(key_int); assert!(reader.get(&search_key, &ctx).await? 
== all_data.get(&key_int).cloned()); } @@ -1169,7 +1169,8 @@ pub(crate) mod tests { // Build a tree from it let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); for (key, val) in disk_btree_test_data::TEST_DATA { writer.append(&key, val)?; @@ -1200,7 +1201,7 @@ pub(crate) mod tests { .await?; assert_eq!(count, disk_btree_test_data::TEST_DATA.len()); - reader.dump().await?; + reader.dump(&ctx).await?; Ok(()) } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index ba79672bc7..396d930f77 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -1,6 +1,17 @@ //! Implementation of append-only file data structure //! used to keep in-memory layers spilled on disk. +use std::io; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; + +use camino::Utf8PathBuf; +use num_traits::Num; +use pageserver_api::shard::TenantShardId; +use tokio_epoll_uring::{BoundedBuf, Slice}; +use tracing::{error, info_span}; +use utils::id::TimelineId; + use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; @@ -9,17 +20,7 @@ use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; use crate::virtual_file::owned_buffers_io::write::Buffer; -use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; -use camino::Utf8PathBuf; -use num_traits::Num; -use pageserver_api::shard::TenantShardId; -use tokio_epoll_uring::{BoundedBuf, Slice}; -use tracing::error; - -use std::io; -use std::sync::atomic::AtomicU64; -use std::sync::Arc; -use 
utils::id::TimelineId; +use crate::virtual_file::{self, IoBufferMut, VirtualFile, owned_buffers_io}; pub struct EphemeralFile { _tenant_shard_id: TenantShardId, @@ -75,6 +76,7 @@ impl EphemeralFile { || IoBufferMut::with_capacity(TAIL_SZ), gate.enter()?, ctx, + info_span!(parent: None, "ephemeral_file_buffered_writer", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %filename), ), _gate_guard: gate.enter()?, }) @@ -319,13 +321,14 @@ pub fn is_ephemeral_file(filename: &str) -> bool { #[cfg(test)] mod tests { + use std::fs; + use std::str::FromStr; + use rand::Rng; use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use std::fs; - use std::str::FromStr; fn harness( test_name: &str, @@ -349,7 +352,8 @@ mod tests { let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?; - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); Ok((conf, tenant_shard_id, timeline_id, ctx)) } diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs index af73acb2be..7aa920c953 100644 --- a/pageserver/src/tenant/gc_block.rs +++ b/pageserver/src/tenant/gc_block.rs @@ -1,4 +1,5 @@ -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; +use std::sync::Arc; use utils::id::TimelineId; diff --git a/pageserver/src/tenant/gc_result.rs b/pageserver/src/tenant/gc_result.rs index c805aafeab..7a7d6d19cb 100644 --- a/pageserver/src/tenant/gc_result.rs +++ b/pageserver/src/tenant/gc_result.rs @@ -1,8 +1,9 @@ -use anyhow::Result; -use serde::Serialize; use std::ops::AddAssign; use std::time::Duration; +use anyhow::Result; +use serde::Serialize; + /// /// Result of performing GC /// diff --git a/pageserver/src/tenant/layer_map.rs 
b/pageserver/src/tenant/layer_map.rs index a69cce932e..2b04e53f10 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -46,24 +46,23 @@ mod historic_layer_coverage; mod layer_coverage; -use crate::context::RequestContext; -use crate::keyspace::KeyPartitioning; -use crate::tenant::storage_layer::InMemoryLayer; -use anyhow::Result; -use pageserver_api::key::Key; -use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; -use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; + +use anyhow::Result; +use historic_layer_coverage::BufferedHistoricLayerCoverage; +pub use historic_layer_coverage::LayerKey; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; +use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use tokio::sync::watch; use utils::lsn::Lsn; -use historic_layer_coverage::BufferedHistoricLayerCoverage; -pub use historic_layer_coverage::LayerKey; - use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; +use crate::context::RequestContext; +use crate::tenant::storage_layer::{InMemoryLayer, ReadableLayerWeak}; /// /// LayerMap tracks what layers exist on a timeline. @@ -167,7 +166,7 @@ impl Drop for BatchedUpdates<'_> { /// Return value of LayerMap::search #[derive(Eq, PartialEq, Debug, Hash)] pub struct SearchResult { - pub layer: Arc, + pub layer: ReadableLayerWeak, pub lsn_floor: Lsn, } @@ -175,19 +174,37 @@ pub struct SearchResult { /// /// Contains a mapping from a layer description to a keyspace /// accumulator that contains all the keys which intersect the layer -/// from the original search space. Keys that were not found are accumulated -/// in a separate key space accumulator. +/// from the original search space. 
#[derive(Debug)] pub struct RangeSearchResult { pub found: HashMap, - pub not_found: KeySpaceAccum, } impl RangeSearchResult { fn new() -> Self { Self { found: HashMap::new(), - not_found: KeySpaceAccum::new(), + } + } + + fn map_to_in_memory_layer( + in_memory_layer: Option, + range: Range, + ) -> RangeSearchResult { + match in_memory_layer { + Some(inmem) => { + let search_result = SearchResult { + lsn_floor: inmem.get_lsn_range().start, + layer: ReadableLayerWeak::InMemoryLayer(inmem), + }; + + let mut accum = KeySpaceAccum::new(); + accum.add_range(range); + RangeSearchResult { + found: HashMap::from([(search_result, accum)]), + } + } + None => RangeSearchResult::new(), } } } @@ -199,6 +216,7 @@ struct RangeSearchCollector where Iter: Iterator>)>, { + in_memory_layer: Option, delta_coverage: Peekable, image_coverage: Peekable, key_range: Range, @@ -234,10 +252,12 @@ where fn new( key_range: Range, end_lsn: Lsn, + in_memory_layer: Option, delta_coverage: Iter, image_coverage: Iter, ) -> Self { Self { + in_memory_layer, delta_coverage: delta_coverage.peekable(), image_coverage: image_coverage.peekable(), key_range, @@ -266,8 +286,7 @@ where return self.result; } Some(layer_type) => { - // Changes for the range exist. Record anything before the first - // coverage change as not found. + // Changes for the range exist. let coverage_start = layer_type.next_change_at_key(); let range_before = self.key_range.start..coverage_start; self.pad_range(range_before); @@ -297,10 +316,22 @@ where self.result } - /// Mark a range as not found (i.e. no layers intersect it) + /// Map a range which does not intersect any persistent layers to + /// the in-memory layer candidate. 
fn pad_range(&mut self, key_range: Range) { if !key_range.is_empty() { - self.result.not_found.add_range(key_range); + if let Some(ref inmem) = self.in_memory_layer { + let search_result = SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem.clone()), + lsn_floor: inmem.get_lsn_range().start, + }; + + self.result + .found + .entry(search_result) + .or_default() + .add_range(key_range); + } } } @@ -310,6 +341,7 @@ where let selected = LayerMap::select_layer( self.current_delta.clone(), self.current_image.clone(), + self.in_memory_layer.clone(), self.end_lsn, ); @@ -365,6 +397,24 @@ where } } +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub struct InMemoryLayerDesc { + handle: InMemoryLayerHandle, + lsn_range: Range, +} + +impl InMemoryLayerDesc { + pub(crate) fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +enum InMemoryLayerHandle { + Open, + Frozen(usize), +} + impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -394,69 +444,161 @@ impl LayerMap { /// layer result, or simplify the api to `get_latest_image` and /// `get_latest_delta`, and only call `get_latest_image` once. /// - /// NOTE: This only searches the 'historic' layers, *not* the - /// 'open' and 'frozen' layers! 
- /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Option { - let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let in_memory_layer = self.search_in_memory_layer(end_lsn); + + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { + Some(version) => version, + None => { + return in_memory_layer.map(|desc| SearchResult { + lsn_floor: desc.get_lsn_range().start, + layer: ReadableLayerWeak::InMemoryLayer(desc), + }); + } + }; + let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); - Self::select_layer(latest_delta, latest_image, end_lsn) + Self::select_layer(latest_delta, latest_image, in_memory_layer, end_lsn) } + /// Select a layer from three potential candidates (in-memory, delta and image layer). + /// The candidates represent the first layer of each type which intersect a key range. + /// + /// Layer types have an implicit priority (image > delta > in-memory). For instance, + /// if we have the option of reading an LSN range from both an image and a delta, we + /// should read from the image. 
fn select_layer( delta_layer: Option>, image_layer: Option>, + in_memory_layer: Option, end_lsn: Lsn, ) -> Option { assert!(delta_layer.as_ref().is_none_or(|l| l.is_delta())); assert!(image_layer.as_ref().is_none_or(|l| !l.is_delta())); - match (delta_layer, image_layer) { - (None, None) => None, - (None, Some(image)) => { + match (delta_layer, image_layer, in_memory_layer) { + (None, None, None) => None, + (None, Some(image), None) => { let lsn_floor = image.get_lsn_range().start; Some(SearchResult { - layer: image, + layer: ReadableLayerWeak::PersistentLayer(image), lsn_floor, }) } - (Some(delta), None) => { + (Some(delta), None, None) => { let lsn_floor = delta.get_lsn_range().start; Some(SearchResult { - layer: delta, + layer: ReadableLayerWeak::PersistentLayer(delta), lsn_floor, }) } - (Some(delta), Some(image)) => { + (Some(delta), Some(image), None) => { let img_lsn = image.get_lsn_range().start; let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; let image_exact_match = img_lsn + 1 == end_lsn; if image_is_newer || image_exact_match { Some(SearchResult { - layer: image, + layer: ReadableLayerWeak::PersistentLayer(image), + lsn_floor: img_lsn, + }) + } else { + // If the delta overlaps with the image in the LSN dimension, do a partial + // up to the image layer. + let lsn_floor = + std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + Some(SearchResult { + layer: ReadableLayerWeak::PersistentLayer(delta), + lsn_floor, + }) + } + } + (None, None, Some(inmem)) => { + let lsn_floor = inmem.get_lsn_range().start; + Some(SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem), + lsn_floor, + }) + } + (None, Some(image), Some(inmem)) => { + // If the in-memory layer overlaps with the image in the LSN dimension, do a partial + // up to the image layer. 
+ let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end >= inmem.get_lsn_range().end; + let image_exact_match = img_lsn + 1 == end_lsn; + if image_is_newer || image_exact_match { + Some(SearchResult { + layer: ReadableLayerWeak::PersistentLayer(image), lsn_floor: img_lsn, }) } else { let lsn_floor = - std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + std::cmp::max(inmem.get_lsn_range().start, image.get_lsn_range().start + 1); Some(SearchResult { - layer: delta, + layer: ReadableLayerWeak::InMemoryLayer(inmem), lsn_floor, }) } } + (Some(delta), None, Some(inmem)) => { + // Overlaps between delta and in-memory layers are not a valid + // state, but we handle them here for completeness. + let delta_end = delta.get_lsn_range().end; + let delta_is_newer = delta_end >= inmem.get_lsn_range().end; + let delta_exact_match = delta_end == end_lsn; + if delta_is_newer || delta_exact_match { + Some(SearchResult { + lsn_floor: delta.get_lsn_range().start, + layer: ReadableLayerWeak::PersistentLayer(delta), + }) + } else { + // If the in-memory layer overlaps with the delta in the LSN dimension, do a partial + // up to the delta layer. + let lsn_floor = + std::cmp::max(inmem.get_lsn_range().start, delta.get_lsn_range().end); + Some(SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem), + lsn_floor, + }) + } + } + (Some(delta), Some(image), Some(inmem)) => { + // Determine the preferred persistent layer without taking the in-memory layer + // into consideration. + let persistent_res = + Self::select_layer(Some(delta.clone()), Some(image.clone()), None, end_lsn) + .unwrap(); + let persistent_l = match persistent_res.layer { + ReadableLayerWeak::PersistentLayer(l) => l, + ReadableLayerWeak::InMemoryLayer(_) => unreachable!(), + }; + + // Now handle the in-memory layer overlaps. 
+ let inmem_res = if persistent_l.is_delta() { + Self::select_layer(Some(persistent_l), None, Some(inmem.clone()), end_lsn) + .unwrap() + } else { + Self::select_layer(None, Some(persistent_l), Some(inmem.clone()), end_lsn) + .unwrap() + }; + + Some(SearchResult { + layer: inmem_res.layer, + // Use the more restrictive LSN floor + lsn_floor: std::cmp::max(persistent_res.lsn_floor, inmem_res.lsn_floor), + }) + } } } pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> RangeSearchResult { + let in_memory_layer = self.search_in_memory_layer(end_lsn); + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { Some(version) => version, None => { - let mut result = RangeSearchResult::new(); - result.not_found.add_range(key_range); - return result; + return RangeSearchResult::map_to_in_memory_layer(in_memory_layer, key_range); } }; @@ -464,7 +606,13 @@ impl LayerMap { let delta_changes = version.delta_coverage.range_overlaps(&raw_range); let image_changes = version.image_coverage.range_overlaps(&raw_range); - let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes); + let collector = RangeSearchCollector::new( + key_range, + end_lsn, + in_memory_layer, + delta_changes, + image_changes, + ); collector.collect() } @@ -571,17 +719,36 @@ impl LayerMap { } /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. 
- pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option> - where - Pred: FnMut(&Arc) -> bool, - { + pub(crate) fn search_in_memory_layer(&self, below: Lsn) -> Option { + let is_below = |l: &Arc| { + let start_lsn = l.get_lsn_range().start; + below > start_lsn + }; + if let Some(open) = &self.open_layer { - if pred(open) { - return Some(open.clone()); + if is_below(open) { + return Some(InMemoryLayerDesc { + handle: InMemoryLayerHandle::Open, + lsn_range: open.get_lsn_range(), + }); } } - self.frozen_layers.iter().rfind(|l| pred(l)).cloned() + self.frozen_layers + .iter() + .enumerate() + .rfind(|(_idx, l)| is_below(l)) + .map(|(idx, l)| InMemoryLayerDesc { + handle: InMemoryLayerHandle::Frozen(idx), + lsn_range: l.get_lsn_range(), + }) + } + + pub(crate) fn in_memory_layer(&self, desc: &InMemoryLayerDesc) -> Arc { + match desc.handle { + InMemoryLayerHandle::Open => self.open_layer.as_ref().unwrap().clone(), + InMemoryLayerHandle::Frozen(idx) => self.frozen_layers[idx].clone(), + } } /// @@ -737,136 +904,6 @@ impl LayerMap { max_stacked_deltas } - /// Count how many reimage-worthy layers we need to visit for given key-lsn pair. - /// - /// The `partition_range` argument is used as context for the reimage-worthiness decision. - /// - /// Used as a helper for correctness checks only. Performance not critical. - pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range) -> usize { - match self.search(key, lsn) { - Some(search_result) => { - if search_result.layer.is_incremental() { - (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize) - + self.get_difficulty(search_result.lsn_floor, key, partition_range) - } else { - 0 - } - } - None => 0, - } - } - - /// Used for correctness checking. Results are expected to be identical to - /// self.get_difficulty_map. Assumes self.search is correct. 
- pub fn get_difficulty_map_bruteforce( - &self, - lsn: Lsn, - partitioning: &KeyPartitioning, - ) -> Vec { - // Looking at the difficulty as a function of key, it could only increase - // when a delta layer starts or an image layer ends. Therefore it's sufficient - // to check the difficulties at: - // - the key.start for each non-empty part range - // - the key.start for each delta - // - the key.end for each image - let keys_iter: Box> = { - let mut keys: Vec = self - .iter_historic_layers() - .map(|layer| { - if layer.is_incremental() { - layer.get_key_range().start - } else { - layer.get_key_range().end - } - }) - .collect(); - keys.sort(); - Box::new(keys.into_iter()) - }; - let mut keys_iter = keys_iter.peekable(); - - // Iter the partition and keys together and query all the necessary - // keys, computing the max difficulty for each part. - partitioning - .parts - .iter() - .map(|part| { - let mut difficulty = 0; - // Partition ranges are assumed to be sorted and disjoint - // TODO assert it - for range in &part.ranges { - if !range.is_empty() { - difficulty = - std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range)); - } - while let Some(key) = keys_iter.peek() { - if key >= &range.end { - break; - } - let key = keys_iter.next().unwrap(); - if key < range.start { - continue; - } - difficulty = - std::cmp::max(difficulty, self.get_difficulty(lsn, key, range)); - } - } - difficulty - }) - .collect() - } - - /// For each part of a keyspace partitioning, return the maximum number of layers - /// that would be needed for page reconstruction in that part at the given LSN. - /// - /// If `limit` is provided we don't try to count above that number. - /// - /// This method is used to decide where to create new image layers. Computing the - /// result for the entire partitioning at once allows this function to be more - /// efficient, and further optimization is possible by using iterators instead, - /// to allow early return. 
- /// - /// TODO actually use this method instead of count_deltas. Currently we only use - /// it for benchmarks. - pub fn get_difficulty_map( - &self, - lsn: Lsn, - partitioning: &KeyPartitioning, - limit: Option, - ) -> Vec { - // TODO This is a naive implementation. Perf improvements to do: - // 1. Instead of calling self.image_coverage and self.count_deltas, - // iterate the image and delta coverage only once. - partitioning - .parts - .iter() - .map(|part| { - let mut difficulty = 0; - for range in &part.ranges { - if limit == Some(difficulty) { - break; - } - for (img_range, last_img) in self.image_coverage(range, lsn) { - if limit == Some(difficulty) { - break; - } - let img_lsn = if let Some(last_img) = last_img { - last_img.get_lsn_range().end - } else { - Lsn(0) - }; - - if img_lsn < lsn { - let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit); - difficulty = std::cmp::max(difficulty, num_deltas); - } - } - } - difficulty - }) - .collect() - } - /// Return all L0 delta layers pub fn level0_deltas(&self) -> &Vec> { &self.l0_delta_layers @@ -1066,18 +1103,20 @@ impl LayerMap { #[cfg(test)] mod tests { - use crate::tenant::{storage_layer::LayerName, IndexPart}; - use pageserver_api::{ - key::DBDIR_KEY, - keyspace::{KeySpace, KeySpaceRandomAccum}, - }; - use std::{collections::HashMap, path::PathBuf}; - use utils::{ - id::{TenantId, TimelineId}, - shard::TenantShardId, + use std::collections::HashMap; + use std::path::PathBuf; + + use crate::{ + DEFAULT_PG_VERSION, + tenant::{harness::TenantHarness, storage_layer::LayerName}, }; + use pageserver_api::key::DBDIR_KEY; + use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; + use utils::id::{TenantId, TimelineId}; + use utils::shard::TenantShardId; use super::*; + use crate::tenant::IndexPart; #[derive(Clone)] struct LayerDesc { @@ -1102,7 +1141,6 @@ mod tests { } fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) { - 
assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace()); let lhs: HashMap = lhs .found .into_iter() @@ -1128,17 +1166,12 @@ mod tests { let mut key = key_range.start; while key != key_range.end { let res = layer_map.search(key, end_lsn); - match res { - Some(res) => { - range_search_result - .found - .entry(res) - .or_default() - .add_key(key); - } - None => { - range_search_result.not_found.add_key(key); - } + if let Some(res) = res { + range_search_result + .found + .entry(res) + .or_default() + .add_key(key); } key = key.next(); @@ -1153,20 +1186,49 @@ mod tests { let range = Key::from_i128(100)..Key::from_i128(200); let res = layer_map.range_search(range.clone(), Lsn(100)); - assert_eq!( - res.not_found.to_keyspace(), - KeySpace { - ranges: vec![range] - } - ); + assert_range_search_result_eq(res, RangeSearchResult::new()); } - #[test] - fn ranged_search() { + #[tokio::test] + async fn ranged_search() { + let harness = TenantHarness::create("ranged_search").await.unwrap(); + let (tenant, ctx) = harness.load().await; + let timeline_id = TimelineId::generate(); + // Create the timeline such that the in-memory layers can be written + // to the timeline directory. 
+ tenant + .create_test_timeline(timeline_id, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let gate = utils::sync::gate::Gate::default(); + let add_in_memory_layer = async |layer_map: &mut LayerMap, lsn_range: Range| { + let layer = InMemoryLayer::create( + harness.conf, + timeline_id, + harness.tenant_shard_id, + lsn_range.start, + &gate, + &ctx, + ) + .await + .unwrap(); + + layer.freeze(lsn_range.end).await; + + layer_map.frozen_layers.push_back(Arc::new(layer)); + }; + + let in_memory_layer_configurations = [ + vec![], + // Overlaps with the top-most image + vec![Lsn(35)..Lsn(50)], + ]; + let layers = vec![ LayerDesc { key_range: Key::from_i128(15)..Key::from_i128(50), - lsn_range: Lsn(0)..Lsn(5), + lsn_range: Lsn(5)..Lsn(6), is_delta: false, }, LayerDesc { @@ -1186,19 +1248,27 @@ mod tests { }, LayerDesc { key_range: Key::from_i128(35)..Key::from_i128(40), - lsn_range: Lsn(35)..Lsn(40), + lsn_range: Lsn(40)..Lsn(41), is_delta: false, }, ]; - let layer_map = create_layer_map(layers.clone()); - for start in 0..60 { - for end in (start + 1)..60 { - let range = Key::from_i128(start)..Key::from_i128(end); - let result = layer_map.range_search(range.clone(), Lsn(100)); - let expected = brute_force_range_search(&layer_map, range, Lsn(100)); + let mut layer_map = create_layer_map(layers.clone()); + for in_memory_layers in in_memory_layer_configurations { + for in_mem_layer_range in in_memory_layers { + add_in_memory_layer(&mut layer_map, in_mem_layer_range).await; + } - assert_range_search_result_eq(result, expected); + for start in 0..60 { + for end in (start + 1)..60 { + let range = Key::from_i128(start)..Key::from_i128(end); + let result = layer_map.range_search(range.clone(), Lsn(100)); + let expected = brute_force_range_search(&layer_map, range, Lsn(100)); + + eprintln!("{start}..{end}: {result:?}"); + + assert_range_search_result_eq(result, expected); + } } } } @@ -1417,9 +1487,11 @@ mod tests { assert!(!shadow.ranges.is_empty()); // At least 
some layers should be marked covered - assert!(layer_visibilities - .iter() - .any(|i| matches!(i.1, LayerVisibilityHint::Covered))); + assert!( + layer_visibilities + .iter() + .any(|i| matches!(i.1, LayerVisibilityHint::Covered)) + ); let layer_visibilities = layer_visibilities.into_iter().collect::>(); @@ -1489,12 +1561,348 @@ mod tests { // Sanity: the layer that holds latest data for the DBDIR key should always be visible // (just using this key as a key that will always exist for any layermap fixture) - let dbdir_layer = layer_map - .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) - .unwrap(); + let dbdir_layer = { + let readable_layer = layer_map + .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) + .unwrap(); + + match readable_layer.layer { + ReadableLayerWeak::PersistentLayer(desc) => desc, + ReadableLayerWeak::InMemoryLayer(_) => unreachable!(""), + } + }; assert!(matches!( - layer_visibilities.get(&dbdir_layer.layer).unwrap(), + layer_visibilities.get(&dbdir_layer).unwrap(), LayerVisibilityHint::Visible )); } } + +#[cfg(test)] +mod select_layer_tests { + use super::*; + + fn create_persistent_layer( + start_lsn: u64, + end_lsn: u64, + is_delta: bool, + ) -> Arc { + if !is_delta { + assert_eq!(end_lsn, start_lsn + 1); + } + + Arc::new(PersistentLayerDesc::new_test( + Key::MIN..Key::MAX, + Lsn(start_lsn)..Lsn(end_lsn), + is_delta, + )) + } + + fn create_inmem_layer(start_lsn: u64, end_lsn: u64) -> InMemoryLayerDesc { + InMemoryLayerDesc { + handle: InMemoryLayerHandle::Open, + lsn_range: Lsn(start_lsn)..Lsn(end_lsn), + } + } + + #[test] + fn test_select_layer_empty() { + assert!(LayerMap::select_layer(None, None, None, Lsn(100)).is_none()); + } + + #[test] + fn test_select_layer_only_delta() { + let delta = create_persistent_layer(10, 20, true); + let result = LayerMap::select_layer(Some(delta.clone()), None, None, Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, 
ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + } + + #[test] + fn test_select_layer_only_image() { + let image = create_persistent_layer(10, 11, false); + let result = LayerMap::select_layer(None, Some(image.clone()), None, Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_only_inmem() { + let inmem = create_inmem_layer(10, 20); + let result = LayerMap::select_layer(None, None, Some(inmem.clone()), Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + } + + #[test] + fn test_select_layer_image_inside_delta() { + let delta = create_persistent_layer(10, 20, true); + let image = create_persistent_layer(15, 16, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(100)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(16)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_newer_image() { + let delta = create_persistent_layer(10, 20, true); + let image = create_persistent_layer(25, 26, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + + let result = + LayerMap::select_layer(Some(delta.clone()), None, None, result.lsn_floor).unwrap(); + + 
assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + } + + #[test] + fn test_select_layer_delta_with_older_image() { + let delta = create_persistent_layer(15, 25, true); + let image = create_persistent_layer(10, 11, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = + LayerMap::select_layer(None, Some(image.clone()), None, result.lsn_floor).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_image_inside_inmem() { + let image = create_persistent_layer(15, 16, false); + let inmem = create_inmem_layer(10, 25); + + let result = + LayerMap::select_layer(None, Some(image.clone()), Some(inmem.clone()), Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(16)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + None, + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + + let result = + LayerMap::select_layer(None, None, Some(inmem.clone()), result.lsn_floor).unwrap(); + assert_eq!(result.lsn_floor, Lsn(10)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + } + + #[test] + fn test_select_layer_delta_inside_inmem() { + let delta_top = create_persistent_layer(15, 20, true); + let delta_bottom = create_persistent_layer(10, 15, true); + let inmem = create_inmem_layer(15, 25); + + let result = + 
LayerMap::select_layer(Some(delta_top.clone()), None, Some(inmem.clone()), Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta_top.clone()), + None, + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_top)) + ); + + let result = LayerMap::select_layer( + Some(delta_bottom.clone()), + None, + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_bottom)) + ); + } + + #[test] + fn test_select_layer_all_overlap_1() { + let inmem = create_inmem_layer(10, 30); + let delta = create_persistent_layer(15, 25, true); + let image = create_persistent_layer(20, 21, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(21)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_all_overlap_2() { + let inmem = create_inmem_layer(20, 30); + let delta = 
create_persistent_layer(10, 40, true); + let image = create_persistent_layer(25, 26, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(26)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_all_overlap_3() { + let inmem = create_inmem_layer(30, 40); + let delta = create_persistent_layer(10, 30, true); + let image = create_persistent_layer(20, 21, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(30)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(21)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } +} diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 136f68bc36..b3dc8e56a3 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ 
b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -3,9 +3,8 @@ use std::ops::Range; use tracing::info; -use crate::tenant::storage_layer::PersistentLayerDesc; - use super::layer_coverage::LayerCoverageTuple; +use crate::tenant::storage_layer::PersistentLayerDesc; /// Layers in this module are identified and indexed by this data. /// @@ -64,6 +63,8 @@ pub struct HistoricLayerCoverage { /// The latest state head: LayerCoverageTuple, + /// TODO: this could be an ordered vec using binary search. + /// We push into this map every time we add a layer, so might see some benefit /// All previous states historic: BTreeMap>, } @@ -420,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage { buffer: BTreeMap>, /// All current layers. This is not used for search. Only to make rebuilds easier. + // TODO: This map is never cleared. Rebuilds could use the post-trim last entry of + // [`Self::historic_coverage`] instead of doubling memory usage. + // [`Self::len`]: can require rebuild and serve from latest historic + // [`Self::iter`]: already requires rebuild => can serve from latest historic layers: BTreeMap, } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 15c6955260..dceae89d1c 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -19,8 +19,9 @@ use anyhow::ensure; use serde::{Deserialize, Serialize}; -use utils::bin_ser::SerializeError; -use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; +use utils::bin_ser::{BeSer, SerializeError}; +use utils::id::TimelineId; +use utils::lsn::Lsn; /// Use special format number to enable backward compatibility. 
const METADATA_FORMAT_VERSION: u16 = 4; @@ -299,9 +300,8 @@ impl TimelineMetadata { /// Returns true if anything was changed pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { - if let Some(ancestor) = self.body.ancestor_timeline { - assert_eq!(ancestor, branchpoint.0); - } + // Detaching from ancestor now doesn't always detach directly to the direct ancestor, but we + // ensure the LSN is the same. So we don't check the timeline ID. if self.body.ancestor_lsn != Lsn(0) { assert_eq!(self.body.ancestor_lsn, branchpoint.1); } @@ -345,9 +345,10 @@ impl TimelineMetadata { } pub(crate) mod modern_serde { - use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; use serde::{Deserialize, Serialize}; + use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; + pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result where D: serde::de::Deserializer<'de>, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 22ee560dbf..f02247950f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1,34 +1,42 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. 
-use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; -use futures::StreamExt; -use itertools::Itertools; -use pageserver_api::key::Key; -use pageserver_api::models::LocationConfigMode; -use pageserver_api::shard::{ - ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, -}; -use pageserver_api::upcall_api::ReAttachResponseTenant; -use rand::{distributions::Alphanumeric, Rng}; -use remote_storage::TimeoutOrCancel; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; -use sysinfo::SystemExt; -use tokio::fs; use anyhow::Context; +use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use itertools::Itertools; use once_cell::sync::Lazy; +use pageserver_api::key::Key; +use pageserver_api::models::{DetachBehavior, LocationConfigMode}; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, +}; +use pageserver_api::upcall_api::ReAttachResponseTenant; +use rand::Rng; +use rand::distributions::Alphanumeric; +use remote_storage::TimeoutOrCancel; +use sysinfo::SystemExt; +use tokio::fs; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; - +use utils::crashsafe::path_with_suffix_extension; +use utils::fs_ext::PathExt; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; use utils::{backoff, completion, crashsafe}; +use super::remote_timeline_client::remote_tenant_path; +use super::secondary::SecondaryTenant; +use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; +use super::{GlobalShutDown, TenantSharedResources}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::controller_upcall_client::{ @@ -37,7 +45,7 @@ use crate::controller_upcall_client::{ use crate::deletion_queue::DeletionQueueClient; use 
crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; -use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; @@ -48,16 +56,6 @@ use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Ten use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; -use utils::crashsafe::path_with_suffix_extension; -use utils::fs_ext::PathExt; -use utils::generation::Generation; -use utils::id::{TenantId, TimelineId}; - -use super::remote_timeline_client::remote_tenant_path; -use super::secondary::SecondaryTenant; -use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; -use super::{GlobalShutDown, TenantSharedResources}; - /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is elegible to service /// reads and ingest WAL. @@ -140,7 +138,7 @@ impl TenantStartupMode { /// If this returns None, the re-attach struct is in an invalid state and /// should be ignored in the response. 
fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option { - match (rart.mode, rart.gen) { + match (rart.mode, rart.r#gen) { (LocationConfigMode::Detached, _) => None, (LocationConfigMode::Secondary, _) => Some(Self::Secondary), (LocationConfigMode::AttachedMulti, Some(g)) => { @@ -376,7 +374,7 @@ async fn init_load_generations( TenantStartupMode::Attached((_mode, generation)) => Some(generation), TenantStartupMode::Secondary => None, } - .map(|gen| (*id, *gen)) + .map(|gen_| (*id, *gen_)) }) .collect(); resources.deletion_queue_client.recover(attached_tenants)?; @@ -502,7 +500,9 @@ pub async fn init_tenant_mgr( .total_memory(); let max_ephemeral_layer_bytes = conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024); - tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory"); + tracing::info!( + "Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory" + ); inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store( max_ephemeral_layer_bytes, std::sync::atomic::Ordering::Relaxed, @@ -700,10 +700,11 @@ fn tenant_spawn( // to avoid impacting prod runtime performance. 
assert!(!crate::is_temporary(tenant_path)); debug_assert!(tenant_path.is_dir()); - debug_assert!(conf - .tenant_location_config_path(&tenant_shard_id) - .try_exists() - .unwrap()); + debug_assert!( + conf.tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap() + ); Tenant::spawn( conf, @@ -791,7 +792,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { (total_in_progress, total_attached) } TenantsMap::ShuttingDown(_) => { - error!("already shutting down, this function isn't supposed to be called more than once"); + error!( + "already shutting down, this function isn't supposed to be called more than once" + ); return; } } @@ -1016,9 +1019,9 @@ impl TenantManager { Ok(Ok(_)) => return Ok(Some(tenant)), Err(_) => { tracing::warn!( - timeout_ms = flush_timeout.as_millis(), - "Timed out waiting for flush to remote storage, proceeding anyway." - ) + timeout_ms = flush_timeout.as_millis(), + "Timed out waiting for flush to remote storage, proceeding anyway." + ) } } } @@ -1194,7 +1197,9 @@ impl TenantManager { } TenantSlot::Attached(tenant) => { let (_guard, progress) = utils::completion::channel(); - info!("Shutting down just-spawned tenant, because tenant manager is shut down"); + info!( + "Shutting down just-spawned tenant, because tenant manager is shut down" + ); match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { info!("Finished shutting down just-spawned tenant"); @@ -1784,7 +1789,7 @@ impl TenantManager { _ => { return Err(anyhow::anyhow!(e).context(format!( "Hard linking {relative_layer} into {child_prefix}" - ))) + ))); } } } @@ -1909,6 +1914,7 @@ impl TenantManager { tenant_shard_id: TenantShardId, timeline_id: TimelineId, prepared: PreparedTimelineDetach, + behavior: DetachBehavior, mut attempt: detach_ancestor::Attempt, ctx: &RequestContext, ) -> Result, detach_ancestor::Error> { @@ -1952,7 +1958,14 @@ impl TenantManager { .map_err(Error::NotFound)?; let resp = timeline - 
.detach_from_ancestor_and_reparent(&tenant, prepared, ctx) + .detach_from_ancestor_and_reparent( + &tenant, + prepared, + attempt.ancestor_timeline_id, + attempt.ancestor_lsn, + behavior, + ctx, + ) .await?; let mut slot_guard = slot_guard; @@ -2025,8 +2038,8 @@ impl TenantManager { .wait_to_become_active(std::time::Duration::from_secs(9999)) .await .map_err(|e| { - use pageserver_api::models::TenantState; use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + use pageserver_api::models::TenantState; match e { Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => { Error::ShuttingDown @@ -2089,7 +2102,7 @@ impl TenantManager { match selector { ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { - return ShardResolveResult::Found(tenant.clone()) + return ShardResolveResult::Found(tenant.clone()); } ShardSelector::Page(key) => { // First slot we see for this tenant, calculate the expected shard number @@ -2486,7 +2499,7 @@ impl SlotGuard { TenantsMap::Initializing => { return Err(TenantSlotUpsertError::MapState( TenantMapError::StillInitializing, - )) + )); } TenantsMap::ShuttingDown(_) => { return Err(TenantSlotUpsertError::ShuttingDown(( @@ -2815,21 +2828,22 @@ where } } -use { - crate::tenant::gc_result::GcResult, http_utils::error::ApiError, - pageserver_api::models::TimelineGcRequest, -}; +use http_utils::error::ApiError; +use pageserver_api::models::TimelineGcRequest; + +use crate::tenant::gc_result::GcResult; #[cfg(test)] mod tests { use std::collections::BTreeMap; use std::sync::Arc; + use tracing::Instrument; + use super::super::harness::TenantHarness; + use super::TenantsMap; use crate::tenant::mgr::TenantSlot; - use super::{super::harness::TenantHarness, TenantsMap}; - #[tokio::test(start_paused = true)] async fn shutdown_awaits_in_progress_tenant() { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully diff --git a/pageserver/src/tenant/remote_timeline_client.rs 
b/pageserver/src/tenant/remote_timeline_client.rs index 713efbb9a4..891760b499 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -179,77 +179,64 @@ pub mod index; pub mod manifest; pub(crate) mod upload; -use anyhow::Context; -use camino::Utf8Path; -use chrono::{NaiveDateTime, Utc}; - -pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::TimelineArchivalState; -use pageserver_api::shard::{ShardIndex, TenantShardId}; -use regex::Regex; -use scopeguard::ScopeGuard; -use tokio_util::sync::CancellationToken; -use utils::backoff::{ - self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use utils::pausable_failpoint; -use utils::shard::ShardNumber; - use std::collections::{HashMap, HashSet, VecDeque}; +use std::ops::DerefMut; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::time::Duration; +use anyhow::Context; +use camino::Utf8Path; +use chrono::{NaiveDateTime, Utc}; +pub(crate) use download::{ + download_index_part, download_initdb_tar_zst, download_tenant_manifest, is_temp_download_file, + list_remote_tenant_shards, list_remote_timelines, +}; +use index::GcCompactionState; +pub(crate) use index::LayerFileMetadata; +use pageserver_api::models::{RelSizeMigration, TimelineArchivalState}; +use pageserver_api::shard::{ShardIndex, TenantShardId}; +use regex::Regex; use remote_storage::{ DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, }; -use std::ops::DerefMut; -use tracing::{debug, error, info, instrument, warn}; -use tracing::{info_span, Instrument}; -use utils::lsn::Lsn; - -use crate::context::RequestContext; -use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; -use crate::metrics::{ - MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, - RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, - 
REMOTE_ONDEMAND_DOWNLOADED_LAYERS, +use scopeguard::ScopeGuard; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; +pub(crate) use upload::upload_initdb_dir; +use utils::backoff::{ + self, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff, }; -use crate::task_mgr::shutdown_token; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::remote_timeline_client::download::download_retry; -use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable}; -use crate::tenant::TIMELINES_SEGMENT_NAME; -use crate::{ - config::PageServerConf, - task_mgr, - task_mgr::TaskKind, - task_mgr::BACKGROUND_RUNTIME, - tenant::metadata::TimelineMetadata, - tenant::upload_queue::{ - UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, - }, - TENANT_HEATMAP_BASENAME, -}; - use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::pausable_failpoint; +use utils::shard::ShardNumber; use self::index::IndexPart; - use super::config::AttachedLocationConfig; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::timeline::import_pgdata; use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::{DeleteTimelineError, Generation}; - -pub(crate) use download::{ - download_index_part, download_tenant_manifest, is_temp_download_file, - list_remote_tenant_shards, list_remote_timelines, +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; +use crate::metrics::{ + MeasureRemoteOp, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, + RemoteTimelineClientMetricsCallTrackSize, }; -pub(crate) use index::LayerFileMetadata; 
-pub(crate) use upload::upload_initdb_dir; +use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind, shutdown_token}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::download::download_retry; +use crate::tenant::storage_layer::AsLayerDesc; +use crate::tenant::upload_queue::{ + Delete, OpType, UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, + UploadQueueStoppedDeletable, UploadTask, +}; +use crate::tenant::{TIMELINES_SEGMENT_NAME, debug_assert_current_span_has_tenant_and_timeline_id}; +use crate::{TENANT_HEATMAP_BASENAME, task_mgr}; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. @@ -450,9 +437,13 @@ impl RemoteTimelineClient { /// Initialize the upload queue for the case where the remote storage is empty, /// i.e., it doesn't have an `IndexPart`. + /// + /// `rel_size_v2_status` needs to be carried over during branching, and that's why + /// it's passed in here. pub fn init_upload_queue_for_empty_remote( &self, local_metadata: &TimelineMetadata, + rel_size_v2_status: Option, ) -> anyhow::Result<()> { // Set the maximum number of inprogress tasks to the remote storage concurrency. There's // certainly no point in starting more upload tasks than this. @@ -462,7 +453,9 @@ impl RemoteTimelineClient { .as_ref() .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; + let initialized_queue = + upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; + initialized_queue.dirty.rel_size_migration = rel_size_v2_status; self.update_remote_physical_size_gauge(None); info!("initialized upload queue as empty"); Ok(()) @@ -913,6 +906,33 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field. 
+ pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( + self: &Arc, + gc_compaction_state: GcCompactionState, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.gc_compaction = Some(gc_compaction_state); + self.schedule_index_upload(upload_queue); + Ok(()) + } + + /// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field. + pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update( + self: &Arc, + rel_size_v2_status: RelSizeMigration, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status); + // TODO: allow this operation to bypass the validation check because we might upload the index part + // with no layers but the flag updated. For now, we just modify the index part in memory and the next + // upload will include the flag. + // self.schedule_index_upload(upload_queue); + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -934,6 +954,14 @@ impl RemoteTimelineClient { Ok(()) } + /// Only used in the `patch_index_part` HTTP API to force trigger an index upload. 
+ pub fn force_schedule_index_upload(self: &Arc) -> Result<(), NotInitialized> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + self.schedule_index_upload(upload_queue); + Ok(()) + } + /// Launch an index-file upload operation in the background (internal function) fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); @@ -1078,7 +1106,11 @@ impl RemoteTimelineClient { if !wanted(x) && wanted(y) { // this could be avoided by having external in-memory synchronization, like // timeline detach ancestor - warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason"); + warn!( + ?reason, + op = "insert", + "unexpected: two racing processes to enable and disable a gc blocking reason" + ); } // at this point, the metadata must always show that there is a parent @@ -1132,7 +1164,11 @@ impl RemoteTimelineClient { (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), (x, y) => { if !wanted(x) && wanted(y) { - warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)"); + warn!( + ?reason, + op = "remove", + "unexpected: two racing processes to enable and disable a gc blocking reason (remove)" + ); } upload_queue.dirty.gc_blocking = @@ -1274,12 +1310,14 @@ impl RemoteTimelineClient { #[cfg(feature = "testing")] for (name, metadata) in &with_metadata { - let gen = metadata.generation; - if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) { - if unexpected == gen { + let gen_ = metadata.generation; + if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen_) { + if unexpected == gen_ { tracing::error!("{name} was unlinked twice with same generation"); } else { - tracing::error!("{name} was unlinked twice with different generations {gen:?} and 
{unexpected:?}"); + tracing::error!( + "{name} was unlinked twice with different generations {gen_:?} and {unexpected:?}" + ); } } } @@ -1341,11 +1379,11 @@ impl RemoteTimelineClient { #[cfg(feature = "testing")] for (name, meta) in &with_metadata { - let gen = meta.generation; + let gen_ = meta.generation; match upload_queue.dangling_files.remove(name) { - Some(same) if same == gen => { /* expected */ } + Some(same) if same == gen_ => { /* expected */ } Some(other) => { - tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}"); + tracing::error!("{name} was unlinked with {other:?} but deleted with {gen_:?}"); } None => { tracing::error!("{name} was unlinked but was not dangling"); @@ -1442,7 +1480,9 @@ impl RemoteTimelineClient { // proper stop is yet to be called. On cancel the original or some later task must call // `stop` or `shutdown`. let sg = scopeguard::guard((), |_| { - tracing::error!("RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error") + tracing::error!( + "RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error" + ) }); let fut = { @@ -1458,7 +1498,7 @@ impl RemoteTimelineClient { scopeguard::ScopeGuard::into_inner(sg); return; } - UploadQueue::Initialized(ref mut init) => init, + UploadQueue::Initialized(init) => init, }; // if the queue is already stuck due to a shutdown operation which was cancelled, then @@ -1818,7 +1858,9 @@ impl RemoteTimelineClient { .map(|n| n.starts_with(IndexPart::FILE_NAME)) .unwrap_or(false) }) - .filter_map(|o| parse_remote_index_path(o.key.clone()).map(|gen| (o.key.clone(), gen))) + .filter_map(|o| { + parse_remote_index_path(o.key.clone()).map(|gen_| (o.key.clone(), gen_)) + }) .max_by_key(|i| i.1) .map(|i| i.0.clone()) .unwrap_or( @@ -2010,7 +2052,7 @@ impl RemoteTimelineClient { } let upload_result: anyhow::Result<()> = match &task.op { - UploadOp::UploadLayer(ref layer, ref 
layer_metadata, mode) => { + UploadOp::UploadLayer(layer, layer_metadata, mode) => { // TODO: check if this mechanism can be removed now that can_bypass() performs // conflict checks during scheduling. if let Some(OpType::FlushDeletion) = mode { @@ -2100,7 +2142,7 @@ impl RemoteTimelineClient { ) .await } - UploadOp::UploadMetadata { ref uploaded } => { + UploadOp::UploadMetadata { uploaded } => { let res = upload::upload_index_part( &self.storage_impl, &self.tenant_shard_id, @@ -2216,11 +2258,11 @@ impl RemoteTimelineClient { let lsn_update = { let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { - UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"), - UploadQueue::Stopped(_stopped) => { - None - }, - UploadQueue::Initialized(qi) => { Some(qi) } + UploadQueue::Uninitialized => panic!( + "callers are responsible for ensuring this is only called on an initialized queue" + ), + UploadQueue::Stopped(_stopped) => None, + UploadQueue::Initialized(qi) => Some(qi), }; let upload_queue = match upload_queue { @@ -2242,7 +2284,11 @@ impl RemoteTimelineClient { let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id); let monotone = is_later || last_updater.is_none(); - assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id); + assert!( + monotone, + "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", + task.task_id + ); // not taking ownership is wasteful upload_queue.clean.0.clone_from(uploaded); @@ -2641,20 +2687,16 @@ pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option #[cfg(test)] mod tests { - use super::*; - use crate::{ - context::RequestContext, - tenant::{ - config::AttachmentMode, - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::layer::local_layer_path, - 
Tenant, Timeline, - }, - DEFAULT_PG_VERSION, - }; - use std::collections::HashSet; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::context::RequestContext; + use crate::tenant::config::AttachmentMode; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::layer::local_layer_path; + use crate::tenant::{Tenant, Timeline}; + pub(super) fn dummy_contents(name: &str) -> Vec { format!("contents for {name}").into() } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b4d45dca75..0001f67c99 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -8,41 +8,39 @@ use std::future::Future; use std::str::FromStr; use std::time::SystemTime; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; +use remote_storage::{ + DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, +}; use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::backoff; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::{TenantId, TimelineId}; +use utils::{backoff, pausable_failpoint}; +use super::index::{IndexPart, LayerFileMetadata}; +use super::manifest::TenantManifest; +use super::{ + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, parse_remote_index_path, + parse_remote_tenant_manifest_path, remote_index_path, remote_initdb_archive_path, + remote_initdb_preserved_archive_path, remote_tenant_manifest_path, + remote_tenant_manifest_prefix, remote_tenant_path, +}; +use crate::TEMP_FILE_SUFFIX; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::span::{ 
debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id, }; +use crate::tenant::Generation; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; -use crate::tenant::Generation; -use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; -use crate::TEMP_FILE_SUFFIX; -use remote_storage::{ - DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, -}; -use utils::crashsafe::path_with_suffix_extension; -use utils::id::{TenantId, TimelineId}; -use utils::pausable_failpoint; - -use super::index::{IndexPart, LayerFileMetadata}; -use super::manifest::TenantManifest; -use super::{ - parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path, - remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, - remote_tenant_manifest_prefix, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, INITDB_PATH, -}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error}; /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that @@ -207,9 +205,9 @@ async fn download_object( } #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { - use crate::virtual_file::owned_buffers_io; - use crate::virtual_file::IoBufferMut; use std::sync::Arc; + + use crate::virtual_file::{IoBufferMut, owned_buffers_io}; async { let destination_file = Arc::new( VirtualFile::create(dst_path, ctx) @@ -231,6 +229,7 @@ async fn download_object( || IoBufferMut::with_capacity(super::BUFFER_SIZE), gate.enter().map_err(|_| DownloadError::Cancelled)?, ctx, + tracing::info_span!(parent: None, "download_object_buffered_writer", %dst_path), ); // TODO: use vectored write (writev) once supported by tokio-epoll-uring. 
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index b8b18005fd..16c38be907 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -7,16 +7,17 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::RelSizeMigration; +use pageserver_api::shard::ShardIndex; use serde::{Deserialize, Serialize}; +use utils::id::TimelineId; +use utils::lsn::Lsn; use super::is_same_remote_layer_path; +use crate::tenant::Generation; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::import_pgdata; -use crate::tenant::Generation; -use pageserver_api::shard::ShardIndex; -use utils::id::TimelineId; -use utils::lsn::Lsn; /// In-memory representation of an `index_part.json` file /// @@ -85,24 +86,36 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) rel_size_migration: Option, - /// The LSN of gc-compaction horizon. Once gc-compaction is finished for all layer files below an LSN, this LSN will be updated. + /// Not used anymore -- kept here for backwards compatibility. Merged into the `gc_compaction` field. #[serde(skip_serializing_if = "Option::is_none", default)] - pub(crate) l2_lsn: Option, + l2_lsn: Option, + + /// State for the garbage-collecting compaction pass. + /// + /// Garbage-collecting compaction (gc-compaction) prunes `Value`s that are outside + /// the PITR window and not needed by child timelines. + /// + /// A commonly used synonym for this compaction pass is + /// "bottommost-compaction" because the affected LSN range + /// is the "bottom" of the (key,lsn) map. + /// + /// Gc-compaction is a quite expensive operation; that's why we use + /// trigger condition. 
+ /// This field here holds the state pertaining to that trigger condition + /// and (in future) to the progress of the gc-compaction, so that it's + /// resumable across restarts & migrations. + /// + /// Note that the underlying algorithm is _also_ called `gc-compaction` + /// in most places & design docs; but in fact it is more flexible than + /// just the specific use case here; it needs a new name. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) gc_compaction: Option, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub enum RelSizeMigration { - /// The tenant is using the old rel_size format. - /// Note that this enum is persisted as `Option` in the index part, so - /// `None` is the same as `Some(RelSizeMigration::Legacy)`. - Legacy, - /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are - /// persisted in the index part. The read path will read both formats and merge them. - Migrating, - /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted - /// in the index part, and the read path will not read the old format. - Migrated, +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct GcCompactionState { + /// The upper bound of the last completed garbage-collecting compaction, aka. L2 LSN. + pub(crate) last_completed_lsn: Lsn, } impl IndexPart { @@ -123,10 +136,11 @@ impl IndexPart { /// - 10: +import_pgdata /// - 11: +rel_size_migration /// - 12: +l2_lsn - const LATEST_VERSION: usize = 12; + /// - 13: +gc_compaction + const LATEST_VERSION: usize = 13; // Versions we may see when reading from a bucket. 
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -144,6 +158,7 @@ impl IndexPart { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, } } @@ -406,10 +421,12 @@ impl GcBlocking { #[cfg(test)] mod tests { - use super::*; use std::str::FromStr; + use utils::id::TimelineId; + use super::*; + #[test] fn v1_indexpart_is_parsed() { let example = r#"{ @@ -450,6 +467,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -497,6 +515,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -545,6 +564,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -596,6 +616,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -642,6 +663,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -691,6 +713,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -745,6 +768,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -804,6 +828,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + 
gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -864,6 +889,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -929,6 +955,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1007,6 +1034,7 @@ mod tests { }))), rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1086,6 +1114,7 @@ mod tests { }))), rel_size_migration: Some(RelSizeMigration::Legacy), l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1093,7 +1122,7 @@ mod tests { } #[test] - fn v12_l2_lsn_is_parsed() { + fn v12_v13_l2_gc_ompaction_is_parsed() { let example = r#"{ "version": 12, "layer_metadata":{ @@ -1124,7 +1153,10 @@ mod tests { } }, "rel_size_migration": "legacy", - "l2_lsn": "0/16960E8" + "l2_lsn": "0/16960E8", + "gc_compaction": { + "last_completed_lsn": "0/16960E8" + } }"#; let expected = IndexPart { @@ -1166,6 +1198,9 @@ mod tests { }))), rel_size_migration: Some(RelSizeMigration::Legacy), l2_lsn: Some("0/16960E8".parse::().unwrap()), + gc_compaction: Some(GcCompactionState { + last_completed_lsn: "0/16960E8".parse::().unwrap(), + }), }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 2029847a12..543ccc219d 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -1,6 +1,7 @@ use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; -use utils::{id::TimelineId, lsn::Lsn}; +use utils::id::TimelineId; +use utils::lsn::Lsn; /// 
Tenant-shard scoped manifest #[derive(Clone, Serialize, Deserialize, PartialEq, Eq)] diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index af4dbbbfb6..7d9f47665a 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -1,28 +1,28 @@ //! Helper functions to upload files to remote storage with a RemoteStorage -use anyhow::{bail, Context}; +use std::io::{ErrorKind, SeekFrom}; +use std::time::SystemTime; + +use anyhow::{Context, bail}; use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; -use std::io::{ErrorKind, SeekFrom}; -use std::time::SystemTime; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; +use tracing::info; +use utils::id::{TenantId, TimelineId}; use utils::{backoff, pausable_failpoint}; +use super::Generation; use super::index::IndexPart; use super::manifest::TenantManifest; -use super::Generation; use crate::tenant::remote_timeline_client::{ remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, }; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; -use utils::id::{TenantId, TimelineId}; - -use tracing::info; /// Serializes and uploads the given index part data to the remote storage. 
pub(crate) async fn upload_index_part( @@ -134,7 +134,9 @@ pub(super) async fn upload_timeline_layer<'a>( .len(); if metadata_size != fs_size { - bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); + bail!( + "File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}" + ); } let fs_size = usize::try_from(fs_size) diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 4bc208331b..8f8622c796 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -3,40 +3,31 @@ pub mod heatmap; mod heatmap_uploader; mod scheduler; -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; -use crate::{ - context::RequestContext, - disk_usage_eviction_task::DiskUsageEvictionInfo, - metrics::SECONDARY_HEATMAP_TOTAL_SIZE, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, -}; - -use self::{ - downloader::{downloader_task, SecondaryDetail}, - heatmap_uploader::heatmap_uploader_task, -}; - -use super::{ - config::{SecondaryLocationConfig, TenantConfOpt}, - mgr::TenantManager, - span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerName, - GetTenantError, -}; - -use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; use metrics::UIntGauge; -use pageserver_api::{ - models, - shard::{ShardIdentity, TenantShardId}, -}; +use pageserver_api::models; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use remote_storage::GenericRemoteStorage; - use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::instrument; -use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; +use utils::completion::Barrier; +use utils::id::TimelineId; +use utils::sync::gate::Gate; + +use self::downloader::{SecondaryDetail, downloader_task}; +use self::heatmap_uploader::heatmap_uploader_task; +use super::GetTenantError; +use 
super::config::{SecondaryLocationConfig, TenantConfOpt}; +use super::mgr::TenantManager; +use super::span::debug_assert_current_span_has_tenant_id; +use super::storage_layer::LayerName; +use crate::context::RequestContext; +use crate::disk_usage_eviction_task::DiskUsageEvictionInfo; +use crate::metrics::{SECONDARY_HEATMAP_TOTAL_SIZE, SECONDARY_RESIDENT_PHYSICAL_SIZE}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; enum DownloadCommand { Download(TenantShardId), diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 2e8c3946bd..1cf0241631 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1,47 +1,8 @@ -use std::{ - collections::{HashMap, HashSet}, - pin::Pin, - str::FromStr, - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - disk_usage_eviction_task::{ - finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, - }, - metrics::SECONDARY_MODE, - tenant::{ - config::SecondaryLocationConfig, - debug_assert_current_span_has_tenant_and_timeline_id, - ephemeral_file::is_ephemeral_file, - remote_timeline_client::{ - index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - }, - span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint}, - tasks::{warn_when_period_overrun, BackgroundLoopKind}, - }, - virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, - TEMP_FILE_SUFFIX, -}; - -use super::{ - heatmap::HeatMapLayer, - scheduler::{ - self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, - TenantBackgroundJobs, - }, - GetTenantError, SecondaryTenant, SecondaryTenantError, -}; - -use crate::tenant::{ - mgr::TenantManager, - remote_timeline_client::{download::download_layer_file, 
remote_heatmap_path}, -}; +use std::collections::{HashMap, HashSet}; +use std::pin::Pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime}; use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; @@ -50,18 +11,43 @@ use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage}; - use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, warn, Instrument}; -use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, pausable_failpoint, serde_system_time, -}; +use tracing::{Instrument, info_span, instrument, warn}; +use utils::completion::Barrier; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::TimelineId; +use utils::{backoff, failpoint_support, fs_ext, pausable_failpoint, serde_system_time}; -use super::{ - heatmap::{HeatMapTenant, HeatMapTimeline}, - CommandRequest, DownloadCommand, +use super::heatmap::{HeatMapLayer, HeatMapTenant, HeatMapTimeline}; +use super::scheduler::{ + self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs, period_jitter, + period_warmup, }; +use super::{ + CommandRequest, DownloadCommand, GetTenantError, SecondaryTenant, SecondaryTenantError, +}; +use crate::TEMP_FILE_SUFFIX; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::disk_usage_eviction_task::{ + DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, finite_f32, +}; +use crate::metrics::SECONDARY_MODE; +use crate::tenant::config::SecondaryLocationConfig; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::ephemeral_file::is_ephemeral_file; +use crate::tenant::mgr::TenantManager; +use crate::tenant::remote_timeline_client::download::download_layer_file; 
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::remote_timeline_client::{ + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, is_temp_download_file, + remote_heatmap_path, +}; +use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::storage_layer::{LayerName, LayerVisibilityHint}; +use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error}; /// For each tenant, default period for how long must have passed since the last download_tenant call before /// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first @@ -505,7 +491,10 @@ impl JobGenerator TenantDownloader<'a> { // Download the layers in the heatmap for timeline in heatmap.timelines { + let ctx = &ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline.timeline_id); let timeline_state = timeline_states .remove(&timeline.timeline_id) .expect("Just populated above"); @@ -883,8 +873,7 @@ impl<'a> TenantDownloader<'a> { let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap(); let layers_in_heatmap = heatmap_timeline - .layers - .iter() + .hot_layers() .map(|l| (&l.name, l.metadata.generation)) .collect::>(); let layers_on_disk = timeline_state @@ -1029,7 +1018,8 @@ impl<'a> TenantDownloader<'a> { // Accumulate updates to the state let mut touched = Vec::new(); - for layer in timeline.layers { + let timeline_id = timeline.timeline_id; + for layer in timeline.into_hot_layers() { if self.secondary_state.cancel.is_cancelled() { tracing::debug!("Cancelled -- dropping out of layer loop"); return (Err(UpdateError::Cancelled), touched); @@ -1054,7 +1044,7 @@ impl<'a> TenantDownloader<'a> { } match self - .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) + .download_layer(tenant_shard_id, &timeline_id, 
layer, ctx) .await { Ok(Some(layer)) => touched.push(layer), @@ -1162,7 +1152,7 @@ impl<'a> TenantDownloader<'a> { let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); let timeline_id = timeline.timeline_id; - tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count()); let (result, touched) = self .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx) @@ -1330,11 +1320,11 @@ async fn init_timeline_state( // As we iterate through layers found on disk, we will look up their metadata from this map. // Layers not present in metadata will be discarded. let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = - heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + heatmap.hot_layers().map(|l| (&l.name, l)).collect(); let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = if let Some(last_heatmap) = last_heatmap { - last_heatmap.layers.iter().map(|l| (&l.name, l)).collect() + last_heatmap.hot_layers().map(|l| (&l.name, l)).collect() } else { HashMap::new() }; diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 0fa10ca294..6dbb3f091f 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,11 +1,13 @@ -use std::{collections::HashMap, time::SystemTime}; - -use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName}; +use std::collections::HashMap; +use std::time::SystemTime; use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; +use serde_with::{DisplayFromStr, TimestampSeconds, serde_as}; +use utils::generation::Generation; +use utils::id::TimelineId; -use utils::{generation::Generation, id::TimelineId}; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use 
crate::tenant::storage_layer::LayerName; #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapTenant { @@ -40,7 +42,7 @@ pub(crate) struct HeatMapTimeline { #[serde_as(as = "DisplayFromStr")] pub(crate) timeline_id: TimelineId, - pub(crate) layers: Vec, + layers: Vec, } #[serde_as] @@ -51,8 +53,10 @@ pub(crate) struct HeatMapLayer { #[serde_as(as = "TimestampSeconds")] pub(crate) access_time: SystemTime, - // TODO: an actual 'heat' score that would let secondary locations prioritize downloading - // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. + + #[serde(default)] + pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading + // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. } impl HeatMapLayer { @@ -60,11 +64,13 @@ impl HeatMapLayer { name: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, + cold: bool, ) -> Self { Self { name, metadata, access_time, + cold, } } } @@ -76,6 +82,18 @@ impl HeatMapTimeline { layers, } } + + pub(crate) fn into_hot_layers(self) -> impl Iterator { + self.layers.into_iter().filter(|l| !l.cold) + } + + pub(crate) fn hot_layers(&self) -> impl Iterator { + self.layers.iter().filter(|l| !l.cold) + } + + pub(crate) fn all_layers(&self) -> impl Iterator { + self.layers.iter() + } } pub(crate) struct HeatMapStats { @@ -90,7 +108,7 @@ impl HeatMapTenant { layers: 0, }; for timeline in &self.timelines { - for layer in &timeline.layers { + for layer in timeline.hot_layers() { stats.layers += 1; stats.bytes += layer.metadata.file_size; } diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index d72c337369..3375714a66 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -1,42 +1,33 @@ -use std::{ - collections::HashMap, - pin::Pin, 
- sync::{Arc, Weak}, - time::{Duration, Instant}, -}; - -use crate::{ - metrics::SECONDARY_MODE, - tenant::{ - config::AttachmentMode, - mgr::{GetTenantError, TenantManager}, - remote_timeline_client::remote_heatmap_path, - span::debug_assert_current_span_has_tenant_id, - tasks::{warn_when_period_overrun, BackgroundLoopKind}, - Tenant, - }, - virtual_file::VirtualFile, - TEMP_FILE_SUFFIX, -}; +use std::collections::HashMap; +use std::pin::Pin; +use std::sync::{Arc, Weak}; +use std::time::{Duration, Instant}; use futures::Future; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; - -use super::{ - heatmap::HeatMapTenant, - scheduler::{ - self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, - TenantBackgroundJobs, - }, - CommandRequest, SecondaryTenantError, UploadCommand, -}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, Instrument}; -use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, - yielding_loop::yielding_loop, +use tracing::{Instrument, info_span, instrument}; +use utils::backoff; +use utils::completion::Barrier; +use utils::crashsafe::path_with_suffix_extension; +use utils::yielding_loop::yielding_loop; + +use super::heatmap::HeatMapTenant; +use super::scheduler::{ + self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs, period_jitter, + period_warmup, }; +use super::{CommandRequest, SecondaryTenantError, UploadCommand}; +use crate::TEMP_FILE_SUFFIX; +use crate::metrics::SECONDARY_MODE; +use crate::tenant::Tenant; +use crate::tenant::config::AttachmentMode; +use crate::tenant::mgr::{GetTenantError, TenantManager}; +use crate::tenant::remote_timeline_client::remote_heatmap_path; +use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun}; +use crate::virtual_file::VirtualFile; pub(super) async fn heatmap_uploader_task( 
tenant_manager: Arc, diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index e963c722b9..f948f9114f 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -1,16 +1,15 @@ -use futures::Future; -use rand::Rng; -use std::{ - collections::HashMap, - marker::PhantomData, - pin::Pin, - time::{Duration, Instant}, -}; +use std::collections::HashMap; +use std::marker::PhantomData; +use std::pin::Pin; +use std::time::{Duration, Instant}; +use futures::Future; use pageserver_api::shard::TenantShardId; +use rand::Rng; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use utils::{completion::Barrier, yielding_loop::yielding_loop}; +use utils::completion::Barrier; +use utils::yielding_loop::yielding_loop; use super::{CommandRequest, CommandResponse, SecondaryTenantError}; diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 1e84a9d9dc..8cc94b4e4d 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -4,21 +4,18 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use tenant_size_model::svg::SvgBranchKind; -use tokio::sync::oneshot::error::RecvError; +use tenant_size_model::{Segment, StorageModel}; use tokio::sync::Semaphore; +use tokio::sync::oneshot::error::RecvError; use tokio_util::sync::CancellationToken; - -use crate::context::RequestContext; -use crate::pgdatadir_mapping::CalculateLogicalSizeError; - -use super::{GcError, LogicalSizeCalculationCause, Tenant}; -use crate::tenant::{MaybeOffloaded, Timeline}; +use tracing::*; use utils::id::TimelineId; use utils::lsn::Lsn; -use tracing::*; - -use tenant_size_model::{Segment, StorageModel}; +use super::{GcError, LogicalSizeCalculationCause, Tenant}; +use crate::context::RequestContext; +use crate::pgdatadir_mapping::CalculateLogicalSizeError; +use crate::tenant::{MaybeOffloaded, Timeline}; /// Inputs to the actual tenant sizing 
model /// @@ -477,7 +474,7 @@ async fn fill_logical_sizes( if cached_size.is_none() { let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap()); let parallel_size_calcs = Arc::clone(limit); - let ctx = ctx.attached_child(); + let ctx = ctx.attached_child().with_scope_timeline(&timeline); joinset.spawn( calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx) .in_current_span(), @@ -498,7 +495,9 @@ async fn fill_logical_sizes( } Err(join_error) => { // cannot really do anything, as this panic is likely a bug - error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}"); + error!( + "task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}" + ); have_any_error = Some(CalculateSyntheticSizeError::Fatal( anyhow::anyhow!(join_error) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index f9f843ef6b..ece163b24a 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -10,42 +10,40 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; -use crate::config::PageServerConf; -use crate::context::{AccessStatsBehavior, RequestContext}; -use bytes::Bytes; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use pageserver_api::key::Key; -use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::value::Value; use std::cmp::Ordering; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::future::Future; use std::ops::Range; use std::pin::Pin; -use std::sync::atomic::AtomicUsize; use std::sync::Arc; +use std::sync::atomic::AtomicUsize; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use tracing::{trace, Instrument}; -use utils::sync::gate::GateGuard; - -use utils::lsn::Lsn; pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter}; +use bytes::Bytes; pub use 
delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; +pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; - -pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::value::Value; +use tracing::{Instrument, trace}; +use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; use self::inmemory_layer::InMemoryLayerFileId; - -use super::timeline::{GetVectoredError, ReadPath}; use super::PageReconstructError; +use super::layer_map::InMemoryLayerDesc; +use super::timeline::{GetVectoredError, ReadPath}; +use crate::config::PageServerConf; +use crate::context::{AccessStatsBehavior, RequestContext}; pub fn range_overlaps(a: &Range, b: &Range) -> bool where @@ -510,6 +508,7 @@ impl IoConcurrency { #[cfg(test)] pub(crate) fn spawn_for_test() -> impl std::ops::DerefMut { use std::ops::{Deref, DerefMut}; + use tracing::info; use utils::sync::gate::Gate; @@ -723,6 +722,12 @@ struct LayerToVisitId { lsn_floor: Lsn, } +#[derive(Debug, PartialEq, Eq, Hash)] +pub enum ReadableLayerWeak { + PersistentLayer(Arc), + InMemoryLayer(InMemoryLayerDesc), +} + /// Layer wrapper for the read path. Note that it is valid /// to use these layers even after external operations have /// been performed on them (compaction, freeze, etc.). 
@@ -875,7 +880,7 @@ impl ReadableLayer { } ReadableLayer::InMemoryLayer(layer) => { layer - .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) .await } } diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 7da51c27df..fd50e4805d 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -1,17 +1,22 @@ -use std::{future::Future, ops::Range, sync::Arc}; +use std::future::Future; +use std::ops::Range; +use std::sync::Arc; use bytes::Bytes; -use pageserver_api::key::{Key, KEY_SIZE}; -use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; - -use crate::tenant::storage_layer::Layer; -use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; +use pageserver_api::key::{KEY_SIZE, Key}; use pageserver_api::value::Value; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::shard::TenantShardId; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, }; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::tenant::Timeline; +use crate::tenant::storage_layer::Layer; pub(crate) enum BatchWriterResult { Produced(ResidentLayer), @@ -423,15 +428,10 @@ mod tests { use itertools::Itertools; use rand::{RngCore, SeedableRng}; - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::AsLayerDesc, - }, - DEFAULT_PG_VERSION, - }; - use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::AsLayerDesc; fn get_key(id: u32) -> Key { let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); diff --git 
a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 7ba0e3679f..62adae1680 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -27,6 +27,38 @@ //! "values" part. The actual page images and WAL records are stored in the //! "values" part. //! +use std::collections::{HashMap, VecDeque}; +use std::fs::File; +use std::io::SeekFrom; +use std::ops::Range; +use std::os::unix::fs::FileExt; +use std::str::FromStr; +use std::sync::Arc; + +use anyhow::{Context, Result, bail, ensure}; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; +use rand::Rng; +use rand::distributions::Alphanumeric; +use serde::{Deserialize, Serialize}; +use tokio::sync::OnceCell; +use tokio_epoll_uring::IoBuf; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; @@ -42,43 +74,8 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; -use crate::TEMP_FILE_SUFFIX; -use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{bail, ensure, Context, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use futures::StreamExt; -use 
itertools::Itertools; -use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::ImageCompressionAlgorithm; -use pageserver_api::shard::TenantShardId; -use pageserver_api::value::Value; -use rand::{distributions::Alphanumeric, Rng}; -use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, VecDeque}; -use std::fs::File; -use std::io::SeekFrom; -use std::ops::Range; -use std::os::unix::fs::FileExt; -use std::str::FromStr; -use std::sync::Arc; -use tokio::sync::OnceCell; -use tokio_epoll_uring::IoBuf; -use tracing::*; - -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use super::{ - AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; +use crate::virtual_file::{self, IoBufferMut, MaybeFatalIo, VirtualFile}; +use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; /// /// Header stored in the beginning of the file @@ -1130,10 +1127,11 @@ impl DeltaLayerInner { until: Lsn, ctx: &RequestContext, ) -> anyhow::Result { + use futures::stream::TryStreamExt; + use crate::tenant::vectored_blob_io::{ BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended, }; - use futures::stream::TryStreamExt; #[derive(Debug)] enum Item { @@ -1336,7 +1334,7 @@ impl DeltaLayerInner { block_reader, ); - tree_reader.dump().await?; + tree_reader.dump(ctx).await?; let keys = self.index_entries(ctx).await?; @@ -1599,23 +1597,21 @@ impl DeltaLayerIterator<'_> { pub(crate) mod test { use std::collections::BTreeMap; + use bytes::Bytes; use itertools::MinMaxResult; - use rand::prelude::{SeedableRng, SliceRandom, StdRng}; + use pageserver_api::value::Value; use rand::RngCore; + use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use super::*; - use crate::tenant::harness::TIMELINE_ID; + use crate::DEFAULT_PG_VERSION; + use crate::context::DownloadBehavior; + 
use crate::task_mgr::TaskKind; + use crate::tenant::disk_btree::tests::TestDisk; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{Tenant, Timeline}; - use crate::{ - context::DownloadBehavior, - task_mgr::TaskKind, - tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, - DEFAULT_PG_VERSION, - }; - use bytes::Bytes; - use pageserver_api::value::Value; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, @@ -1976,6 +1972,7 @@ pub(crate) mod test { .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx) .await .unwrap(); + let ctx = &ctx.with_scope_timeline(&timeline); let initdb_layer = timeline .layers @@ -2087,7 +2084,7 @@ pub(crate) mod test { .await .unwrap(); - let new_layer = new_layer.download_and_keep_resident().await.unwrap(); + let new_layer = new_layer.download_and_keep_resident(ctx).await.unwrap(); new_layer .copy_delta_prefix(&mut writer, truncate_at, ctx) diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index 8660be1fcc..8d172a1c19 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -1,18 +1,14 @@ -use std::{ops::Range, sync::Arc}; +use std::ops::Range; +use std::sync::Arc; use anyhow::bail; -use pageserver_api::{ - key::Key, - keyspace::{KeySpace, SparseKeySpace}, -}; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, SparseKeySpace}; +use pageserver_api::value::Value; use utils::lsn::Lsn; -use pageserver_api::value::Value; - -use super::{ - merge_iterator::{MergeIterator, MergeIteratorItem}, - PersistentLayerKey, -}; +use super::PersistentLayerKey; +use super::merge_iterator::{MergeIterator, MergeIteratorItem}; /// A filter 
iterator over merge iterators (and can be easily extended to other types of iterators). /// @@ -98,19 +94,14 @@ impl<'a> FilterIterator<'a> { #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; use pageserver_api::key::Key; use utils::lsn::Lsn; - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::produce_delta_layer, - }, - DEFAULT_PG_VERSION, - }; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::delta_layer::test::produce_delta_layer; async fn assert_filter_iter_equal( filter_iter: &mut FilterIterator<'_>, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index dc611bd6e1..2e6cee036c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -25,6 +25,39 @@ //! layer, and offsets to the other parts. The "index" is a B-tree, //! mapping from Key to an offset in the "values" part. The //! actual page images are stored in the "values" part. 
+use std::collections::{HashMap, VecDeque}; +use std::fs::File; +use std::io::SeekFrom; +use std::ops::Range; +use std::os::unix::prelude::FileExt; +use std::str::FromStr; +use std::sync::Arc; + +use anyhow::{Context, Result, bail, ensure}; +use bytes::Bytes; +use camino::{Utf8Path, Utf8PathBuf}; +use hex; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; +use rand::Rng; +use rand::distributions::Alphanumeric; +use serde::{Deserialize, Serialize}; +use tokio::sync::OnceCell; +use tokio_stream::StreamExt; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; @@ -39,43 +72,8 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{self, IoBufferMut, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{bail, ensure, Context, Result}; -use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; -use hex; -use itertools::Itertools; -use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::shard::{ShardIdentity, TenantShardId}; -use pageserver_api::value::Value; -use rand::{distributions::Alphanumeric, Rng}; -use 
serde::{Deserialize, Serialize}; -use std::collections::{HashMap, VecDeque}; -use std::fs::File; -use std::io::SeekFrom; -use std::ops::Range; -use std::os::unix::prelude::FileExt; -use std::str::FromStr; -use std::sync::Arc; -use tokio::sync::OnceCell; -use tokio_stream::StreamExt; -use tracing::*; - -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use super::layer_name::ImageLayerName; -use super::{ - AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; /// /// Header stored in the beginning of the file @@ -201,7 +199,7 @@ impl ImageLayerInner { block_reader, ); - tree_reader.dump().await?; + tree_reader.dump(ctx).await?; tree_reader .visit( @@ -1135,34 +1133,26 @@ impl ImageLayerIterator<'_> { #[cfg(test)] mod test { - use std::{sync::Arc, time::Duration}; + use std::sync::Arc; + use std::time::Duration; use bytes::Bytes; use itertools::Itertools; - use pageserver_api::{ - key::Key, - shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, - value::Value, - }; - use utils::{ - generation::Generation, - id::{TenantId, TimelineId}, - lsn::Lsn, - }; - - use crate::{ - context::RequestContext, - tenant::{ - config::TenantConf, - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::{Layer, ResidentLayer}, - vectored_blob_io::StreamingVectoredReadPlanner, - Tenant, Timeline, - }, - DEFAULT_PG_VERSION, - }; + use pageserver_api::key::Key; + use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; + use pageserver_api::value::Value; + use utils::generation::Generation; + use utils::id::{TenantId, TimelineId}; + use utils::lsn::Lsn; use super::{ImageLayerIterator, ImageLayerWriter}; + use crate::DEFAULT_PG_VERSION; + use crate::context::RequestContext; + use crate::tenant::config::TenantConf; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::{Layer, ResidentLayer}; + use 
crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; + use crate::tenant::{Tenant, Timeline}; #[tokio::test] async fn image_layer_rewrite() { @@ -1172,10 +1162,10 @@ mod test { ..TenantConf::default() }; let tenant_id = TenantId::generate(); - let mut gen = Generation::new(0xdead0001); + let mut gen_ = Generation::new(0xdead0001); let mut get_next_gen = || { - let ret = gen; - gen = gen.next(); + let ret = gen_; + gen_ = gen_.next(); ret }; // The LSN at which we will create an image layer to filter diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 61a0fdea8c..46135b5330 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -4,38 +4,39 @@ //! held in an ephemeral file, not in memory. The metadata for each page version, i.e. //! its position in the file, is kept in memory, though. //! -use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; +use std::cmp::Ordering; +use std::collections::{BTreeMap, HashMap}; +use std::fmt::Write; +use std::ops::Range; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering}; +use std::sync::{Arc, OnceLock}; +use std::time::Instant; + +use anyhow::Result; +use camino::Utf8PathBuf; +use pageserver_api::key::{CompactKey, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::models::InMemoryLayerInfo; +use pageserver_api::shard::TenantShardId; +use tokio::sync::RwLock; +use tracing::*; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::vec_map::VecMap; +use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; + +use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64, u64_to_usize}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; +// avoid 
binding to Write (conflicts with std::io::Write) +// while being able to use std::fmt::Write's methods +use crate::metrics::TIMELINE_EPHEMERAL_BYTES; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo}; use crate::tenant::timeline::GetVectoredError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; -use anyhow::Result; -use camino::Utf8PathBuf; -use pageserver_api::key::CompactKey; -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::InMemoryLayerInfo; -use pageserver_api::shard::TenantShardId; -use std::collections::{BTreeMap, HashMap}; -use std::sync::{Arc, OnceLock}; -use std::time::Instant; -use tracing::*; -use utils::{id::TimelineId, lsn::Lsn, vec_map::VecMap}; -use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; -// avoid binding to Write (conflicts with std::io::Write) -// while being able to use std::fmt::Write's methods -use crate::metrics::TIMELINE_EPHEMERAL_BYTES; -use std::cmp::Ordering; -use std::fmt::Write; -use std::ops::Range; -use std::sync::atomic::Ordering as AtomicOrdering; -use std::sync::atomic::{AtomicU64, AtomicUsize}; -use tokio::sync::RwLock; - -use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; pub(crate) mod vectored_dio_read; @@ -415,7 +416,7 @@ impl InMemoryLayer { pub(crate) async fn get_values_reconstruct_data( self: &Arc, keyspace: KeySpace, - end_lsn: Lsn, + lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { @@ -432,8 +433,6 @@ impl InMemoryLayer { let mut reads: HashMap> = HashMap::new(); let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); - let lsn_range = self.start_lsn..end_lsn; - for range in keyspace.ranges.iter() { for (key, vec_map) in inner .index @@ -555,7 +554,9 @@ impl InMemoryLayer { gate: 
&utils::sync::gate::Gate, ctx: &RequestContext, ) -> Result { - trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); + trace!( + "initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}" + ); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); @@ -816,8 +817,7 @@ mod tests { #[test] fn test_index_entry() { const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS; - use IndexEntryNewArgs as Args; - use IndexEntryUnpacked as Unpacked; + use {IndexEntryNewArgs as Args, IndexEntryUnpacked as Unpacked}; let roundtrip = |args, expect: Unpacked| { let res = IndexEntry::new(args).expect("this tests expects no errors"); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index 1d86015fab..90455fd0ca 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -1,16 +1,13 @@ -use std::{ - collections::BTreeMap, - sync::{Arc, RwLock}, -}; +use std::collections::BTreeMap; +use std::sync::{Arc, RwLock}; use itertools::Itertools; use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; -use crate::{ - assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, - context::RequestContext, - virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut}, -}; +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; +use crate::context::RequestContext; +use crate::virtual_file::IoBufferMut; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; /// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. 
pub trait File: Send { @@ -132,7 +129,9 @@ where let req_len = match cur { LogicalReadState::NotStarted(buf) => { if buf.len() != 0 { - panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`"); + panic!( + "The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`" + ); } // buf.cap() == 0 is ok @@ -141,7 +140,9 @@ where *state = LogicalReadState::Ongoing(buf); req_len } - x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"), + x => panic!( + "must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}" + ), }; // plan which chunks we need to read from @@ -422,15 +423,15 @@ impl Buffer for Vec { #[cfg(test)] #[allow(clippy::assertions_on_constants)] mod tests { + use std::cell::RefCell; + use std::collections::VecDeque; + use rand::Rng; - use crate::{ - context::DownloadBehavior, task_mgr::TaskKind, - virtual_file::owned_buffers_io::slice::SliceMutExt, - }; - use super::*; - use std::{cell::RefCell, collections::VecDeque}; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; struct InMemoryFile { content: Vec, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 0bf606cf0a..247092bf45 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,32 +1,32 @@ +use std::ops::Range; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::time::{Duration, SystemTime}; + use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; -use std::ops::Range; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; 
-use std::sync::{Arc, Weak}; -use std::time::{Duration, SystemTime}; use tracing::Instrument; +use utils::generation::Generation; use utils::id::TimelineId; use utils::lsn::Lsn; use utils::sync::{gate, heavier_once_cell}; -use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::task_mgr::TaskKind; -use crate::tenant::timeline::{CompactionError, GetVectoredError}; -use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; - use super::delta_layer::{self}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, }; - -use utils::generation::Generation; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; +use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::timeline::{CompactionError, GetVectoredError}; #[cfg(test)] mod tests; @@ -324,16 +324,16 @@ impl Layer { reconstruct_data: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let downloaded = self - .0 - .get_or_maybe_download(true, Some(ctx)) - .await - .map_err(|err| match err { - DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => { - GetVectoredError::Cancelled - } - other => GetVectoredError::Other(anyhow::anyhow!(other)), - })?; + let downloaded = + self.0 + .get_or_maybe_download(true, ctx) + .await + .map_err(|err| match err { + DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => { + GetVectoredError::Cancelled + } + other => GetVectoredError::Other(anyhow::anyhow!(other)), + })?; let this = ResidentLayer { downloaded: 
downloaded.clone(), owner: self.clone(), @@ -356,8 +356,8 @@ impl Layer { /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. - pub(crate) async fn download(&self) -> Result<(), DownloadError> { - self.0.get_or_maybe_download(true, None).await?; + pub(crate) async fn download(&self, ctx: &RequestContext) -> Result<(), DownloadError> { + self.0.get_or_maybe_download(true, ctx).await?; Ok(()) } @@ -392,8 +392,11 @@ impl Layer { } /// Downloads if necessary and creates a guard, which will keep this layer from being evicted. - pub(crate) async fn download_and_keep_resident(&self) -> Result { - let downloaded = self.0.get_or_maybe_download(true, None).await?; + pub(crate) async fn download_and_keep_resident( + &self, + ctx: &RequestContext, + ) -> Result { + let downloaded = self.0.get_or_maybe_download(true, ctx).await?; Ok(ResidentLayer { downloaded, @@ -446,7 +449,7 @@ impl Layer { if verbose { // for now, unconditionally download everything, even if that might not be wanted. - let l = self.0.get_or_maybe_download(true, Some(ctx)).await?; + let l = self.0.get_or_maybe_download(true, ctx).await?; l.dump(&self.0, ctx).await? 
} @@ -945,7 +948,7 @@ impl LayerInner { async fn get_or_maybe_download( self: &Arc, allow_download: bool, - ctx: Option<&RequestContext>, + ctx: &RequestContext, ) -> Result, DownloadError> { let (weak, permit) = { // get_or_init_detached can: @@ -1035,21 +1038,14 @@ impl LayerInner { return Err(DownloadError::NotFile(ft)); } - if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; - } + self.check_expected_download(ctx)?; if !allow_download { // this is only used from tests, but it is hard to test without the boolean return Err(DownloadError::DownloadRequired); } - let download_ctx = ctx - .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download)) - .unwrap_or(RequestContext::new( - TaskKind::LayerDownload, - DownloadBehavior::Download, - )); + let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download); async move { tracing::info!(%reason, "downloading on-demand"); @@ -1567,10 +1563,10 @@ impl LayerInner { self.access_stats.record_residence_event(); - self.status.as_ref().unwrap().send_replace(Status::Evicted); - *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now()); + self.status.as_ref().unwrap().send_replace(Status::Evicted); + Ok(()) } @@ -1873,8 +1869,8 @@ impl ResidentLayer { self.owner.record_access(ctx); let res = match inner { - Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await, - Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await, + Delta(d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await, + Image(i) => image_layer::ImageLayerInner::load_keys(i, ctx).await, }; res.with_context(|| format!("Layer index is corrupted for {self}")) } @@ -1920,7 +1916,7 @@ impl ResidentLayer { let owner = &self.owner.0; match self.downloaded.get(owner, ctx).await? 
{ - Delta(ref d) => d + Delta(d) => d .copy_prefix(writer, until, ctx) .await .with_context(|| format!("copy_delta_prefix until {until} of {self}")), @@ -1943,7 +1939,7 @@ impl ResidentLayer { ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { use LayerKind::*; match self.downloaded.get(&self.owner.0, ctx).await? { - Delta(ref d) => Ok(d), + Delta(d) => Ok(d), Image(_) => Err(anyhow::anyhow!("image layer")), } } @@ -1955,7 +1951,7 @@ impl ResidentLayer { ) -> anyhow::Result<&image_layer::ImageLayerInner> { use LayerKind::*; match self.downloaded.get(&self.owner.0, ctx).await? { - Image(ref d) => Ok(d), + Image(d) => Ok(d), Delta(_) => Err(anyhow::anyhow!("delta layer")), } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index d93c378ffc..7086429bfe 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,22 +1,15 @@ use std::time::UNIX_EPOCH; -use pageserver_api::key::{Key, CONTROLFILE_KEY}; +use pageserver_api::key::{CONTROLFILE_KEY, Key}; use tokio::task::JoinSet; -use utils::{ - completion::{self, Completion}, - id::TimelineId, -}; +use utils::completion::{self, Completion}; +use utils::id::TimelineId; use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{ - context::DownloadBehavior, - tenant::{ - harness::test_img, - storage_layer::{IoConcurrency, LayerVisibilityHint}, - }, -}; -use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; +use crate::context::DownloadBehavior; +use crate::tenant::harness::{TenantHarness, test_img}; +use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint}; /// Used in tests to advance a future to wanted await point, and not futher. 
const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600); @@ -33,11 +26,9 @@ async fn smoke_test() { let h = TenantHarness::create("smoke_test").await.unwrap(); let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); - let (tenant, _) = h.load().await; + let (tenant, ctx) = h.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); - let image_layers = vec![( Lsn(0x40), vec![( @@ -55,12 +46,14 @@ async fn smoke_test() { Lsn(0x10), 14, &ctx, + Default::default(), // in-memory layers Default::default(), image_layers, Lsn(0x100), ) .await .unwrap(); + let ctx = &ctx.with_scope_timeline(&timeline); // Grab one of the timeline's layers to exercise in the test, and the other layer that is just // there to avoid the timeline being illegally empty @@ -99,7 +92,7 @@ async fn smoke_test() { controlfile_keyspace.clone(), Lsn(0x10)..Lsn(0x11), &mut data, - &ctx, + ctx, ) .await .unwrap(); @@ -134,7 +127,7 @@ async fn smoke_test() { controlfile_keyspace.clone(), Lsn(0x10)..Lsn(0x11), &mut data, - &ctx, + ctx, ) .instrument(download_span.clone()) .await @@ -184,7 +177,7 @@ async fn smoke_test() { // plain downloading is rarely needed layer - .download_and_keep_resident() + .download_and_keep_resident(ctx) .instrument(download_span) .await .unwrap(); @@ -346,6 +339,7 @@ fn read_wins_pending_eviction() { .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await .unwrap(); + let ctx = ctx.with_scope_timeline(&timeline); let layer = { let mut layers = { @@ -385,7 +379,7 @@ fn read_wins_pending_eviction() { // because no actual eviction happened, we get to just reinitialize the DownloadedLayer layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .instrument(download_span) .await .expect("should had reinitialized without downloading"); @@ -478,6 +472,7 @@ fn 
multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await .unwrap(); + let ctx = ctx.with_scope_timeline(&timeline); let layer = { let mut layers = { @@ -520,7 +515,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { // because no actual eviction happened, we get to just reinitialize the DownloadedLayer layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .instrument(download_span) .await .expect("should had reinitialized without downloading"); @@ -647,7 +642,12 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await .unwrap(); + let ctx = ctx.with_scope_timeline(&timeline); + // This test does downloads + let ctx = RequestContextBuilder::extend(&ctx) + .download_behavior(DownloadBehavior::Download) + .build(); let layer = { let mut layers = { let layers = timeline.layers.read().await; @@ -680,7 +680,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { // simulate a cancelled read which is cancelled before it gets to re-initialize let e = layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .await .unwrap_err(); assert!( @@ -704,7 +704,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { // failpoint is still enabled, but it is not hit let e = layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .await .unwrap_err(); assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}"); @@ -727,6 +727,12 @@ async fn evict_and_wait_does_not_wait_for_download() { .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await .unwrap(); + let ctx = ctx.with_scope_timeline(&timeline); + + // This test does downloads + let ctx = RequestContextBuilder::extend(&ctx) + .download_behavior(DownloadBehavior::Download) + 
.build(); let layer = { let mut layers = { @@ -771,10 +777,12 @@ async fn evict_and_wait_does_not_wait_for_download() { let (arrival, _download_arrived) = utils::completion::channel(); layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier)); - let mut download = std::pin::pin!(layer - .0 - .get_or_maybe_download(true, None) - .instrument(download_span)); + let mut download = std::pin::pin!( + layer + .0 + .get_or_maybe_download(true, &ctx) + .instrument(download_span) + ); assert!( !layer.is_likely_resident(), diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index 2097e90764..ed16dcaa0d 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -1,16 +1,15 @@ use core::fmt::Display; -use pageserver_api::shard::TenantShardId; use std::ops::Range; -use utils::{id::TimelineId, lsn::Lsn}; use pageserver_api::key::Key; - -use super::{DeltaLayerName, ImageLayerName, LayerName}; - +use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; - #[cfg(test)] use utils::id::TenantId; +use utils::id::TimelineId; +use utils::lsn::Lsn; + +use super::{DeltaLayerName, ImageLayerName, LayerName}; /// A unique identifier of a persistent layer. /// diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index addf3b85d9..0f7995f87b 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -1,12 +1,12 @@ //! //! Helper functions for dealing with filenames of the image and delta layer files. //! 
-use pageserver_api::key::Key; use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::str::FromStr; +use pageserver_api::key::Key; use utils::lsn::Lsn; use super::PersistentLayerDesc; @@ -305,7 +305,7 @@ impl FromStr for LayerName { (None, None) => { return Err(format!( "neither delta nor image layer file name: {value:?}" - )) + )); } (Some(delta), None) => Self::Delta(delta), (None, Some(image)) => Self::Image(image), diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 19cfcb0867..76cdddd06a 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -1,21 +1,16 @@ -use std::{ - cmp::Ordering, - collections::{binary_heap, BinaryHeap}, - sync::Arc, -}; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, binary_heap}; +use std::sync::Arc; use anyhow::bail; use pageserver_api::key::Key; +use pageserver_api::value::Value; use utils::lsn::Lsn; +use super::delta_layer::{DeltaLayerInner, DeltaLayerIterator}; +use super::image_layer::{ImageLayerInner, ImageLayerIterator}; +use super::{PersistentLayerDesc, PersistentLayerKey}; use crate::context::RequestContext; -use pageserver_api::value::Value; - -use super::{ - delta_layer::{DeltaLayerInner, DeltaLayerIterator}, - image_layer::{ImageLayerInner, ImageLayerIterator}, - PersistentLayerDesc, PersistentLayerKey, -}; #[derive(Clone, Copy)] pub(crate) enum LayerRef<'a> { @@ -349,24 +344,18 @@ impl<'a> MergeIterator<'a> { #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; use pageserver_api::key::Key; - use utils::lsn::Lsn; - - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, - }, - DEFAULT_PG_VERSION, - }; - - #[cfg(feature = "testing")] - use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; #[cfg(feature = "testing")] use 
pageserver_api::record::NeonWalRecord; + use utils::lsn::Lsn; + + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + #[cfg(feature = "testing")] + use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; + use crate::tenant::storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}; async fn assert_merge_iter_equal( merge_iter: &mut MergeIterator<'_>, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 5e63f59fd8..589ac5ae88 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -8,24 +8,24 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use once_cell::sync::Lazy; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; use rand::Rng; use scopeguard::defer; use tokio::sync::{Semaphore, SemaphorePermit}; use tokio_util::sync::CancellationToken; use tracing::*; - -use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS}; -use crate::tenant::throttle::Stats; -use crate::tenant::timeline::compaction::CompactionOutcome; -use crate::tenant::timeline::CompactionError; -use crate::tenant::{Tenant, TenantState}; -use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; use utils::backoff::exponential_backoff_duration; use utils::completion::Barrier; use utils::pausable_failpoint; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind}; +use crate::tenant::throttle::Stats; +use crate::tenant::timeline::CompactionError; +use crate::tenant::timeline::compaction::CompactionOutcome; +use crate::tenant::{Tenant, TenantState}; + /// Semaphore limiting 
concurrent background tasks (across all tenants). /// /// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. @@ -287,15 +287,16 @@ fn log_compaction_error( sleep_duration: Duration, task_cancelled: bool, ) { - use crate::pgdatadir_mapping::CollectKeySpaceError; - use crate::tenant::upload_queue::NotInitialized; - use crate::tenant::PageReconstructError; use CompactionError::*; + use crate::tenant::PageReconstructError; + use crate::tenant::upload_queue::NotInitialized; + let level = match err { + e if e.is_cancel() => return, ShuttingDown => return, Offload(_) => Level::ERROR, - CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO, + AlreadyRunning(_) => Level::ERROR, CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { @@ -472,21 +473,15 @@ async fn wait_for_active_tenant( } let mut update_rx = tenant.subscribe_for_state_updates(); - loop { - tokio::select! { - _ = cancel.cancelled() => return ControlFlow::Break(()), - result = update_rx.changed() => if result.is_err() { + tokio::select! 
{ + result = update_rx.wait_for(|s| s == &TenantState::Active) => { + if result.is_err() { return ControlFlow::Break(()); } - } - - match &*update_rx.borrow() { - TenantState::Active => { - debug!("Tenant state changed to active, continuing the task loop"); - return ControlFlow::Continue(()); - } - state => debug!("Not running the task loop, tenant is not active: {state:?}"), - } + debug!("Tenant state changed to active, continuing the task loop"); + ControlFlow::Continue(()) + }, + _ = cancel.cancelled() => ControlFlow::Break(()), } } diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 300d779125..6c37c3771b 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -1,10 +1,6 @@ -use std::{ - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, - time::Instant, -}; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Instant; use arc_swap::ArcSwap; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 319c5e3d87..face2dfdc1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,55 +14,6 @@ pub mod span; pub mod uninit; mod walreceiver; -use anyhow::{anyhow, bail, ensure, Context, Result}; -use arc_swap::{ArcSwap, ArcSwapOption}; -use bytes::Bytes; -use camino::Utf8Path; -use chrono::{DateTime, Utc}; -use compaction::CompactionOutcome; -use enumset::EnumSet; -use fail::fail_point; -use futures::FutureExt; -use futures::{stream::FuturesUnordered, StreamExt}; -use handle::ShardTimelineId; -use layer_manager::Shutdown; -use offload::OffloadError; -use once_cell::sync::Lazy; -use pageserver_api::models::PageTraceEvent; -use pageserver_api::{ - key::{ - KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - SPARSE_RANGE, - }, - keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, - models::{ - 
CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, - }, - reltag::BlockNumber, - shard::{ShardIdentity, ShardNumber, TenantShardId}, -}; -use rand::Rng; -use remote_storage::DownloadError; -use serde_with::serde_as; -use storage_broker::BrokerClientChannel; -use tokio::runtime::Handle; -use tokio::sync::mpsc::Sender; -use tokio::sync::{oneshot, watch, Notify}; -use tokio_util::sync::CancellationToken; -use tracing::*; -use utils::critical; -use utils::rate_limit::RateLimit; -use utils::{ - fs_ext, - guard_arc_swap::GuardArcSwap, - pausable_failpoint, - postgres_client::PostgresClientProtocol, - sync::gate::{Gate, GateGuard}, -}; -use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; - use std::array; use std::cmp::{max, min}; use std::collections::btree_map::Entry; @@ -72,74 +23,60 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::l0_flush::{self, L0FlushGlobalState}; -use crate::tenant::storage_layer::ImageLayerName; -use crate::{ - aux_file::AuxFileSizeEstimator, - page_service::TenantManagerTypes, - tenant::{ - config::AttachmentMode, - layer_map::{LayerMap, SearchResult}, - metadata::TimelineMetadata, - storage_layer::{ - inmemory_layer::IndexEntry, BatchLayerWriter, IoConcurrency, PersistentLayerDesc, - ValueReconstructSituation, - }, - }, - walingest::WalLagCooldown, - walredo, -}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - disk_usage_eviction_task::DiskUsageEvictionInfo, - pgdatadir_mapping::CollectKeySpaceError, -}; -use crate::{ - disk_usage_eviction_task::finite_f32, - tenant::storage_layer::{ - AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, 
LayerName, ResidentLayer, ValueReconstructState, - ValuesReconstructState, - }, -}; -use crate::{ - disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, -}; -use crate::{ - metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, -}; -use crate::{ - pgdatadir_mapping::DirectoryKind, - virtual_file::{MaybeFatalIo, VirtualFile}, -}; -use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; +use anyhow::{Context, Result, anyhow, bail, ensure}; +use arc_swap::{ArcSwap, ArcSwapOption}; +use bytes::Bytes; +use camino::Utf8Path; +use chrono::{DateTime, Utc}; +use compaction::{CompactionOutcome, GcCompactionCombinedSettings}; +use enumset::EnumSet; +use fail::fail_point; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; +use handle::ShardTimelineId; +use layer_manager::Shutdown; +use offload::OffloadError; +use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; - -use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; -use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate}; -use crate::tenant::config::TenantConfOpt; -use pageserver_api::reltag::RelTag; -use pageserver_api::shard::ShardIndex; - -use postgres_connection::PgConnectionConfig; -use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE}; -use utils::{ - completion, - generation::Generation, - id::TimelineId, - lsn::{AtomicLsn, Lsn, RecordLsn}, - seqwait::SeqWait, - simple_rcu::{Rcu, RcuReadGuard}, +use pageserver_api::key::{ + KEY_SIZE, Key, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + SPARSE_RANGE, }; - -use crate::task_mgr; -use crate::task_mgr::TaskKind; -use 
crate::tenant::gc_result::GcResult; -use crate::ZERO_PAGE; -use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}; +use pageserver_api::models::{ + CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, + DetachBehavior, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, + EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, + TimelineState, +}; +use pageserver_api::reltag::{BlockNumber, RelTag}; +use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; +#[cfg(test)] +use pageserver_api::value::Value; +use postgres_connection::PgConnectionConfig; +use postgres_ffi::v14::xlog_utils; +use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp}; +use rand::Rng; +use remote_storage::DownloadError; +use serde_with::serde_as; +use storage_broker::BrokerClientChannel; +use tokio::runtime::Handle; +use tokio::sync::mpsc::Sender; +use tokio::sync::{Notify, oneshot, watch}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::generation::Generation; +use utils::guard_arc_swap::GuardArcSwap; +use utils::id::TimelineId; +use utils::logging::{MonitorSlowFutureCallback, monitor_slow_future}; +use utils::lsn::{AtomicLsn, Lsn, RecordLsn}; +use utils::postgres_client::PostgresClientProtocol; +use utils::rate_limit::RateLimit; +use utils::seqwait::SeqWait; +use utils::simple_rcu::{Rcu, RcuReadGuard}; +use utils::sync::gate::{Gate, GateGuard}; +use utils::{completion, critical, fs_ext, pausable_failpoint}; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; @@ -147,23 +84,49 @@ use self::eviction_task::EvictionTaskTimelineState; use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; - +use 
super::config::TenantConf; +use super::remote_timeline_client::index::{GcCompactionState, IndexPart}; +use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; +use super::secondary::heatmap::HeatMapLayer; +use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; +use super::upload_queue::NotInitialized; use super::{ - config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, - MaybeOffloaded, + AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, + debug_assert_current_span_has_tenant_and_timeline_id, }; -use super::{ - debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, HeatMapTimeline, +use crate::aux_file::AuxFileSizeEstimator; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32}; +use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::l0_flush::{self, L0FlushGlobalState}; +use crate::metrics::{ + DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL, + LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, }; -use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{ - remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, - storage_layer::ReadableLayer, +use crate::page_service::TenantManagerTypes; +use crate::pgdatadir_mapping::{ + CalculateLogicalSizeError, CollectKeySpaceError, DirectoryKind, LsnForTimestamp, + MAX_AUX_FILE_V2_DELTAS, MetricsUpdate, }; -use super::{secondary::heatmap::HeatMapLayer, GcError}; - -#[cfg(test)] -use pageserver_api::value::Value; +use crate::task_mgr::TaskKind; +use crate::tenant::config::{AttachmentMode, TenantConfOpt}; +use crate::tenant::gc_result::GcResult; +use crate::tenant::layer_map::{LayerMap, SearchResult}; +use crate::tenant::metadata::TimelineMetadata; +use 
crate::tenant::storage_layer::delta_layer::DeltaEntry; +use crate::tenant::storage_layer::inmemory_layer::IndexEntry; +use crate::tenant::storage_layer::{ + AsLayerDesc, BatchLayerWriter, DeltaLayerWriter, EvictionError, ImageLayerName, + ImageLayerWriter, InMemoryLayer, IoConcurrency, Layer, LayerAccessStatsReset, LayerName, + PersistentLayerDesc, PersistentLayerKey, ResidentLayer, ValueReconstructSituation, + ValueReconstructState, ValuesReconstructState, +}; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::timeline::logical_size::CurrentLogicalSize; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use crate::walingest::WalLagCooldown; +use crate::{ZERO_PAGE, task_mgr, walredo}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(crate) enum FlushLoopState { @@ -323,7 +286,10 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, - pub(super) metrics: TimelineMetrics, + // The LSN of gc-compaction that was last applied to this timeline. + gc_compaction_state: ArcSwap>, + + pub(crate) metrics: Arc, // `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code // in `crate::page_service` writes these metrics. @@ -473,12 +439,18 @@ pub struct Timeline { /// May host a background Tokio task which downloads all the layers from the current /// heatmap on demand. heatmap_layers_downloader: Mutex>, + + pub(crate) rel_size_v2_status: ArcSwapOption, + + wait_lsn_log_slow: tokio::sync::Semaphore, } pub(crate) enum PreviousHeatmap { Active { heatmap: HeatMapTimeline, read_at: std::time::Instant, + // End LSN covered by the heatmap if known + end_lsn: Option, }, Obsolete, } @@ -1363,10 +1335,6 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { - // Record the total number of layers visited towards each key in the batch. 
While some - // layers may not intersect with a given read, and the cost of layer visits are - // amortized across the batch, each visited layer contributes directly to the observed - // latency for every read in the batch, which is what we care about. if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); @@ -1381,9 +1349,23 @@ impl Timeline { }); } + // Records the number of layers visited in a few different ways: + // + // * LAYERS_PER_READ: all layers count towards every read in the batch, because each + // layer directly affects its observed latency. + // + // * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch + // layer visits and access cost. + // + // * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized + // read amplification after batching. + let layers_visited = layers_visited as f64; + let avg_layers_visited = layers_visited / results.len() as f64; + LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited); for _ in &results { - self.metrics.layers_per_read.observe(layers_visited as f64); - LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); + self.metrics.layers_per_read.observe(layers_visited); + LAYERS_PER_READ_GLOBAL.observe(layers_visited); + LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited); } } @@ -1470,13 +1452,22 @@ impl Timeline { | TaskKind::WalReceiverConnectionHandler | TaskKind::WalReceiverConnectionPoller => { let is_myself = match who_is_waiting { - WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), - WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService | WaitLsnWaiter::HttpEndpoint => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + WaitLsnWaiter::Timeline(waiter) => { + Weak::ptr_eq(&waiter.myself, &self.myself) + } + WaitLsnWaiter::Tenant + | WaitLsnWaiter::PageService + | 
WaitLsnWaiter::HttpEndpoint => unreachable!( + "tenant or page_service context are not expected to have task kind {:?}", + ctx.task_kind() + ), }; if is_myself { if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here - panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock"); + panic!( + "this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock" + ); } } else { // if another timeline's is waiting for us, there's no deadlock risk because @@ -1492,25 +1483,75 @@ impl Timeline { WaitLsnTimeout::Default => self.conf.wait_lsn_timeout, }; - let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); + let timer = crate::metrics::WAIT_LSN_TIME.start_timer(); + let start_finish_counterpair_guard = self.metrics.wait_lsn_start_finish_counterpair.guard(); - match self.last_record_lsn.wait_for_timeout(lsn, timeout).await { + let wait_for_timeout = self.last_record_lsn.wait_for_timeout(lsn, timeout); + let wait_for_timeout = std::pin::pin!(wait_for_timeout); + // Use threshold of 1 because even 1 second of wait for ingest is very much abnormal. + let log_slow_threshold = Duration::from_secs(1); + // Use period of 10 to avoid flooding logs during an outage that affects all timelines. + let log_slow_period = Duration::from_secs(10); + let mut logging_permit = None; + let wait_for_timeout = monitor_slow_future( + log_slow_threshold, + log_slow_period, + wait_for_timeout, + |MonitorSlowFutureCallback { + ready, + is_slow, + elapsed_total, + elapsed_since_last_callback, + }| { + self.metrics + .wait_lsn_in_progress_micros + .inc_by(u64::try_from(elapsed_since_last_callback.as_micros()).unwrap()); + if !is_slow { + return; + } + // It's slow, see if we should log it. 
+ // (We limit the logging to one per invocation per timeline to avoid excessive + // logging during an extended broker / networking outage that affects all timelines.) + if logging_permit.is_none() { + logging_permit = self.wait_lsn_log_slow.try_acquire().ok(); + } + if logging_permit.is_none() { + return; + } + // We log it. + if ready { + info!( + "slow wait_lsn completed after {:.3}s", + elapsed_total.as_secs_f64() + ); + } else { + info!( + "slow wait_lsn still running for {:.3}s", + elapsed_total.as_secs_f64() + ); + } + }, + ); + let res = wait_for_timeout.await; + // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo + drop(logging_permit); + drop(start_finish_counterpair_guard); + drop(timer); + match res { Ok(()) => Ok(()), Err(e) => { use utils::seqwait::SeqWaitError::*; match e { Shutdown => Err(WaitLsnError::Shutdown), Timeout => { - // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo - drop(_timer); let walreceiver_status = self.walreceiver_status(); Err(WaitLsnError::Timeout(format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", - lsn, - self.get_last_record_lsn(), - self.get_disk_consistent_lsn(), - walreceiver_status, - ))) + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", + lsn, + self.get_last_record_lsn(), + self.get_disk_consistent_lsn(), + walreceiver_status, + ))) } } } @@ -1614,10 +1655,18 @@ impl Timeline { if init || validate { let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); if lsn < *latest_gc_cutoff_lsn { - bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + bail!( + "tried to request an lsn lease for an lsn below the latest gc cutoff. 
requested at {} gc cutoff {}", + lsn, + *latest_gc_cutoff_lsn + ); } if lsn < planned_cutoff { - bail!("tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", lsn, planned_cutoff); + bail!( + "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", + lsn, + planned_cutoff + ); } } @@ -1741,7 +1790,9 @@ impl Timeline { // This is not harmful, but it only happens in relatively rare cases where // time-based checkpoints are not happening fast enough to keep the amount of // ephemeral data within configured limits. It's a sign of stress on the system. - tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure"); + tracing::info!( + "Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure" + ); } } @@ -1867,7 +1918,9 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { - warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); + warn!( + "Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}" + ); return Ok(CompactionOutcome::Skipped); } @@ -1880,15 +1933,25 @@ impl Timeline { }; // Signal compaction failure to avoid L0 flush stalls when it's broken. - match result { + match &result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), - Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => { + Err(e) if e.is_cancel() => {} + Err(CompactionError::ShuttingDown) => { + // Covered by the `Err(e) if e.is_cancel()` branch. + } + Err(CompactionError::AlreadyRunning(_)) => { + // Covered by the `Err(e) if e.is_cancel()` branch. 
+ } + Err(CompactionError::Other(_)) => { + self.compaction_failed.store(true, AtomicOrdering::Relaxed) + } + Err(CompactionError::CollectKeySpaceError(_)) => { + // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch. self.compaction_failed.store(true, AtomicOrdering::Relaxed) } // Don't change the current value on offload failure or shutdown. We don't want to // abruptly stall nor resume L0 flushes in these cases. Err(CompactionError::Offload(_)) => {} - Err(CompactionError::ShuttingDown) => {} }; result @@ -2028,7 +2091,9 @@ impl Timeline { // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but // we also do a final check here to ensure that the queue is empty. if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + warn!( + "still have pending work in remote upload queue, but continuing shutting down anyways" + ); } } } @@ -2037,7 +2102,9 @@ impl Timeline { // drain the upload queue self.remote_client.shutdown().await; if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + warn!( + "still have pending work in remote upload queue, but continuing shutting down anyways" + ); } } @@ -2199,6 +2266,7 @@ impl Timeline { pub(crate) async fn download_layer( &self, layer_file_name: &LayerName, + ctx: &RequestContext, ) -> Result, super::storage_layer::layer::DownloadError> { let Some(layer) = self .find_layer(layer_file_name) @@ -2212,7 +2280,7 @@ impl Timeline { return Ok(None); }; - layer.download().await?; + layer.download(ctx).await?; Ok(Some(true)) } @@ -2367,6 +2435,9 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + /// Returns `true` if the rel_size_v2 config is enabled. 
NOTE: the write path and read path + /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is + /// possible that the index part persists the state while the config doesn't get persisted. pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2375,6 +2446,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) } + pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration { + self.rel_size_v2_status + .load() + .as_ref() + .map(|s| s.as_ref().clone()) + .unwrap_or(RelSizeMigration::Legacy) + } + fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2398,8 +2477,9 @@ impl Timeline { } fn get_l0_flush_delay_threshold(&self) -> Option { - // Disable L0 flushes by default. This and compaction needs further tuning. - const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3 + // By default, delay L0 flushes at 3x the compaction threshold. The compaction threshold + // defaults to 10, and L0 compaction is generally able to keep L0 counts below 30. + const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 3; // If compaction is disabled, don't delay. if self.get_compaction_period() == Duration::ZERO { @@ -2427,8 +2507,9 @@ impl Timeline { } fn get_l0_flush_stall_threshold(&self) -> Option { - // Disable L0 stalls by default. In ingest benchmarks, we see image compaction take >10 - // minutes, blocking L0 compaction, and we can't stall L0 flushes for that long. + // Disable L0 stalls by default. Stalling can cause unavailability if L0 compaction isn't + // responsive, and it can e.g. block on other compaction via the compaction semaphore or + // sibling timelines. We need more confidence before enabling this. const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 0; // TODO: default to e.g. 5 // If compaction is disabled, don't stall. 
@@ -2531,6 +2612,31 @@ impl Timeline { ) } + fn get_gc_compaction_settings(&self) -> GcCompactionCombinedSettings { + let tenant_conf = &self.tenant_conf.load(); + let gc_compaction_enabled = tenant_conf + .tenant_conf + .gc_compaction_enabled + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled); + let gc_compaction_initial_threshold_kb = tenant_conf + .tenant_conf + .gc_compaction_initial_threshold_kb + .unwrap_or( + self.conf + .default_tenant_conf + .gc_compaction_initial_threshold_kb, + ); + let gc_compaction_ratio_percent = tenant_conf + .tenant_conf + .gc_compaction_ratio_percent + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent); + GcCompactionCombinedSettings { + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + } + } + fn get_image_creation_preempt_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2609,6 +2715,8 @@ impl Timeline { state: TimelineState, attach_wal_lag_cooldown: Arc>, create_idempotency: crate::tenant::CreateTimelineIdempotency, + gc_compaction_state: Option, + rel_size_v2_status: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2633,14 +2741,14 @@ impl Timeline { } Arc::new_cyclic(|myself| { - let metrics = TimelineMetrics::new( + let metrics = Arc::new(TimelineMetrics::new( &tenant_shard_id, &timeline_id, crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( "mtime", evictions_low_residence_duration_metric_threshold, ), - ); + )); let aux_file_metrics = metrics.aux_file_size_gauge.clone(); let mut result = Timeline { @@ -2667,6 +2775,8 @@ impl Timeline { }), disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), + gc_compaction_state: ArcSwap::new(Arc::new(gc_compaction_state)), + last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), last_freeze_ts: RwLock::new(Instant::now()), @@ -2765,6 +2875,10 @@ impl Timeline { previous_heatmap: 
ArcSwapOption::from_pointee(previous_heatmap), heatmap_layers_downloader: Mutex::new(None), + + rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status), + + wait_lsn_log_slow: tokio::sync::Semaphore::new(1), }; result.repartition_threshold = @@ -2820,7 +2934,7 @@ impl Timeline { "layer flush task", async move { let _guard = guard; - let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); + let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error).with_scope_timeline(&self_clone); self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); assert!(matches!(*flush_loop_state, FlushLoopState::Running{..})); @@ -2831,6 +2945,30 @@ impl Timeline { ); } + pub(crate) fn update_gc_compaction_state( + &self, + gc_compaction_state: GcCompactionState, + ) -> anyhow::Result<()> { + self.gc_compaction_state + .store(Arc::new(Some(gc_compaction_state.clone()))); + self.remote_client + .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state) + } + + pub(crate) fn update_rel_size_v2_status( + &self, + rel_size_v2_status: RelSizeMigration, + ) -> anyhow::Result<()> { + self.rel_size_v2_status + .store(Some(Arc::new(rel_size_v2_status.clone()))); + self.remote_client + .schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status) + } + + pub(crate) fn get_gc_compaction_state(&self) -> Option { + self.gc_compaction_state.load_full().as_ref().clone() + } + /// Creates and starts the wal receiver. 
/// /// This function is expected to be called at most once per Timeline's lifecycle @@ -2899,8 +3037,9 @@ impl Timeline { disk_consistent_lsn: Lsn, index_part: IndexPart, ) -> anyhow::Result<()> { - use init::{Decision::*, Discovered, DismissedLayer}; use LayerName::*; + use init::Decision::*; + use init::{Discovered, DismissedLayer}; let mut guard = self.layers.write().await; @@ -3115,11 +3254,15 @@ impl Timeline { } TimelineState::Loading => { // Import does not return an activated timeline. - info!("discarding priority boost for logical size calculation because timeline is not yet active"); + info!( + "discarding priority boost for logical size calculation because timeline is not yet active" + ); } TimelineState::Active => { // activation should be setting the once cell - warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + warn!( + "unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work" + ); debug_assert!(false); } } @@ -3524,12 +3667,16 @@ impl Timeline { Ok(layer) } - pub(super) fn is_previous_heatmap_active(&self) -> bool { - self.previous_heatmap - .load() - .as_ref() - .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. })) - .unwrap_or(false) + pub(super) fn should_keep_previous_heatmap(&self, new_heatmap_end_lsn: Lsn) -> bool { + let crnt = self.previous_heatmap.load(); + match crnt.as_deref() { + Some(PreviousHeatmap::Active { end_lsn, .. }) => match end_lsn { + Some(crnt_end_lsn) => *crnt_end_lsn > new_heatmap_end_lsn, + None => true, + }, + Some(PreviousHeatmap::Obsolete) => false, + None => false, + } } /// The timeline heatmap is a hint to secondary locations from the primary location, @@ -3557,26 +3704,26 @@ impl Timeline { // heatamp. 
let previous_heatmap = self.previous_heatmap.load(); let visible_non_resident = match previous_heatmap.as_deref() { - Some(PreviousHeatmap::Active { heatmap, read_at }) => { - Some(heatmap.layers.iter().filter_map(|hl| { - let desc: PersistentLayerDesc = hl.name.clone().into(); - let layer = guard.try_get_from_key(&desc.key())?; + Some(PreviousHeatmap::Active { + heatmap, read_at, .. + }) => Some(heatmap.all_layers().filter_map(|hl| { + let desc: PersistentLayerDesc = hl.name.clone().into(); + let layer = guard.try_get_from_key(&desc.key())?; - if layer.visibility() == LayerVisibilityHint::Covered { - return None; - } + if layer.visibility() == LayerVisibilityHint::Covered { + return None; + } - if layer.is_likely_resident() { - return None; - } + if layer.is_likely_resident() { + return None; + } - if layer.last_evicted_at().happened_after(*read_at) { - return None; - } + if layer.last_evicted_at().happened_after(*read_at) { + return None; + } - Some((desc, hl.metadata.clone(), hl.access_time)) - })) - } + Some((desc, hl.metadata.clone(), hl.access_time, hl.cold)) + })), Some(PreviousHeatmap::Obsolete) => None, None => None, }; @@ -3591,6 +3738,7 @@ impl Timeline { layer.layer_desc().clone(), layer.metadata(), last_activity_ts, + false, // these layers are not cold )) } LayerVisibilityHint::Covered => { @@ -3617,12 +3765,14 @@ impl Timeline { // Sort layers in order of which to download first. For a large set of layers to download, we // want to prioritize those layers which are most likely to still be in the resident many minutes // or hours later: + // - Cold layers go last for convenience when a human inspects the heatmap. // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might // only exist for a few minutes before being compacted into L1s. // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner // the layer is likely to be covered by an image layer during compaction. 
- layers.sort_by_key(|(desc, _meta, _atime)| { + layers.sort_by_key(|(desc, _meta, _atime, cold)| { std::cmp::Reverse(( + *cold, !LayerMap::is_l0(&desc.key_range, desc.is_delta), desc.lsn_range.end, )) @@ -3630,7 +3780,9 @@ impl Timeline { let layers = layers .into_iter() - .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime)) + .map(|(desc, meta, atime, cold)| { + HeatMapLayer::new(desc.layer_name(), meta, atime, cold) + }) .collect(); Some(HeatMapTimeline::new(self.timeline_id, layers)) @@ -3650,6 +3802,7 @@ impl Timeline { name: vl.layer_desc().layer_name(), metadata: vl.metadata(), access_time: now, + cold: true, }; heatmap_layers.push(hl); } @@ -3663,6 +3816,7 @@ impl Timeline { PreviousHeatmap::Active { heatmap, read_at: Instant::now(), + end_lsn: Some(end_lsn), } } @@ -3861,39 +4015,22 @@ impl Timeline { let guard = timeline.layers.read().await; let layers = guard.layer_map()?; - let in_memory_layer = layers.find_in_memory_layer(|l| { - let start_lsn = l.get_lsn_range().start; - cont_lsn > start_lsn - }); + for range in unmapped_keyspace.ranges.iter() { + let results = layers.range_search(range.clone(), cont_lsn); - match in_memory_layer { - Some(l) => { - let lsn_range = l.get_lsn_range().start..cont_lsn; - fringe.update( - ReadableLayer::InMemoryLayer(l), - unmapped_keyspace.clone(), - lsn_range, - ); - } - None => { - for range in unmapped_keyspace.ranges.iter() { - let results = layers.range_search(range.clone(), cont_lsn); - - results - .found - .into_iter() - .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { - ( - ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), - keyspace_accum.to_keyspace(), - lsn_floor..cont_lsn, - ) - }) - .for_each(|(layer, keyspace, lsn_range)| { - fringe.update(layer, keyspace, lsn_range) - }); - } - } + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + guard.upgrade(layer), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) 
+ }) + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); } // It's safe to drop the layer map lock after planning the next round of reads. @@ -4166,10 +4303,6 @@ impl Timeline { // Stall flushes to backpressure if compaction can't keep up. This is propagated up // to WAL ingestion by having ephemeral layer rolls wait for flushes. - // - // NB: the compaction loop only checks `compaction_threshold` every 20 seconds, so - // we can end up stalling before compaction even starts. Consider making it more - // responsive (e.g. via `watch_level0_deltas`). if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() { if l0_count >= stall_threshold { warn!( @@ -4259,10 +4392,14 @@ impl Timeline { // This path is only taken for tenants with multiple shards: single sharded tenants should // never encounter a gap in the wal. let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); - tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}"); + tracing::debug!( + "Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}" + ); if self.set_disk_consistent_lsn(frozen_to_lsn) { if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) { - tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}"); + tracing::warn!( + "Failed to schedule metadata upload after updating disk_consistent_lsn: {e}" + ); } } } @@ -4487,7 +4624,10 @@ impl Timeline { /// This function must only be used from the layer flush task. 
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { let old_value = self.disk_consistent_lsn.fetch_max(new_value); - assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + assert!( + new_value >= old_value, + "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}" + ); self.metrics .disk_consistent_lsn_gauge @@ -4645,10 +4785,7 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self - .collect_keyspace(lsn, ctx) - .await - .map_err(CompactionError::CollectKeySpaceError)?; + let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], @@ -4782,7 +4919,9 @@ impl Timeline { // any metadata keys, keys, as that would lead to actual data // loss. if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() { - warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); + warn!( + "could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}" + ); ZERO_PAGE.clone() } else { return Err(CreateImageLayersError::from(err)); @@ -4861,7 +5000,8 @@ impl Timeline { let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; info!( - "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64() + "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", + elapsed.as_secs_f64() ); if !trigger_generation && mode == ImageLayerCreationMode::Try { @@ -5183,7 +5323,8 @@ impl Timeline { if 
should_yield { tracing::info!( "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers", - partition.start().unwrap(), partition.end().unwrap() + partition.start().unwrap(), + partition.end().unwrap() ); last_partition_processed = Some(partition.clone()); all_generated = false; @@ -5305,9 +5446,10 @@ impl Timeline { self: &Arc, tenant: &crate::tenant::Tenant, options: detach_ancestor::Options, + behavior: DetachBehavior, ctx: &RequestContext, ) -> Result { - detach_ancestor::prepare(self, tenant, options, ctx).await + detach_ancestor::prepare(self, tenant, behavior, options, ctx).await } /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and @@ -5323,9 +5465,21 @@ impl Timeline { self: &Arc, tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, + behavior: DetachBehavior, ctx: &RequestContext, ) -> Result { - detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await + detach_ancestor::detach_and_reparent( + self, + tenant, + prepared, + ancestor_timeline_id, + ancestor_lsn, + behavior, + ctx, + ) + .await } /// Final step which unblocks the GC. @@ -5370,9 +5524,40 @@ pub(crate) enum CompactionError { Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. #[error("Failed to collect keyspace: {0}")] - CollectKeySpaceError(CollectKeySpaceError), + CollectKeySpaceError(#[from] CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), + #[error("Compaction already running: {0}")] + AlreadyRunning(&'static str), +} + +impl CompactionError { + /// Errors that can be ignored, i.e., cancel and shutdown. 
+ pub fn is_cancel(&self) -> bool { + matches!( + self, + Self::ShuttingDown + | Self::AlreadyRunning(_) + | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled) + | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead( + PageReconstructError::Cancelled + )) + | Self::Offload(OffloadError::Cancelled) + ) + } + + /// Critical errors that indicate data corruption. + pub fn is_critical(&self) -> bool { + matches!( + self, + Self::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ) + ) + ) + } } impl From for CompactionError { @@ -5384,18 +5569,6 @@ impl From for CompactionError { } } -impl From for CompactionError { - fn from(err: CollectKeySpaceError) -> Self { - match err { - CollectKeySpaceError::Cancelled - | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => { - CompactionError::ShuttingDown - } - e => CompactionError::Other(e.into()), - } - } -} - impl From for CompactionError { fn from(value: super::upload_queue::NotInitialized) -> Self { match value { @@ -5479,6 +5652,14 @@ pub struct DeltaLayerTestDesc { pub data: Vec<(Key, Lsn, Value)>, } +#[cfg(test)] +#[derive(Clone)] +pub struct InMemoryLayerTestDesc { + pub lsn_range: Range, + pub data: Vec<(Key, Lsn, Value)>, + pub is_open: bool, +} + #[cfg(test)] impl DeltaLayerTestDesc { pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { @@ -5539,7 +5720,9 @@ impl Timeline { // because we have not implemented L0 => L0 compaction. duplicated_layers.insert(l.layer_desc().key()); } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) { - return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); + return Err(CompactionError::Other(anyhow::anyhow!( + "compaction generates a L0 layer file as output, which will cause infinite compaction." 
+ ))); } else { insert_layers.push(l.clone()); } @@ -5663,8 +5846,10 @@ impl Timeline { .await { Ok((index_part, index_generation, _index_mtime)) => { - tracing::info!("GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", - index_part.metadata.latest_gc_cutoff_lsn()); + tracing::info!( + "GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", + index_part.metadata.latest_gc_cutoff_lsn() + ); Ok(Some(index_part.metadata.latest_gc_cutoff_lsn())) } Err(DownloadError::NotFound) => { @@ -6073,9 +6258,7 @@ impl Timeline { if let Some((img_lsn, img)) = &data.img { trace!( "found page image for key {} at {}, no WAL redo required, req LSN {}", - key, - img_lsn, - request_lsn, + key, img_lsn, request_lsn, ); Ok(img.clone()) } else { @@ -6104,7 +6287,12 @@ impl Timeline { request_lsn ); } else { - trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); + trace!( + "found {} WAL records that will init the page for {} at {}, performing WAL redo", + data.records.len(), + key, + request_lsn + ); }; let res = self .walredo_mgr @@ -6131,6 +6319,7 @@ impl Timeline { pub(crate) async fn spawn_download_all_remote_layers( self: Arc, request: DownloadRemoteLayersTaskSpawnRequest, + ctx: &RequestContext, ) -> Result { use pageserver_api::models::DownloadRemoteLayersTaskState; @@ -6151,6 +6340,10 @@ impl Timeline { } let self_clone = Arc::clone(&self); + let task_ctx = ctx.detached_child( + TaskKind::DownloadAllRemoteLayers, + DownloadBehavior::Download, + ); let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, @@ -6158,7 +6351,7 @@ impl Timeline { Some(self.timeline_id), "download all remote layers task", async move { - self_clone.download_all_remote_layers(request).await; + self_clone.download_all_remote_layers(request, &task_ctx).await; let mut status_guard = 
self_clone.download_all_remote_layers_task_info.write().unwrap(); match &mut *status_guard { None => { @@ -6193,6 +6386,7 @@ impl Timeline { async fn download_all_remote_layers( self: &Arc, request: DownloadRemoteLayersTaskSpawnRequest, + ctx: &RequestContext, ) { use pageserver_api::models::DownloadRemoteLayersTaskState; @@ -6249,9 +6443,10 @@ impl Timeline { let span = tracing::info_span!("download", layer = %next); + let ctx = ctx.attached_child(); js.spawn( async move { - let res = next.download().await; + let res = next.download(&ctx).await; (next, res) } .instrument(span), @@ -6477,6 +6672,92 @@ impl Timeline { Ok(()) } + /// Force create an in-memory layer and place them into the layer map. + #[cfg(test)] + pub(super) async fn force_create_in_memory_layer( + self: &Arc, + mut in_memory: InMemoryLayerTestDesc, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use utils::bin_ser::BeSer; + + // Validate LSNs + if let Some(check_start_lsn) = check_start_lsn { + assert!(in_memory.lsn_range.start >= check_start_lsn); + } + + let last_record_lsn = self.get_last_record_lsn(); + let layer_end_lsn = if in_memory.is_open { + in_memory + .data + .iter() + .map(|(_key, lsn, _value)| lsn) + .max() + .cloned() + } else { + Some(in_memory.lsn_range.end) + }; + + if let Some(end) = layer_end_lsn { + assert!( + end <= last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", + end, + last_record_lsn, + ); + } + + in_memory.data.iter().for_each(|(_key, lsn, _value)| { + assert!(*lsn >= in_memory.lsn_range.start); + assert!(*lsn < in_memory.lsn_range.end); + }); + + // Build the batch + in_memory + .data + .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + + let data = in_memory + .data + .into_iter() + .map(|(key, lsn, value)| { + let value_size = value.serialized_size().unwrap() as usize; + (key.to_compact(), lsn, value_size, value) + }) + .collect::>(); + + let batch = 
SerializedValueBatch::from_values(data); + + // Create the in-memory layer and write the batch into it + let layer = InMemoryLayer::create( + self.conf, + self.timeline_id, + self.tenant_shard_id, + in_memory.lsn_range.start, + &self.gate, + ctx, + ) + .await + .unwrap(); + + layer.put_batch(batch, ctx).await.unwrap(); + if !in_memory.is_open { + layer.freeze(in_memory.lsn_range.end).await; + } + + info!("force created in-memory layer {:?}", in_memory.lsn_range); + + // Link the layer to the layer map + { + let mut guard = self.layers.write().await; + let layer_map = guard.open_mut().unwrap(); + layer_map.force_insert_in_memory_layer(Arc::new(layer)); + } + + Ok(()) + } + /// Return all keys at the LSN in the image layers #[cfg(test)] pub(crate) async fn inspect_image_layers( @@ -6648,7 +6929,9 @@ impl TimelineWriter<'_> { if let Some(wait_threshold) = wait_threshold { if l0_count >= wait_threshold { - debug!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); + debug!( + "layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers" + ); self.tl.wait_flush_completion(flush_id).await?; } } @@ -6834,22 +7117,22 @@ mod tests { use pageserver_api::key::Key; use pageserver_api::value::Value; + use std::iter::Iterator; use tracing::Instrument; - use utils::{id::TimelineId, lsn::Lsn}; - - use crate::tenant::{ - harness::{test_img, TenantHarness}, - layer_map::LayerMap, - storage_layer::{Layer, LayerName, LayerVisibilityHint}, - timeline::{DeltaLayerTestDesc, EvictionError}, - PreviousHeatmap, Timeline, - }; + use utils::id::TimelineId; + use utils::lsn::Lsn; use super::HeatMapTimeline; + use crate::context::RequestContextBuilder; + use crate::tenant::harness::{TenantHarness, test_img}; + use crate::tenant::layer_map::LayerMap; + use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint}; + use crate::tenant::timeline::{DeltaLayerTestDesc, EvictionError}; + use crate::tenant::{PreviousHeatmap, 
Timeline}; fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { - assert_eq!(lhs.layers.len(), rhs.layers.len()); - let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter()); + assert_eq!(lhs.all_layers().count(), rhs.all_layers().count()); + let lhs_rhs = lhs.all_layers().zip(rhs.all_layers()); for (l, r) in lhs_rhs { assert_eq!(l.name, r.name); assert_eq!(l.metadata, r.metadata); @@ -6908,12 +7191,14 @@ mod tests { Lsn(0x10), 14, &ctx, + Vec::new(), // in-memory layers delta_layers, image_layers, Lsn(0x100), ) .await .unwrap(); + let ctx = &ctx.with_scope_timeline(&timeline); // Layer visibility is an input to heatmap generation, so refresh it first timeline.update_layer_visibility().await.unwrap(); @@ -6926,10 +7211,11 @@ mod tests { assert_eq!(heatmap.timeline_id, timeline.timeline_id); // L0 should come last - assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name()); + let heatmap_layers = heatmap.all_layers().collect::>(); + assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name()); let mut last_lsn = Lsn::MAX; - for layer in &heatmap.layers { + for layer in heatmap_layers { // Covered layer should be omitted assert!(layer.name != covered_delta.layer_name()); @@ -6962,6 +7248,7 @@ mod tests { .store(Some(Arc::new(PreviousHeatmap::Active { heatmap: heatmap.clone(), read_at: std::time::Instant::now(), + end_lsn: None, }))); // Generate a new heatmap and assert that it contains the same layers as the old one. 
@@ -6977,8 +7264,12 @@ mod tests { eprintln!("Downloading {layer} and re-generating heatmap"); + let ctx = &RequestContextBuilder::extend(ctx) + .download_behavior(crate::context::DownloadBehavior::Download) + .build(); + let _resident = layer - .download_and_keep_resident() + .download_and_keep_resident(ctx) .instrument(tracing::info_span!( parent: None, "download_layer", @@ -7036,6 +7327,7 @@ mod tests { Lsn(0x10), 14, &ctx, + Vec::new(), // in-memory layers delta_layers, image_layers, Lsn(0x100), @@ -7052,7 +7344,7 @@ mod tests { .expect("Infallible while timeline is not shut down"); // Both layers should be in the heatmap - assert!(!heatmap.layers.is_empty()); + assert!(heatmap.all_layers().count() > 0); // Now simulate a migration. timeline @@ -7060,6 +7352,7 @@ mod tests { .store(Some(Arc::new(PreviousHeatmap::Active { heatmap: heatmap.clone(), read_at: std::time::Instant::now(), + end_lsn: None, }))); // Evict all the layers in the previous heatmap @@ -7077,7 +7370,7 @@ mod tests { .await .expect("Infallible while timeline is not shut down"); - assert!(post_eviction_heatmap.layers.is_empty()); + assert_eq!(post_eviction_heatmap.all_layers().count(), 0); assert!(matches!( timeline.previous_heatmap.load().as_deref(), Some(PreviousHeatmap::Obsolete) diff --git a/pageserver/src/tenant/timeline/analysis.rs b/pageserver/src/tenant/timeline/analysis.rs index 6009b0b79a..96864ec44b 100644 --- a/pageserver/src/tenant/timeline/analysis.rs +++ b/pageserver/src/tenant/timeline/analysis.rs @@ -1,4 +1,5 @@ -use std::{collections::BTreeSet, ops::Range}; +use std::collections::BTreeSet; +use std::ops::Range; use utils::lsn::Lsn; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d75591bd74..300daec9bf 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -7,37 +7,47 @@ use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, 
Range}; use std::sync::Arc; +use std::time::Instant; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, - ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration, + CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, + GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, }; -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow}; use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; +use futures::FutureExt; use itertools::Itertools; -use pageserver_api::key::KEY_SIZE; -use pageserver_api::keyspace::ShardedRange; +use once_cell::sync::Lazy; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; +use pageserver_api::key::{KEY_SIZE, Key}; +use pageserver_api::keyspace::{KeySpace, ShardedRange}; use pageserver_api::models::CompactInfoResponse; +use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; +use pageserver_compaction::helpers::{fully_contains, overlaps_with}; +use pageserver_compaction::interface::*; use serde::Serialize; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, trace, warn, Instrument}; +use tracing::{Instrument, debug, error, info, info_span, trace, warn}; use utils::critical; use utils::id::TimelineId; +use utils::lsn::Lsn; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; -use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; use crate::tenant::layer_map::LayerMap; use crate::tenant::remote_timeline_client::WaitCompletionError; +use 
crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, }; @@ -46,24 +56,12 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; -use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; -use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency}; -use crate::tenant::timeline::{Layer, ResidentLayer}; -use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded}; +use crate::tenant::timeline::{ + DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, + ResidentLayer, drop_rlock, +}; +use crate::tenant::{DeltaLayer, MaybeOffloaded, gc_block}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; -use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; - -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::value::Value; - -use utils::lsn::Lsn; - -use pageserver_compaction::helpers::{fully_contains, overlaps_with}; -use pageserver_compaction::interface::*; - -use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. 
const COMPACTION_DELTA_THRESHOLD: usize = 5; @@ -77,13 +75,22 @@ impl std::fmt::Display for GcCompactionJobId { } } +pub struct GcCompactionCombinedSettings { + pub gc_compaction_enabled: bool, + pub gc_compaction_initial_threshold_kb: u64, + pub gc_compaction_ratio_percent: u64, +} + #[derive(Debug, Clone)] pub enum GcCompactionQueueItem { - Manual(CompactOptions), + MetaJob { + /// Compaction options + options: CompactOptions, + /// Whether the compaction is triggered automatically (determines whether we need to update L2 LSN) + auto: bool, + }, SubCompactionJob(CompactOptions), - #[allow(dead_code)] - UpdateL2Lsn(Lsn), - Notify(GcCompactionJobId), + Notify(GcCompactionJobId, Option), } impl GcCompactionQueueItem { @@ -93,7 +100,7 @@ impl GcCompactionQueueItem { running: bool, ) -> Option { match self { - GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse { + GcCompactionQueueItem::MetaJob { options, .. } => Some(CompactInfoResponse { compact_key_range: options.compact_key_range, compact_lsn_range: options.compact_lsn_range, sub_compaction: options.sub_compaction, @@ -107,17 +114,22 @@ impl GcCompactionQueueItem { running, job_id: id.0, }), - GcCompactionQueueItem::UpdateL2Lsn(_) => None, - GcCompactionQueueItem::Notify(_) => None, + GcCompactionQueueItem::Notify(_, _) => None, } } } +#[derive(Default)] +struct GcCompactionGuardItems { + notify: Option>, + gc_guard: Option, + permit: Option, +} + struct GcCompactionQueueInner { running: Option<(GcCompactionJobId, GcCompactionQueueItem)>, queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, - notify: HashMap>, - gc_guards: HashMap, + guards: HashMap, last_id: GcCompactionJobId, } @@ -137,14 +149,18 @@ pub struct GcCompactionQueue { consumer_lock: tokio::sync::Mutex<()>, } +static CONCURRENT_GC_COMPACTION_TASKS: Lazy> = Lazy::new(|| { + // Only allow two timelines on one pageserver to run gc compaction at a time. 
+ Arc::new(Semaphore::new(2)) +}); + impl GcCompactionQueue { pub fn new() -> Self { GcCompactionQueue { inner: std::sync::Mutex::new(GcCompactionQueueInner { running: None, queued: VecDeque::new(), - notify: HashMap::new(), - gc_guards: HashMap::new(), + guards: HashMap::new(), last_id: GcCompactionJobId(0), }), consumer_lock: tokio::sync::Mutex::new(()), @@ -154,8 +170,9 @@ impl GcCompactionQueue { pub fn cancel_scheduled(&self) { let mut guard = self.inner.lock().unwrap(); guard.queued.clear(); - guard.notify.clear(); - guard.gc_guards.clear(); + // TODO: if there is a running job, we should keep the gc guard. However, currently, the cancel + // API is only used for testing purposes, so we can drop everything here. + guard.guards.clear(); } /// Schedule a manual compaction job. @@ -166,29 +183,169 @@ impl GcCompactionQueue { ) -> GcCompactionJobId { let mut guard = self.inner.lock().unwrap(); let id = guard.next_id(); - guard - .queued - .push_back((id, GcCompactionQueueItem::Manual(options))); - if let Some(notify) = notify { - guard.notify.insert(id, notify); - } + guard.queued.push_back(( + id, + GcCompactionQueueItem::MetaJob { + options, + auto: false, + }, + )); + guard.guards.entry(id).or_default().notify = notify; info!("scheduled compaction job id={}", id); id } + /// Schedule an auto compaction job. + fn schedule_auto_compaction( + &self, + options: CompactOptions, + permit: OwnedSemaphorePermit, + ) -> GcCompactionJobId { + let mut guard = self.inner.lock().unwrap(); + let id = guard.next_id(); + guard.queued.push_back(( + id, + GcCompactionQueueItem::MetaJob { + options, + auto: true, + }, + )); + guard.guards.entry(id).or_default().permit = Some(permit); + id + } + /// Trigger an auto compaction. 
- #[allow(dead_code)] - pub fn trigger_auto_compaction(&self, _: &Arc) {} + pub async fn trigger_auto_compaction( + &self, + timeline: &Arc, + ) -> Result<(), CompactionError> { + let GcCompactionCombinedSettings { + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + } = timeline.get_gc_compaction_settings(); + if !gc_compaction_enabled { + return Ok(()); + } + if self.remaining_jobs_num() > 0 { + // Only schedule auto compaction when the queue is empty + return Ok(()); + } + if timeline.ancestor_timeline().is_some() { + // Do not trigger auto compaction for child timelines. We haven't tested + // it enough in staging yet. + return Ok(()); + } + if timeline.get_gc_compaction_watermark() == Lsn::INVALID { + // If the gc watermark is not set, we don't need to trigger auto compaction. + // This check is the same as in `gc_compaction_split_jobs` but we don't log + // here and we can also skip the computation of the trigger condition earlier. + return Ok(()); + } + + let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else { + // Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure + // the fairness of the lock across timelines. We should listen for both `acquire` and `l0_compaction_trigger` + // to ensure the fairness while avoid starving other tasks. 
+ return Ok(()); + }; + + let gc_compaction_state = timeline.get_gc_compaction_state(); + let l2_lsn = gc_compaction_state + .map(|x| x.last_completed_lsn) + .unwrap_or(Lsn::INVALID); + + let layers = { + let guard = timeline.layers.read().await; + let layer_map = guard.layer_map()?; + layer_map.iter_historic_layers().collect_vec() + }; + let mut l2_size: u64 = 0; + let mut l1_size = 0; + let gc_cutoff = *timeline.get_applied_gc_cutoff_lsn(); + for layer in layers { + if layer.lsn_range.start <= l2_lsn { + l2_size += layer.file_size(); + } else if layer.lsn_range.start <= gc_cutoff { + l1_size += layer.file_size(); + } + } + + fn trigger_compaction( + l1_size: u64, + l2_size: u64, + gc_compaction_initial_threshold_kb: u64, + gc_compaction_ratio_percent: u64, + ) -> bool { + const AUTO_TRIGGER_LIMIT: u64 = 150 * 1024 * 1024 * 1024; // 150GB + if l1_size >= AUTO_TRIGGER_LIMIT || l2_size >= AUTO_TRIGGER_LIMIT { + // Do not auto-trigger when physical size >= 150GB + return false; + } + // initial trigger + if l2_size == 0 && l1_size >= gc_compaction_initial_threshold_kb * 1024 { + info!( + "trigger auto-compaction because l1_size={} >= gc_compaction_initial_threshold_kb={}", + l1_size, gc_compaction_initial_threshold_kb + ); + return true; + } + // size ratio trigger + if l2_size == 0 { + return false; + } + if l1_size as f64 / l2_size as f64 >= (gc_compaction_ratio_percent as f64 / 100.0) { + info!( + "trigger auto-compaction because l1_size={} / l2_size={} > gc_compaction_ratio_percent={}", + l1_size, l2_size, gc_compaction_ratio_percent + ); + return true; + } + false + } + + if trigger_compaction( + l1_size, + l2_size, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + ) { + self.schedule_auto_compaction( + CompactOptions { + flags: { + let mut flags = EnumSet::new(); + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + flags + }, + sub_compaction: true, + compact_key_range: None, + compact_lsn_range: None, + 
sub_compaction_max_job_size_mb: None, + }, + permit, + ); + info!( + "scheduled auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", + l1_size, l2_size, l2_lsn, gc_cutoff + ); + } else { + debug!( + "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", + l1_size, l2_size, l2_lsn, gc_cutoff + ); + } + Ok(()) + } /// Notify the caller the job has finished and unblock GC. fn notify_and_unblock(&self, id: GcCompactionJobId) { info!("compaction job id={} finished", id); let mut guard = self.inner.lock().unwrap(); - if let Some(blocking) = guard.gc_guards.remove(&id) { - drop(blocking) - } - if let Some(tx) = guard.notify.remove(&id) { - let _ = tx.send(()); + if let Some(items) = guard.guards.remove(&id) { + drop(items.gc_guard); + if let Some(tx) = items.notify { + let _ = tx.send(()); + } } } @@ -198,15 +355,17 @@ impl GcCompactionQueue { options: CompactOptions, timeline: &Arc, gc_block: &GcBlock, + auto: bool, ) -> Result<(), CompactionError> { - info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs: Vec = timeline + info!( + "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); + let jobs = timeline .gc_compaction_split_jobs( GcCompactJob::from_compact_options(options.clone()), options.sub_compaction_max_job_size_mb, ) - .await - .map_err(CompactionError::Other)?; + .await?; if jobs.is_empty() { info!("no jobs to run, skipping scheduled compaction task"); self.notify_and_unblock(id); @@ -223,6 +382,9 @@ impl GcCompactionQueue { let jobs_len = jobs.len(); let mut pending_tasks = Vec::new(); + // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate. + // And therefore, we simply assume the maximum LSN of all jobs is the expected L2 LSN. 
+ let expected_l2_lsn = jobs.iter().map(|job| job.compact_lsn_range.end).max(); for job in jobs { // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` // until we do further refactors to allow directly call `compact_with_gc`. @@ -231,6 +393,9 @@ impl GcCompactionQueue { if job.dry_run { flags |= CompactFlags::DryRun; } + if options.flags.contains(CompactFlags::NoYield) { + flags |= CompactFlags::NoYield; + } let options = CompactOptions { flags, sub_compaction: false, @@ -240,10 +405,16 @@ impl GcCompactionQueue { }; pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options)); } - pending_tasks.push(GcCompactionQueueItem::Notify(id)); + + if !auto { + pending_tasks.push(GcCompactionQueueItem::Notify(id, None)); + } else { + pending_tasks.push(GcCompactionQueueItem::Notify(id, expected_l2_lsn)); + } + { let mut guard = self.inner.lock().unwrap(); - guard.gc_guards.insert(id, gc_guard); + guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); let mut tasks = Vec::new(); for task in pending_tasks { let id = guard.next_id(); @@ -254,7 +425,10 @@ impl GcCompactionQueue { guard.queued.push_front(item); } } - info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); + info!( + "scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", + jobs_len + ); } Ok(()) } @@ -267,29 +441,49 @@ impl GcCompactionQueue { gc_block: &GcBlock, timeline: &Arc, ) -> Result { - let _one_op_at_a_time_guard = self.consumer_lock.lock().await; - let has_pending_tasks; - let (id, item) = { - let mut guard = self.inner.lock().unwrap(); - let Some((id, item)) = guard.queued.pop_front() else { - return Ok(CompactionOutcome::Done); - }; - guard.running = Some((id, item.clone())); - has_pending_tasks = !guard.queued.is_empty(); - (id, item) + let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { + return Err(CompactionError::AlreadyRunning( + "cannot run 
gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.", + )); + }; + let has_pending_tasks; + let mut yield_for_l0 = false; + let Some((id, item)) = ({ + let mut guard = self.inner.lock().unwrap(); + if let Some((id, item)) = guard.queued.pop_front() { + guard.running = Some((id, item.clone())); + has_pending_tasks = !guard.queued.is_empty(); + Some((id, item)) + } else { + has_pending_tasks = false; + None + } + }) else { + self.trigger_auto_compaction(timeline).await?; + // Always yield after triggering auto-compaction. Gc-compaction is a low-priority task and we + // have not implemented preemption mechanism yet. We always want to yield it to more important + // tasks if there is one. + return Ok(CompactionOutcome::Done); }; - match item { - GcCompactionQueueItem::Manual(options) => { + GcCompactionQueueItem::MetaJob { options, auto } => { if !options .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) { - warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options); + warn!( + "ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", + options + ); } else if options.sub_compaction { - self.handle_sub_compaction(id, options, timeline, gc_block) + info!( + "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); + self.handle_sub_compaction(id, options, timeline, gc_block, auto) .await?; } else { + // Auto compaction always enables sub-compaction so we don't need to handle update_l2_lsn + // in this branch. 
let gc_guard = match gc_block.start().await { Ok(guard) => guard, Err(e) => { @@ -301,27 +495,57 @@ impl GcCompactionQueue { }; { let mut guard = self.inner.lock().unwrap(); - guard.gc_guards.insert(id, gc_guard); + guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); } - let _ = timeline.compact_with_options(cancel, options, ctx).await?; + let compaction_result = + timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); + if compaction_result == CompactionOutcome::YieldForL0 { + yield_for_l0 = true; + } } } GcCompactionQueueItem::SubCompactionJob(options) => { - let _ = timeline.compact_with_options(cancel, options, ctx).await?; + // TODO: error handling, clear the queue if any task fails? + let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?; + if compaction_result == CompactionOutcome::YieldForL0 { + // We will permanently give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running + // again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because + // we need to clean things up before returning from the function. 
+ yield_for_l0 = true; + } } - GcCompactionQueueItem::Notify(id) => { + GcCompactionQueueItem::Notify(id, l2_lsn) => { self.notify_and_unblock(id); - } - GcCompactionQueueItem::UpdateL2Lsn(_) => { - unreachable!() + if let Some(l2_lsn) = l2_lsn { + let current_l2_lsn = timeline + .get_gc_compaction_state() + .map(|x| x.last_completed_lsn) + .unwrap_or(Lsn::INVALID); + if l2_lsn >= current_l2_lsn { + info!("l2_lsn updated to {}", l2_lsn); + timeline + .update_gc_compaction_state(GcCompactionState { + last_completed_lsn: l2_lsn, + }) + .map_err(CompactionError::Other)?; + } else { + warn!( + "l2_lsn updated to {} but it is less than the current l2_lsn {}", + l2_lsn, current_l2_lsn + ); + } + } } } { let mut guard = self.inner.lock().unwrap(); guard.running = None; } - Ok(if has_pending_tasks { + Ok(if yield_for_l0 { + tracing::info!("give up gc-compaction: yield for L0 compaction"); + CompactionOutcome::YieldForL0 + } else if has_pending_tasks { CompactionOutcome::Pending } else { CompactionOutcome::Done @@ -339,7 +563,6 @@ impl GcCompactionQueue { (guard.running.clone(), guard.queued.clone()) } - #[allow(dead_code)] pub fn remaining_jobs_num(&self) -> usize { let guard = self.inner.lock().unwrap(); guard.queued.len() + if guard.running.is_some() { 1 } else { 0 } @@ -520,17 +743,41 @@ struct CompactionStatisticsNumSize { #[derive(Debug, Serialize, Default)] pub struct CompactionStatistics { + /// Delta layer visited (maybe compressed, physical size) delta_layer_visited: CompactionStatisticsNumSize, + /// Image layer visited (maybe compressed, physical size) image_layer_visited: CompactionStatisticsNumSize, + /// Delta layer produced (maybe compressed, physical size) delta_layer_produced: CompactionStatisticsNumSize, + /// Image layer produced (maybe compressed, physical size) image_layer_produced: CompactionStatisticsNumSize, - num_delta_layer_discarded: usize, - num_image_layer_discarded: usize, + /// Delta layer discarded (maybe compressed, physical size of the 
layer being discarded instead of the original layer) + delta_layer_discarded: CompactionStatisticsNumSize, + /// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer) + image_layer_discarded: CompactionStatisticsNumSize, num_unique_keys_visited: usize, + /// Delta visited (uncompressed, original size) wal_keys_visited: CompactionStatisticsNumSize, + /// Image visited (uncompressed, original size) image_keys_visited: CompactionStatisticsNumSize, + /// Delta produced (uncompressed, original size) wal_produced: CompactionStatisticsNumSize, + /// Image produced (uncompressed, original size) image_produced: CompactionStatisticsNumSize, + + // Time spent in each phase + time_acquire_lock_secs: f64, + time_analyze_secs: f64, + time_download_layer_secs: f64, + time_main_loop_secs: f64, + time_final_phase_secs: f64, + time_total_secs: f64, + + // Summary + /// Ratio of the key-value size before/after gc-compaction. + uncompressed_size_ratio: f64, + /// Ratio of the physical size before/after gc-compaction. 
+ physical_size_ratio: f64, } impl CompactionStatistics { @@ -580,11 +827,13 @@ impl CompactionStatistics { self.image_produced.num += 1; self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64; } - fn discard_delta_layer(&mut self) { - self.num_delta_layer_discarded += 1; + fn discard_delta_layer(&mut self, original_size: u64) { + self.delta_layer_discarded.num += 1; + self.delta_layer_discarded.size += original_size; } - fn discard_image_layer(&mut self) { - self.num_image_layer_discarded += 1; + fn discard_image_layer(&mut self, original_size: u64) { + self.image_layer_discarded.num += 1; + self.image_layer_discarded.size += original_size; } fn produce_delta_layer(&mut self, size: u64) { self.delta_layer_produced.num += 1; @@ -594,6 +843,19 @@ impl CompactionStatistics { self.image_layer_produced.num += 1; self.image_layer_produced.size += size; } + fn finalize(&mut self) { + let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size; + let produced_key_value_size = self.image_produced.size + self.wal_produced.size; + self.uncompressed_size_ratio = + original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0 + let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size; + let produced_physical_size = self.image_layer_produced.size + + self.delta_layer_produced.size + + self.image_layer_discarded.size + + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate + self.physical_size_ratio = + original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0 + } } #[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] @@ -626,9 +888,7 @@ impl Timeline { .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) { - self.compact_with_gc(cancel, options, ctx) - .await - .map_err(CompactionError::Other)?; + self.compact_with_gc(cancel, options, ctx).await?; return 
Ok(CompactionOutcome::Done); } @@ -771,24 +1031,21 @@ impl Timeline { self.upload_new_image_layers(image_layers)?; if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { // Yield and do not do any other kind of compaction. - info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + info!( + "skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)." + ); return Ok(CompactionOutcome::YieldForL0); } } // Suppress errors when cancelled. Err(_) if self.cancel.is_cancelled() => {} - Err(CompactionError::ShuttingDown) => {} + Err(err) if err.is_cancel() => {} // Alert on critical errors that indicate data corruption. - Err( - err @ CompactionError::CollectKeySpaceError( - CollectKeySpaceError::Decode(_) - | CollectKeySpaceError::PageRead( - PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), - ), - ), - ) => critical!("could not compact, repartitioning keyspace failed: {err:?}"), + Err(err) if err.is_critical() => { + critical!("could not compact, repartitioning keyspace failed: {err:?}"); + } // Log other errors. No partitioning? This is normal, if the timeline was just created // as an empty timeline. Also in unit tests, when we use the timeline as a simple @@ -796,7 +1053,7 @@ impl Timeline { Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), }; - let partition_count = self.partitioning.read().0 .0.parts.len(); + let partition_count = self.partitioning.read().0.0.parts.len(); // 4. 
Shard ancestor compaction @@ -837,7 +1094,7 @@ impl Timeline { let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn(); tracing::info!( - "latest_gc_cutoff: {}, pitr cutoff {}", + "starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}", *latest_gc_cutoff, self.gc_info.read().unwrap().cutoffs.time ); @@ -866,6 +1123,7 @@ impl Timeline { // Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being // wrong. If ShardedRange claims the local page count is zero, then no keys in this layer // should be !is_key_disposable() + // TODO: exclude sparse keyspace from this check, otherwise it will infinitely loop. let range = layer_desc.get_key_range(); let mut key = range.start; while key < range.end { @@ -961,7 +1219,7 @@ impl Timeline { // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: // - GC, which at worst witnesses us "undelete" a layer that they just deleted. // - ingestion, which only inserts layers, therefore cannot collide with us. 
- let resident = layer.download_and_keep_resident().await?; + let resident = layer.download_and_keep_resident(ctx).await?; let keys_written = resident .filter(&self.shard_identity, &mut image_layer_writer, ctx) @@ -1005,7 +1263,7 @@ impl Timeline { Ok(()) => (), Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { - return Err(CompactionError::ShuttingDown) + return Err(CompactionError::ShuttingDown); } } @@ -1189,14 +1447,14 @@ impl Timeline { let mut fully_compacted = true; - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); + deltas_to_compact.push(first_level0_delta.download_and_keep_resident(ctx).await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; if lsn_range.start != prev_lsn_end { break; } - deltas_to_compact.push(l.download_and_keep_resident().await?); + deltas_to_compact.push(l.download_and_keep_resident(ctx).await?); deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; @@ -1300,7 +1558,7 @@ impl Timeline { let last_record_lsn = self.get_last_record_lsn(); let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; let min_hole_coverage_size = 3; // TODO: something more flexible? 
- // min-heap (reserve space for one more element added before eviction) + // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); let mut prev: Option = None; @@ -2149,12 +2407,19 @@ impl Timeline { async fn check_compaction_space( self: &Arc, layer_selection: &[Layer], - ) -> anyhow::Result<()> { - let available_space = self.check_available_space().await?; + ) -> Result<(), CompactionError> { + let available_space = self + .check_available_space() + .await + .map_err(CompactionError::Other)?; let mut remote_layer_size = 0; let mut all_layer_size = 0; for layer in layer_selection { - let needs_download = layer.needs_download().await?; + let needs_download = layer + .needs_download() + .await + .context("failed to check if layer needs download") + .map_err(CompactionError::Other)?; if needs_download.is_some() { remote_layer_size += layer.layer_desc().file_size; } @@ -2163,8 +2428,14 @@ impl Timeline { let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space { - return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", - available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size)); + return Err(CompactionError::Other(anyhow!( + "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", + available_space, + allocated_space, + all_layer_size, + remote_layer_size, + all_layer_size + remote_layer_size + ))); } Ok(()) } @@ -2195,7 +2466,7 @@ impl Timeline { self: &Arc, job: GcCompactJob, sub_compaction_max_job_size_mb: Option, - ) -> anyhow::Result> { + ) -> Result, CompactionError> { let 
compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX { job.compact_lsn_range.end } else { @@ -2203,7 +2474,9 @@ impl Timeline { }; if compact_below_lsn == Lsn::INVALID { - tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); + tracing::warn!( + "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" + ); return Ok(vec![]); } @@ -2344,11 +2617,14 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result { let sub_compaction = options.sub_compaction; let job = GcCompactJob::from_compact_options(options.clone()); + let no_yield = options.flags.contains(CompactFlags::NoYield); if sub_compaction { - info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + info!( + "running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); let jobs = self .gc_compaction_split_jobs(job, options.sub_compaction_max_job_size_mb) .await?; @@ -2359,14 +2635,15 @@ impl Timeline { idx + 1, jobs_len ); - self.compact_with_gc_inner(cancel, job, ctx).await?; + self.compact_with_gc_inner(cancel, job, ctx, no_yield) + .await?; } if jobs_len == 0 { info!("no jobs to run, skipping gc bottom-most compaction"); } - return Ok(()); + return Ok(CompactionOutcome::Done); } - self.compact_with_gc_inner(cancel, job, ctx).await + self.compact_with_gc_inner(cancel, job, ctx, no_yield).await } async fn compact_with_gc_inner( @@ -2374,19 +2651,25 @@ impl Timeline { cancel: &CancellationToken, job: GcCompactJob, ctx: &RequestContext, - ) -> anyhow::Result<()> { + no_yield: bool, + ) -> Result { // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. 
// Note that we already acquired the compaction lock when the outer `compact` function gets called. + let timer = Instant::now(); + let begin_timer = timer; + let gc_lock = async { tokio::select! { guard = self.gc_lock.lock() => Ok(guard), - // TODO: refactor to CompactionError to correctly pass cancelled error - _ = cancel.cancelled() => Err(anyhow!("cancelled")), + _ = cancel.cancelled() => Err(CompactionError::ShuttingDown), } }; + let time_acquire_lock = timer.elapsed(); + let timer = Instant::now(); + let gc_lock = crate::timed( gc_lock, "acquires gc lock", @@ -2400,7 +2683,13 @@ impl Timeline { let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); - info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end); + info!( + "running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", + compact_key_range.start, + compact_key_range.end, + compact_lsn_range.start, + compact_lsn_range.end + ); scopeguard::defer! { info!("done enhanced gc bottom-most compaction"); @@ -2429,15 +2718,20 @@ impl Timeline { let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX { if real_gc_cutoff == Lsn::INVALID { // If the gc_cutoff is not generated yet, we should not compact anything. 
- tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); - return Ok(()); + tracing::warn!( + "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" + ); + return Ok(CompactionOutcome::Skipped); } real_gc_cutoff } else { compact_lsn_range.end }; if gc_cutoff > real_gc_cutoff { - warn!("provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff); + warn!( + "provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", + gc_cutoff, real_gc_cutoff + ); gc_cutoff = real_gc_cutoff; } gc_cutoff @@ -2461,8 +2755,11 @@ impl Timeline { .map(|desc| desc.get_lsn_range().end) .max() else { - info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff); - return Ok(()); + info!( + "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", + gc_cutoff + ); + return Ok(CompactionOutcome::Done); }; // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if @@ -2479,8 +2776,11 @@ impl Timeline { .map(|desc| desc.get_lsn_range().start) .min() else { - info!("no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end); - return Ok(()); + info!( + "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", + compact_lsn_range.end + ); + return Ok(CompactionOutcome::Done); }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key // layers to compact. 
@@ -2502,8 +2802,11 @@ impl Timeline { } } if selected_layers.is_empty() { - info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end); - return Ok(()); + info!( + "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", + gc_cutoff, compact_key_range.start, compact_key_range.end + ); + return Ok(CompactionOutcome::Done); } retain_lsns_below_horizon.sort(); GcCompactionJobDescription { @@ -2556,6 +2859,9 @@ impl Timeline { has_data_below, ); + let time_analyze = timer.elapsed(); + let timer = Instant::now(); + for layer in &job_desc.selected_layers { debug!("read layer: {}", layer.layer_desc().key()); } @@ -2584,7 +2890,10 @@ impl Timeline { .map(|layer| layer.layer_desc().layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { - bail!("gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err); + return Err(CompactionError::Other(anyhow!( + "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", + err + ))); } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc @@ -2599,11 +2908,37 @@ impl Timeline { let mut total_downloaded_size = 0; let mut total_layer_size = 0; for layer in &job_desc.selected_layers { - if layer.needs_download().await?.is_some() { + if layer + .needs_download() + .await + .context("failed to check if layer needs download") + .map_err(CompactionError::Other)? 
+ .is_some() + { total_downloaded_size += layer.layer_desc().file_size; } total_layer_size += layer.layer_desc().file_size; - let resident_layer = layer.download_and_keep_resident().await?; + if cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } + if !no_yield { + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!( + "preempt gc-compaction when downloading layers: too many L0 layers" + ); + return Ok(CompactionOutcome::YieldForL0); + } + } + let resident_layer = layer + .download_and_keep_resident(ctx) + .await + .context("failed to download and keep resident layer") + .map_err(CompactionError::Other)?; downloaded_layers.push(resident_layer); } info!( @@ -2614,19 +2949,36 @@ impl Timeline { ); for resident_layer in &downloaded_layers { if resident_layer.layer_desc().is_delta() { - let layer = resident_layer.get_as_delta(ctx).await?; + let layer = resident_layer + .get_as_delta(ctx) + .await + .context("failed to get delta layer") + .map_err(CompactionError::Other)?; delta_layers.push(layer); } else { - let layer = resident_layer.get_as_image(ctx).await?; + let layer = resident_layer + .get_as_image(ctx) + .await + .context("failed to get image layer") + .map_err(CompactionError::Other)?; image_layers.push(layer); } } - let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?; + let (dense_ks, sparse_ks) = self + .collect_gc_compaction_keyspace() + .await + .context("failed to collect gc compaction keyspace") + .map_err(CompactionError::Other)?; let mut merge_iter = FilterIterator::create( MergeIterator::create(&delta_layers, &image_layers, ctx), dense_ks, sparse_ks, - )?; + ) + .context("failed to create filter iterator") + .map_err(CompactionError::Other)?; + + let time_download_layer = timer.elapsed(); + let timer = Instant::now(); // Step 2: Produce images+deltas. 
let mut accumulated_values = Vec::new(); @@ -2645,7 +2997,9 @@ impl Timeline { self.get_compaction_target_size(), ctx, ) - .await?, + .await + .context("failed to create image layer writer") + .map_err(CompactionError::Other)?, ) } else { None @@ -2658,7 +3012,9 @@ impl Timeline { lowest_retain_lsn..end_lsn, self.get_compaction_target_size(), ) - .await?; + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?; #[derive(Default)] struct RewritingLayers { @@ -2698,9 +3054,33 @@ impl Timeline { // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. - while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? { + let mut keys_processed = 0; + + while let Some(((key, lsn, val), desc)) = merge_iter + .next_with_trace() + .await + .context("failed to get next key-value pair") + .map_err(CompactionError::Other)? + { if cancel.is_cancelled() { - return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error + return Err(CompactionError::ShuttingDown); + } + + if !no_yield { + keys_processed += 1; + if keys_processed % 1000 == 0 { + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!( + "preempt gc-compaction in the main loop: too many L0 layers" + ); + return Ok(CompactionOutcome::YieldForL0); + } + } } if self.shard_identity.is_key_disposable(&key) { // If this shard does not need to store this key, simply skip it. 
@@ -2731,7 +3111,9 @@ impl Timeline { desc.lsn_range.clone(), ctx, ) - .await?, + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?, ); } rewriter.before.as_mut().unwrap() @@ -2746,14 +3128,20 @@ impl Timeline { desc.lsn_range.clone(), ctx, ) - .await?, + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?, ); } rewriter.after.as_mut().unwrap() } else { unreachable!() }; - rewriter.put_value(key, lsn, val, ctx).await?; + rewriter + .put_value(key, lsn, val, ctx) + .await + .context("failed to put value") + .map_err(CompactionError::Other)?; continue; } match val { @@ -2776,9 +3164,13 @@ impl Timeline { &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn) - .await?, + .await + .context("failed to get ancestor image") + .map_err(CompactionError::Other)?, ) - .await?; + .await + .context("failed to generate key retention") + .map_err(CompactionError::Other)?; retention .pipe_to( *last_key, @@ -2787,7 +3179,9 @@ impl Timeline { &mut stat, ctx, ) - .await?; + .await + .context("failed to pipe to delta layer writer") + .map_err(CompactionError::Other)?; accumulated_values.clear(); *last_key = key; accumulated_values.push((key, lsn, val)); @@ -2795,7 +3189,11 @@ impl Timeline { } // TODO: move the below part to the loop body - let last_key = last_key.expect("no keys produced during compaction"); + let Some(last_key) = last_key else { + return Err(CompactionError::Other(anyhow!( + "no keys produced during compaction" + ))); + }; stat.on_unique_key_visited(); let retention = self @@ -2805,9 +3203,14 @@ impl Timeline { job_desc.gc_cutoff, &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?, + get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn) + .await + .context("failed to get ancestor 
image") + .map_err(CompactionError::Other)?, ) - .await?; + .await + .context("failed to generate key retention") + .map_err(CompactionError::Other)?; retention .pipe_to( last_key, @@ -2816,21 +3219,36 @@ impl Timeline { &mut stat, ctx, ) - .await?; + .await + .context("failed to pipe to delta layer writer") + .map_err(CompactionError::Other)?; // end: move the above part to the loop body + let time_main_loop = timer.elapsed(); + let timer = Instant::now(); + let mut rewrote_delta_layers = Vec::new(); for (key, writers) in delta_layer_rewriters { if let Some(delta_writer_before) = writers.before { let (desc, path) = delta_writer_before .finish(job_desc.compaction_key_range.start, ctx) - .await?; - let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)?; + let layer = Layer::finish_creating(self.conf, self, desc, &path) + .context("failed to finish creating delta layer") + .map_err(CompactionError::Other)?; rewrote_delta_layers.push(layer); } if let Some(delta_writer_after) = writers.after { - let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?; - let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + let (desc, path) = delta_writer_after + .finish(key.key_range.end, ctx) + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)?; + let layer = Layer::finish_creating(self.conf, self, desc, &path) + .context("failed to finish creating delta layer") + .map_err(CompactionError::Other)?; rewrote_delta_layers.push(layer); } } @@ -2845,7 +3263,9 @@ impl Timeline { let end_key = job_desc.compaction_key_range.end; writer .finish_with_discard_fn(self, ctx, end_key, discard) - .await? + .await + .context("failed to finish image layer writer") + .map_err(CompactionError::Other)? 
} else { drop(writer); Vec::new() @@ -2857,7 +3277,9 @@ impl Timeline { let produced_delta_layers = if !dry_run { delta_layer_writer .finish_with_discard_fn(self, ctx, discard) - .await? + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)? } else { drop(delta_layer_writer); Vec::new() @@ -2869,6 +3291,13 @@ impl Timeline { let mut keep_layers = HashSet::new(); let produced_delta_layers_len = produced_delta_layers.len(); let produced_image_layers_len = produced_image_layers.len(); + + let layer_selection_by_key = job_desc + .selected_layers + .iter() + .map(|l| (l.layer_desc().key(), l.layer_desc().clone())) + .collect::>(); + for action in produced_delta_layers { match action { BatchWriterResult::Produced(layer) => { @@ -2882,8 +3311,16 @@ impl Timeline { if cfg!(debug_assertions) { info!("discarded delta layer: {}", l); } + if let Some(layer_desc) = layer_selection_by_key.get(&l) { + stat.discard_delta_layer(layer_desc.file_size()); + } else { + tracing::warn!( + "discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?", + l + ); + stat.discard_delta_layer(0); + } keep_layers.insert(l); - stat.discard_delta_layer(); } } } @@ -2892,6 +3329,9 @@ impl Timeline { "produced rewritten delta layer: {}", layer.layer_desc().key() ); + // For now, we include rewritten delta layer size in the "produce_delta_layer". We could + // make it a separate statistics in the future. 
+ stat.produce_delta_layer(layer.layer_desc().file_size()); } compact_to.extend(rewrote_delta_layers); for action in produced_image_layers { @@ -2903,8 +3343,16 @@ impl Timeline { } BatchWriterResult::Discarded(l) => { debug!("discarded image layer: {}", l); + if let Some(layer_desc) = layer_selection_by_key.get(&l) { + stat.discard_image_layer(layer_desc.file_size()); + } else { + tracing::warn!( + "discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?", + l + ); + stat.discard_image_layer(0); + } keep_layers.insert(l); - stat.discard_image_layer(); } } } @@ -2937,7 +3385,9 @@ impl Timeline { &layer.layer_desc().key_range, &job_desc.compaction_key_range, ) { - bail!("violated constraint: image layer outside of compaction key range"); + return Err(CompactionError::Other(anyhow!( + "violated constraint: image layer outside of compaction key range" + ))); } if !fully_contains( &job_desc.compaction_key_range, @@ -2950,13 +3400,25 @@ impl Timeline { layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + let time_final_phase = timer.elapsed(); + + stat.time_final_phase_secs = time_final_phase.as_secs_f64(); + stat.time_main_loop_secs = time_main_loop.as_secs_f64(); + stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64(); + stat.time_download_layer_secs = time_download_layer.as_secs_f64(); + stat.time_analyze_secs = time_analyze.as_secs_f64(); + stat.time_total_secs = begin_timer.elapsed().as_secs_f64(); + stat.finalize(); + info!( "gc-compaction statistics: {}", - serde_json::to_string(&stat)? + serde_json::to_string(&stat) + .context("failed to serialize gc-compaction statistics") + .map_err(CompactionError::Other)? 
); if dry_run { - return Ok(()); + return Ok(CompactionOutcome::Done); } info!( @@ -2991,7 +3453,10 @@ impl Timeline { // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. if let Some(err) = check_valid_layermap(&final_layers) { - bail!("gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err); + return Err(CompactionError::Other(anyhow!( + "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", + err + ))); } // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only @@ -3043,7 +3508,9 @@ impl Timeline { // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should // be batched into `schedule_compaction_update`. let disk_consistent_lsn = self.disk_consistent_lsn.load(); - self.schedule_uploads(disk_consistent_lsn, None)?; + self.schedule_uploads(disk_consistent_lsn, None) + .context("failed to schedule uploads") + .map_err(CompactionError::Other)?; // If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead // of `compact_from`. 
let compact_from = { @@ -3056,7 +3523,8 @@ impl Timeline { if let Some(to) = compact_to_set.get(&layer.layer_desc().key()) { tracing::info!( "skipping delete {} because found same layer key at different generation {}", - layer, to + layer, + to ); } else { compact_from.push(layer.clone()); @@ -3069,7 +3537,7 @@ impl Timeline { drop(gc_lock); - Ok(()) + Ok(CompactionOutcome::Done) } } @@ -3175,6 +3643,7 @@ impl CompactionJobExecutor for TimelineAdaptor { async fn downcast_delta_layer( &self, layer: &OwnArc, + ctx: &RequestContext, ) -> anyhow::Result> { // this is a lot more complex than a simple downcast... if layer.is_delta() { @@ -3182,7 +3651,7 @@ impl CompactionJobExecutor for TimelineAdaptor { let guard = self.timeline.layers.read().await; guard.get_from_desc(layer) }; - let result = l.download_and_keep_resident().await?; + let result = l.download_and_keep_resident(ctx).await?; Ok(Some(ResidentDeltaLayer(result))) } else { diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 841b2fa1c7..740f590735 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -1,26 +1,27 @@ -use std::{ - ops::{Deref, DerefMut}, - sync::Arc, -}; +use std::ops::{Deref, DerefMut}; +use std::sync::Arc; use anyhow::Context; -use pageserver_api::{models::TimelineState, shard::TenantShardId}; +use pageserver_api::models::TimelineState; +use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use tokio::sync::OwnedMutexGuard; -use tracing::{error, info, info_span, instrument, Instrument}; -use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; +use tracing::{Instrument, error, info, info_span, instrument}; +use utils::id::TimelineId; +use utils::{crashsafe, fs_ext, pausable_failpoint}; -use crate::{ - config::PageServerConf, - task_mgr::{self, TaskKind}, - tenant::{ - metadata::TimelineMetadata, - remote_timeline_client::{PersistIndexPartWithDeletedFlagError, 
RemoteTimelineClient}, - CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, - TenantManifestError, Timeline, TimelineOrOffloaded, - }, - virtual_file::MaybeFatalIo, +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::task_mgr::{self, TaskKind}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::{ + PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, }; +use crate::tenant::{ + CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, TenantManifestError, + Timeline, TimelineOrOffloaded, +}; +use crate::virtual_file::MaybeFatalIo; /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. @@ -137,6 +138,11 @@ async fn remove_maybe_offloaded_timeline_from_tenant( timelines.remove(&timeline.timeline_id).expect( "timeline that we were deleting was concurrently removed from 'timelines' map", ); + tenant + .scheduled_compaction_tasks + .lock() + .unwrap() + .remove(&timeline.timeline_id); } TimelineOrOffloaded::Offloaded(timeline) => { let offloaded_timeline = timelines_offloaded @@ -286,10 +292,11 @@ impl DeleteTimelineFlow { timeline_id: TimelineId, local_metadata: &TimelineMetadata, remote_client: RemoteTimelineClient, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. // RemoteTimelineClient is the only functioning part. - let timeline = tenant + let (timeline, _timeline_ctx) = tenant .create_timeline_struct( timeline_id, local_metadata, @@ -300,6 +307,9 @@ impl DeleteTimelineFlow { // Thus we need to skip the validation here. 
CreateTimelineCause::Delete, crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here + None, // doesn't matter what we put here + None, // doesn't matter what we put here + ctx, ) .context("create_timeline_struct")?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index e0084d3eef..ac9d9a4579 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,25 +1,29 @@ -use std::{collections::HashSet, sync::Arc}; +use std::collections::HashSet; +use std::sync::Arc; -use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - tenant::{ - remote_timeline_client::index::GcBlockingReason::DetachAncestor, - storage_layer::{ - layer::local_layer_path, AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer, - }, - Tenant, - }, - virtual_file::{MaybeFatalIo, VirtualFile}, -}; use anyhow::Context; use http_utils::error::ApiError; -use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity}; +use pageserver_api::models::DetachBehavior; +use pageserver_api::models::detach_ancestor::AncestorDetached; +use pageserver_api::shard::ShardIdentity; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; +use utils::completion; +use utils::generation::Generation; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::sync::gate::GateError; + +use super::layer_manager::LayerManager; +use super::{FlushLayerError, Timeline}; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::Tenant; +use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor; +use crate::tenant::storage_layer::layer::local_layer_path; +use 
crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { @@ -29,6 +33,9 @@ pub(crate) enum Error { #[error("too many ancestors")] TooManyAncestors, + #[error("ancestor is not empty")] + AncestorNotEmpty, + #[error("shutting down, please retry later")] ShuttingDown, @@ -64,9 +71,10 @@ impl Error { where F: Fn(anyhow::Error) -> Error, { + use remote_storage::TimeoutOrCancel; + use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::upload_queue::NotInitialized; - use remote_storage::TimeoutOrCancel; if e.is::() || TimeoutOrCancel::caused_by_cancel(&e) @@ -85,7 +93,9 @@ impl From for ApiError { fn from(value: Error) -> Self { match value { Error::NoAncestor => ApiError::Conflict(value.to_string()), - Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{value}")), + Error::TooManyAncestors | Error::AncestorNotEmpty => { + ApiError::BadRequest(anyhow::anyhow!("{value}")) + } Error::ShuttingDown => ApiError::ShuttingDown, Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")), Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => { @@ -123,7 +133,7 @@ pub(crate) struct PreparedTimelineDetach { layers: Vec, } -/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments. +// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments. 
#[derive(Debug)] pub(crate) struct Options { pub(crate) rewrite_concurrency: std::num::NonZeroUsize, @@ -143,7 +153,8 @@ impl Default for Options { #[derive(Debug)] pub(crate) struct Attempt { pub(crate) timeline_id: TimelineId, - + pub(crate) ancestor_timeline_id: TimelineId, + pub(crate) ancestor_lsn: Lsn, _guard: completion::Completion, gate_entered: Option, } @@ -163,25 +174,30 @@ impl Attempt { pub(super) async fn prepare( detached: &Arc, tenant: &Tenant, + behavior: DetachBehavior, options: Options, ctx: &RequestContext, ) -> Result { use Error::*; - let Some((ancestor, ancestor_lsn)) = detached + let Some((mut ancestor, mut ancestor_lsn)) = detached .ancestor_timeline .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { + let ancestor_id; + let ancestor_lsn; let still_in_progress = { let accessor = detached.remote_client.initialized_upload_queue()?; // we are safe to inspect the latest uploaded, because we can only witness this after // restart is complete and ancestor is no more. let latest = accessor.latest_uploaded_index_part(); - if latest.lineage.detached_previous_ancestor().is_none() { + let Some((id, lsn)) = latest.lineage.detached_previous_ancestor() else { return Err(NoAncestor); }; + ancestor_id = id; + ancestor_lsn = lsn; latest .gc_blocking @@ -192,7 +208,8 @@ pub(super) async fn prepare( if still_in_progress { // gc is still blocked, we can still reparent and complete. // we are safe to reparent remaining, because they were locked in in the beginning. - let attempt = continue_with_blocked_gc(detached, tenant).await?; + let attempt = + continue_with_blocked_gc(detached, tenant, ancestor_id, ancestor_lsn).await?; // because the ancestor of detached is already set to none, we have published all // of the layers, so we are still "prepared." 
@@ -220,13 +237,34 @@ pub(super) async fn prepare( check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; - if ancestor.ancestor_timeline.is_some() { + if let DetachBehavior::MultiLevelAndNoReparent = behavior { + // If the ancestor has an ancestor, we might be able to fast-path detach it if the current ancestor does not have any data written/used by the detaching timeline. + while let Some(ancestor_of_ancestor) = ancestor.ancestor_timeline.clone() { + if ancestor_lsn != ancestor.ancestor_lsn { + // non-technical requirement; we could flatten still if ancestor LSN does not match but that needs + // us to copy and cut more layers. + return Err(AncestorNotEmpty); + } + // Use the ancestor of the ancestor as the new ancestor (only when the ancestor LSNs are the same) + ancestor_lsn = ancestor.ancestor_lsn; // Get the LSN first before resetting the `ancestor` variable + ancestor = ancestor_of_ancestor; + // TODO: do we still need to check if we don't want to reparent? 
+ check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; + } + } else if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose // not to, at least initially return Err(TooManyAncestors); } - let attempt = start_new_attempt(detached, tenant).await?; + tracing::info!( + "attempt to detach the timeline from the ancestor: {}@{}, behavior={:?}", + ancestor.timeline_id, + ancestor_lsn, + behavior + ); + + let attempt = start_new_attempt(detached, tenant, ancestor.timeline_id, ancestor_lsn).await?; utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking-pausable"); @@ -360,14 +398,25 @@ pub(super) async fn prepare( let mut tasks = tokio::task::JoinSet::new(); let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get())); + let cancel_eval = CancellationToken::new(); for adopted in rest_of_historic { let limiter = limiter.clone(); let timeline = detached.clone(); + let cancel_eval = cancel_eval.clone(); tasks.spawn( async move { - let _permit = limiter.acquire().await; + let _permit = tokio::select! { + permit = limiter.acquire() => { + permit + } + // Wait for the cancellation here instead of letting the entire task be cancelled. + // Cancellations are racy in that they might leave layers on disk. + _ = cancel_eval.cancelled() => { + Err(Error::ShuttingDown)? 
+ } + }; let (owned, did_hardlink) = remote_copy( &adopted, &timeline, @@ -383,7 +432,22 @@ pub(super) async fn prepare( ); } + fn delete_layers(timeline: &Timeline, layers: Vec) -> Result<(), Error> { + // We are deleting layers, so we must hold the gate + let _gate = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => Error::ShuttingDown, + })?; + { + layers.into_iter().for_each(|l: Layer| { + l.delete_on_drop(); + std::mem::drop(l); + }); + } + Ok(()) + } + let mut should_fsync = false; + let mut first_err = None; while let Some(res) = tasks.join_next().await { match res { Ok(Ok((owned, did_hardlink))) => { @@ -392,13 +456,24 @@ pub(super) async fn prepare( } new_layers.push(owned); } + + // Don't stop the evaluation on errors, so that we get the full set of hardlinked layers to delete. Ok(Err(failed)) => { - return Err(failed); + cancel_eval.cancel(); + first_err.get_or_insert(failed); + } + Err(je) => { + cancel_eval.cancel(); + first_err.get_or_insert(Error::Prepare(je.into())); } - Err(je) => return Err(Error::Prepare(je.into())), } } + if let Some(failed) = first_err { + delete_layers(detached, new_layers)?; + return Err(failed); + } + // fsync directory again if we hardlinked something if should_fsync { fsync_timeline_dir(detached, ctx).await; @@ -409,8 +484,13 @@ pub(super) async fn prepare( Ok(Progress::Prepared(attempt, prepared)) } -async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result { - let attempt = obtain_exclusive_attempt(detached, tenant)?; +async fn start_new_attempt( + detached: &Timeline, + tenant: &Tenant, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, +) -> Result { + let attempt = obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn)?; // insert the block in the index_part.json, if not already there. 
let _dont_care = tenant @@ -425,13 +505,23 @@ async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result Result { +async fn continue_with_blocked_gc( + detached: &Timeline, + tenant: &Tenant, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, +) -> Result { // FIXME: it would be nice to confirm that there is an in-memory version, since we've just // verified there is a persistent one? - obtain_exclusive_attempt(detached, tenant) + obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn) } -fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result { +fn obtain_exclusive_attempt( + detached: &Timeline, + tenant: &Tenant, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, +) -> Result { use Error::{OtherTimelineDetachOngoing, ShuttingDown}; // ensure we are the only active attempt for this tenant @@ -452,6 +542,8 @@ fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result Error::ShuttingDown, + })?; + // depending if Layer::keep_resident, do a hardlink let did_hardlink; let owned = if let Some(adopted_resident) = adopted.keep_resident().await { @@ -657,8 +754,32 @@ async fn remote_copy( &file_name, &metadata.generation, ); - std::fs::hard_link(adopted_path, &adoptee_path) - .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + + match std::fs::hard_link(adopted_path, &adoptee_path) { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + // In theory we should not get into this situation as we are doing cleanups of the layer file after errors. + // However, we don't do cleanups for errors past `prepare`, so there is the slight chance to get to this branch. 
+ + // Double check that the file is orphan (probably from an earlier attempt), then delete it + let key = file_name.clone().into(); + if adoptee.layers.read().await.contains_key(&key) { + // We are supposed to filter out such cases before coming to this function + return Err(Error::Prepare(anyhow::anyhow!( + "layer file {file_name} already present and inside layer map" + ))); + } + tracing::info!("Deleting orphan layer file to make way for hard linking"); + // Delete orphan layer file (the destination, never the ancestor's source file) and try again, to ensure this layer has a well understood source + std::fs::remove_file(&adoptee_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + std::fs::hard_link(adopted_path, &adoptee_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + } + Err(e) => { + return Err(Error::launder(e.into(), Error::Prepare)); + } + }; did_hardlink = true; Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard() } else { @@ -666,12 +787,21 @@ async fn remote_copy( Layer::for_evicted(conf, adoptee, file_name, metadata) }; - let layer = adoptee + let layer = match adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await - .map(move |()| owned) - .map_err(|e| Error::launder(e, Error::Prepare))?; + { + Ok(()) => owned, + Err(e) => { + { + // Clean up the layer so that on a retry we don't get errors that the file already exists + owned.delete_on_drop(); + std::mem::drop(owned); + } + return Err(Error::launder(e, Error::Prepare)); + } + }; Ok((layer, did_hardlink)) } @@ -716,6 +846,9 @@ pub(super) async fn detach_and_reparent( detached: &Arc, tenant: &Tenant, prepared: PreparedTimelineDetach, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, + behavior: DetachBehavior, _ctx: &RequestContext, ) -> Result { let PreparedTimelineDetach { layers } = prepared; @@ -743,7 +876,30 @@ pub(super) async fn detach_and_reparent( "cannot (detach? reparent)? 
complete if the operation is not still ongoing" ); - let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) { + let ancestor_to_detach = match detached.ancestor_timeline.as_ref() { + Some(mut ancestor) => { + while ancestor.timeline_id != ancestor_timeline_id { + match ancestor.ancestor_timeline.as_ref() { + Some(found) => { + if ancestor_lsn != ancestor.ancestor_lsn { + return Err(Error::DetachReparent(anyhow::anyhow!( + "cannot find the ancestor timeline to detach from: wrong ancestor lsn" + ))); + } + ancestor = found; + } + None => { + return Err(Error::DetachReparent(anyhow::anyhow!( + "cannot find the ancestor timeline to detach from" + ))); + } + } + } + Some(ancestor) + } + None => None, + }; + let ancestor = match (ancestor_to_detach, recorded_branchpoint) { (Some(ancestor), None) => { assert!( !layers.is_empty(), @@ -780,7 +936,7 @@ pub(super) async fn detach_and_reparent( // TODO: make sure there are no `?` before tenant_reset from after a questionmark from // here. panic!( - "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" + "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" ); } }; @@ -816,6 +972,11 @@ pub(super) async fn detach_and_reparent( Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false), }; + if let DetachBehavior::MultiLevelAndNoReparent = behavior { + // Do not reparent if the user requests to behave so. + return Ok(DetachingAndReparenting::Reparented(HashSet::new())); + } + let mut tasks = tokio::task::JoinSet::new(); // Returns a single permit semaphore which will be used to make one reparenting succeed, @@ -953,6 +1114,11 @@ pub(super) async fn complete( } /// Query against a locked `Tenant::timelines`. +/// +/// A timeline is reparentable if: +/// +/// - It is not the timeline being detached. +/// - It has the same ancestor as the timeline being detached. 
Note that the ancestor might not be the direct ancestor. fn reparentable_timelines<'a, I>( timelines: I, detached: &'a Arc, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 77c33349e0..397e8e8978 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -13,34 +13,27 @@ //! Items with parentheses are not (yet) touched by this task. //! //! See write-up on restart on-demand download spike: -use std::{ - collections::HashMap, - ops::ControlFlow, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::collections::HashMap; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, instrument, warn, Instrument}; - -use crate::{ - context::{DownloadBehavior, RequestContext}, - pgdatadir_mapping::CollectKeySpaceError, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, - tenant::{ - size::CalculateSyntheticSizeError, - storage_layer::LayerVisibilityHint, - tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit}, - timeline::EvictionError, - LogicalSizeCalculationCause, Tenant, - }, -}; - -use utils::{completion, sync::gate::GateGuard}; +use tracing::{Instrument, debug, info, info_span, instrument, warn}; +use utils::completion; +use utils::sync::gate::GateGuard; use super::Timeline; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::pgdatadir_mapping::CollectKeySpaceError; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; +use crate::tenant::size::CalculateSyntheticSizeError; +use crate::tenant::storage_layer::LayerVisibilityHint; +use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random}; +use crate::tenant::timeline::EvictionError; +use 
crate::tenant::{LogicalSizeCalculationCause, Tenant}; #[derive(Default)] pub struct EvictionTaskTimelineState { @@ -100,7 +93,8 @@ impl Timeline { } } - let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn); + let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn) + .with_scope_timeline(&self); loop { let policy = self.get_eviction_policy(); let cf = self diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 5b39daaaf8..809b350f38 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -1,5 +1,4 @@ -//! An efficient way to keep the timeline gate open without preventing -//! timeline shutdown for longer than a single call to a timeline method. +//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`. //! //! # Motivation //! @@ -19,27 +18,32 @@ //! we hold the Timeline gate open while we're invoking the method on the //! Timeline object. //! -//! However, we want to avoid the overhead of entering the gate for every -//! method invocation. -//! -//! Further, for shard routing, we want to avoid calling the tenant manager to -//! resolve the shard for every request. Instead, we want to cache the -//! routing result so we can bypass the tenant manager for all subsequent requests -//! that get routed to that shard. +//! We want to avoid the overhead of doing, for each incoming request, +//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing) +//! - cloning the `Arc` out of the tenant manager so we can +//! release the mgr rwlock before doing any request processing work +//! - re-entering the Timeline gate for each Timeline method invocation. //! //! Regardless of how we accomplish the above, it should not //! prevent the Timeline from shutting down promptly. //! +//! //! # Design //! //! ## Data Structures //! -//! There are three user-facing data structures: +//! 
There are two concepts expressed as associated types in the `Types` trait: +//! - `TenantManager`: the thing that performs the expensive work. It produces +//! a `Timeline` object, which is the other associated type. +//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup. +//! +//! There are four user-facing data structures exposed by this module: //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. -//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. +//! - `Handle`: a smart pointer that derefs to the Types::Timeline. //! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows -//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*. +//! trying to upgrade back to a `Handle`. If successful, a re-upgraded Handle will always +//! point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`. //! //! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`. //! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`. @@ -64,11 +68,14 @@ //! //! To dispatch a request, the page service connection calls `Cache::get`. //! -//! A cache miss means we consult the tenant manager for shard routing, -//! resulting in an `Arc`. We enter its gate _once_ and store it in the the -//! `Arc>>`. A weak ref is stored in the `Cache` +//! A cache miss means we call Types::TenantManager::resolve for shard routing, +//! cloning the `Arc` out of it, and entering the gate. The result of +//! resolve() is the object we want to cache, and return `Handle`s to for subsequent `Cache::get` calls. +//! +//! We wrap the object returned from resolve() in an `Arc` and store that inside the +//! `Arc>>`. A weak ref to the HandleInner is stored in the `Cache` //! 
and a strong ref in the `PerTimelineState`. -//! A strong ref is returned wrapped in a `Handle`. +//! Another strong ref is returned wrapped in a `Handle`. //! //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing //! and find the weak ref in the cache. @@ -78,51 +85,51 @@ //! While a request is batching, the `Handle` is downgraded to a `WeakHandle`. //! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle` //! and the request handler dispatches the request to the right `>::$request_method`. -//! It then drops the `Handle`, which drops the `Arc`. +//! It then drops the `Handle`, and thus the `Arc>` inside it. //! //! # Performance //! //! Remember from the introductory section: //! -//! > However, we want to avoid the overhead of entering the gate for every -//! > method invocation. +//! > We want to avoid the overhead of doing, for each incoming request, +//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing) +//! > - cloning the `Arc` out of the tenant manager so we can +//! > release the mgr rwlock before doing any request processing work +//! > - re-entering the Timeline gate for each Timeline method invocation. //! -//! Why do we want to avoid that? -//! Because the gate is a shared location in memory and entering it involves -//! bumping refcounts, which leads to cache contention if done frequently -//! from multiple cores in parallel. +//! All of these boil down to some state that is either globally shared among all shards +//! or state shared among all tasks that serve a particular timeline. +//! It is either protected by RwLock or manipulated via atomics. +//! Even atomics are costly when shared across multiple cores. +//! So, we want to avoid any permanent need for coordination between page_service tasks. //! -//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`. -//! 
That `Arc` is private to the `HandleInner` and hence to the connection. +//! The solution is to add indirection: we wrap the Types::Timeline object that is +//! returned by Types::TenantManager into an Arc that is private to the `HandleInner` +//! and hence to the single Cache / page_service connection. //! (Review the "Data Structures" section if that is unclear to you.) //! -//! A `WeakHandle` is a weak ref to the `HandleInner`. -//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and -//! further acquire an additional strong ref to the `Arc` inside it. -//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection. //! -//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc`. -//! Again, this is cheap because the `Arc` is private to the connection. +//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex`), +//! lock the mutex, take out a clone of the `Arc`, and drop the Mutex. +//! The Mutex is not contended because it is private to the connection. +//! And again, the `Arc` clone is cheap because that wrapper +//! Arc's refcounts are private to the connection. +//! +//! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection. //! -//! In addition to the GateGuard, we need to provide `Deref` impl. -//! For this, both `Handle` need infallible access to an `Arc`. -//! We could clone the `Arc` when upgrading a `WeakHandle`, but that would cause contention -//! on the shared memory location that trakcs the refcount of the `Arc`. -//! Instead, we wrap the `Arc` into another `Arc`. -//! so that we can clone it cheaply when upgrading a `WeakHandle`. //! //! # Shutdown //! //! The attentive reader may have noticed the following reference cycle around the `Arc`: //! //! ```text -//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline +//! 
Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline //! ``` //! //! Further, there is this cycle: //! //! ```text -//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline //! ``` //! //! The former cycle is a memory leak if not broken. @@ -135,9 +142,12 @@ //! - Timeline shutdown (=> `PerTimelineState::shutdown`) //! - Connection shutdown (=> dropping the `Cache`). //! -//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to -//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the -//! `Arc`. +//! Both transition the `HandleInner` from [`HandleInner::Open`] to +//! [`HandleInner::ShutDown`], which drops the only long-lived +//! `Arc`. Once the last short-lived Arc +//! is dropped, the `Types::Timeline` gets dropped and thereby +//! the `GateGuard` and the `Arc` that it stores, +//! thereby breaking both cycles. //! //! `PerTimelineState::shutdown` drops all the `HandleInners` it contains, //! thereby breaking the cycle. @@ -202,18 +212,13 @@ //! to the parent shard during a shard split. Eventually, the shard split task will //! shut down the parent => case (1). 
-use std::collections::hash_map; -use std::collections::HashMap; -use std::sync::Arc; -use std::sync::Mutex; -use std::sync::Weak; +use std::collections::{HashMap, hash_map}; +use std::sync::{Arc, Mutex, Weak}; use pageserver_api::shard::ShardIdentity; -use tracing::instrument; -use tracing::trace; +use tracing::{instrument, trace}; use utils::id::TimelineId; -use utils::shard::ShardIndex; -use utils::shard::ShardNumber; +use utils::shard::{ShardIndex, ShardNumber}; use crate::tenant::mgr::ShardSelector; @@ -221,7 +226,7 @@ use crate::tenant::mgr::ShardSelector; pub(crate) trait Types: Sized + std::fmt::Debug { type TenantManagerError: Sized + std::fmt::Debug; type TenantManager: TenantManager + Sized; - type Timeline: ArcTimeline + Sized; + type Timeline: Timeline + Sized; } /// Uniquely identifies a [`Cache`] instance over the lifetime of the process. @@ -266,20 +271,15 @@ pub(crate) struct ShardTimelineId { /// See module-level comment. pub(crate) struct Handle { - timeline: Arc, - #[allow(dead_code)] // the field exists to keep the gate open - gate_guard: Arc, inner: Arc>>, + open: Arc, } pub(crate) struct WeakHandle { inner: Weak>>, } + enum HandleInner { - KeepingTimelineGateOpen { - #[allow(dead_code)] - gate_guard: Arc, - timeline: Arc, - }, + Open(Arc), ShutDown, } @@ -312,8 +312,7 @@ pub(crate) trait TenantManager { } /// Abstract view of an [`Arc`], for testability. 
-pub(crate) trait ArcTimeline: Clone { - fn gate(&self) -> &utils::sync::gate::Gate; +pub(crate) trait Timeline { fn shard_timeline_id(&self) -> ShardTimelineId; fn get_shard_identity(&self) -> &ShardIdentity; fn per_timeline_state(&self) -> &PerTimelineState; @@ -323,7 +322,6 @@ pub(crate) trait ArcTimeline: Clone { #[derive(Debug)] pub(crate) enum GetError { TenantManager(T::TenantManagerError), - TimelineGateClosed, PerTimelineStateShutDown, } @@ -439,21 +437,9 @@ impl Cache { } trace!("creating new HandleInner"); - let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen { - gate_guard: Arc::new( - // this enter() is expensive in production code because - // it hits the global Arc::gate refcounts - match timeline.gate().enter() { - Ok(guard) => guard, - Err(_) => { - return Err(GetError::TimelineGateClosed); - } - }, - ), - // this clone is expensive in production code because - // it hits the global Arc::clone refcounts - timeline: Arc::new(timeline.clone()), - })); + let timeline = Arc::new(timeline); + let handle_inner_arc = + Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline)))); let handle_weak = WeakHandle { inner: Arc::downgrade(&handle_inner_arc), }; @@ -508,18 +494,10 @@ impl WeakHandle { }; let lock_guard = inner.lock().expect("poisoned"); match &*lock_guard { - HandleInner::KeepingTimelineGateOpen { - timeline, - gate_guard, - } => { - let gate_guard = Arc::clone(gate_guard); - let timeline = Arc::clone(timeline); + HandleInner::Open(open) => { + let open = Arc::clone(open); drop(lock_guard); - Ok(Handle { - timeline, - gate_guard, - inner, - }) + Ok(Handle { open, inner }) } HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown), } @@ -533,7 +511,7 @@ impl WeakHandle { impl std::ops::Deref for Handle { type Target = T::Timeline; fn deref(&self) -> &Self::Target { - &self.timeline + &self.open } } @@ -550,7 +528,7 @@ impl PerTimelineState { /// to the [`Types::Timeline`] that embeds this per-timeline state. 
/// Even if [`TenantManager::resolve`] would still resolve to it. /// - /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive. + /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`Types::Timeline`] alive. /// That's ok because they're short-lived. See module-level comment for details. #[instrument(level = "trace", skip_all)] pub(super) fn shutdown(&self) { @@ -616,7 +594,7 @@ impl Drop for Cache { impl HandleInner { fn shutdown(&mut self) -> Option> { match std::mem::replace(self, HandleInner::ShutDown) { - HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline), + HandleInner::Open(timeline) => Some(timeline), HandleInner::ShutDown => { // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown // may do it concurrently, but locking rules disallow holding per-timeline-state lock and @@ -631,13 +609,12 @@ impl HandleInner { mod tests { use std::sync::Weak; - use pageserver_api::{ - key::{rel_block_to_key, Key, DBDIR_KEY}, - models::ShardParameters, - reltag::RelTag, - shard::ShardStripeSize, - }; + use pageserver_api::key::{DBDIR_KEY, Key, rel_block_to_key}; + use pageserver_api::models::ShardParameters; + use pageserver_api::reltag::RelTag; + use pageserver_api::shard::ShardStripeSize; use utils::shard::ShardCount; + use utils::sync::gate::GateGuard; use super::*; @@ -648,7 +625,7 @@ mod tests { impl Types for TestTypes { type TenantManagerError = anyhow::Error; type TenantManager = StubManager; - type Timeline = Arc; + type Timeline = Entered; } struct StubManager { @@ -663,17 +640,19 @@ mod tests { myself: Weak, } + struct Entered { + timeline: Arc, + #[allow(dead_code)] // it's stored here to keep the gate open + gate_guard: Arc, + } + impl StubTimeline { fn getpage(&self) { // do nothing } } - impl ArcTimeline for Arc { - fn gate(&self) -> &utils::sync::gate::Gate { - &self.gate - } - + impl Timeline for Entered { fn shard_timeline_id(&self) 
-> ShardTimelineId { ShardTimelineId { shard_index: self.shard.shard_index(), @@ -695,20 +674,34 @@ mod tests { &self, timeline_id: TimelineId, shard_selector: ShardSelector, - ) -> anyhow::Result> { + ) -> anyhow::Result { for timeline in &self.shards { if timeline.id == timeline_id { + let enter_gate = || { + let gate_guard = timeline.gate.enter()?; + let gate_guard = Arc::new(gate_guard); + anyhow::Ok(gate_guard) + }; match &shard_selector { ShardSelector::Zero if timeline.shard.is_shard_zero() => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Zero => continue, ShardSelector::Page(key) if timeline.shard.is_key_local(key) => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Page(_) => continue, ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Known(_) => continue, } @@ -718,6 +711,13 @@ mod tests { } } + impl std::ops::Deref for Entered { + type Target = StubTimeline; + fn deref(&self) -> &Self::Target { + &self.timeline + } + } + #[tokio::test(start_paused = true)] async fn test_timeline_shutdown() { crate::tenant::harness::setup_logging(); @@ -1045,7 +1045,6 @@ mod tests { let key = DBDIR_KEY; // Simulate 10 connections that's opened, used, and closed - let mut used_handles = vec![]; for _ in 0..10 { let mut cache = Cache::::default(); let handle = { @@ -1057,7 +1056,6 @@ mod tests { handle }; handle.getpage(); - used_handles.push(Arc::downgrade(&handle.timeline)); } // No handles exist, thus gates are closed and don't require shutdown. 
diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs index 0ba9753e85..11df232a10 100644 --- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -3,12 +3,15 @@ //! Provides utilities to spawn and abort a background task where the downloads happen. //! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers. +use std::sync::{Arc, Mutex}; + use futures::StreamExt; use http_utils::error::ApiError; -use std::sync::{Arc, Mutex}; use tokio_util::sync::CancellationToken; use utils::sync::gate::Gate; +use crate::context::RequestContext; + use super::Timeline; // This status is not strictly necessary now, but gives us a nice place @@ -29,6 +32,8 @@ impl HeatmapLayersDownloader { fn new( timeline: Arc, concurrency: usize, + recurse: bool, + ctx: RequestContext, ) -> Result { let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?; @@ -56,12 +61,13 @@ impl HeatmapLayersDownloader { tracing::info!( resident_size=%timeline.resident_physical_size(), - heatmap_layers=%heatmap.layers.len(), + heatmap_layers=%heatmap.all_layers().count(), "Starting heatmap layers download" ); - let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map( + let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map( |layer| { + let ctx = ctx.attached_child(); let tl = timeline.clone(); let dl_guard = match downloads_guard.enter() { Ok(g) => g, @@ -74,7 +80,7 @@ impl HeatmapLayersDownloader { Some(async move { let _dl_guard = dl_guard; - let res = tl.download_layer(&layer.name).await; + let res = tl.download_layer(&layer.name, &ctx).await; if let Err(err) = res { if !err.is_cancelled() { tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}") @@ -93,6 +99,20 @@ impl HeatmapLayersDownloader { }, _ = cancel.cancelled() => { tracing::info!("Heatmap layers 
download cancelled"); + return; + } + } + + if recurse { + if let Some(ancestor) = timeline.ancestor_timeline() { + let ctx = ctx.attached_child(); + let res = + ancestor.start_heatmap_layers_download(concurrency, recurse, &ctx); + if let Err(err) = res { + tracing::info!( + "Failed to start heatmap layers download for ancestor: {err}" + ); + } } } } @@ -135,13 +155,20 @@ impl HeatmapLayersDownloader { } impl Timeline { - pub(crate) async fn start_heatmap_layers_download( + pub(crate) fn start_heatmap_layers_download( self: &Arc, concurrency: usize, + recurse: bool, + ctx: &RequestContext, ) -> Result<(), ApiError> { let mut locked = self.heatmap_layers_downloader.lock().unwrap(); if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) { - let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?; + let dl = HeatmapLayersDownloader::new( + self.clone(), + concurrency, + recurse, + ctx.attached_child(), + )?; *locked = Some(dl); Ok(()) } else { diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 6940179ae9..8b94a114d6 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -1,14 +1,14 @@ use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; -use tracing::{info, info_span, Instrument}; +use tracing::{Instrument, info, info_span}; use utils::lsn::Lsn; -use crate::{context::RequestContext, tenant::metadata::TimelineMetadata}; - use super::Timeline; +use crate::context::RequestContext; +use crate::tenant::metadata::TimelineMetadata; mod flow; mod importbucket_client; diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 4388072606..3ef82b3658 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ 
b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -28,52 +28,38 @@ //! An incomplete set of TODOs from the Hackathon: //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) +use std::collections::HashSet; +use std::ops::Range; use std::sync::Arc; use anyhow::{bail, ensure}; use bytes::Bytes; - use itertools::Itertools; -use pageserver_api::{ - key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY}, - reltag::RelTag, - shard::ShardIdentity, -}; -use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ}; -use tokio::task::JoinSet; -use tracing::{debug, info_span, instrument, Instrument}; - -use crate::{ - assert_u64_eq_usize::UsizeIsU64, - pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory}, -}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - pgdatadir_mapping::{DbDirectory, RelDirectory}, - task_mgr::TaskKind, - tenant::storage_layer::{ImageLayerWriter, Layer}, -}; - -use pageserver_api::key::Key; use pageserver_api::key::{ - slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY, - TWOPHASEDIR_KEY, + CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, + rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, + slru_segment_size_to_key, }; -use pageserver_api::keyspace::singleton_range; -use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range}; -use pageserver_api::reltag::SlruKind; +use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range}; +use pageserver_api::reltag::{RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use postgres_ffi::relfile_utils::parse_relfilename; +use postgres_ffi::{BLCKSZ, pg_constants}; +use remote_storage::RemotePath; +use tokio::task::JoinSet; +use tracing::{Instrument, debug, info_span, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; -use 
std::collections::HashSet; -use std::ops::Range; - -use super::{ - importbucket_client::{ControlFile, RemoteStorageWrapper}, - Timeline, +use super::Timeline; +use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; +use crate::assert_u64_eq_usize::UsizeIsU64; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::pgdatadir_mapping::{ + DbDirectory, RelDirectory, SlruSegmentDirectory, TwoPhaseDirectory, }; - -use remote_storage::RemotePath; +use crate::task_mgr::TaskKind; +use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index 68937e535d..a17a10d56b 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -1,4 +1,5 @@ -use std::{ops::Bound, sync::Arc}; +use std::ops::Bound; +use std::sync::Arc; use anyhow::Context; use bytes::Bytes; @@ -12,9 +13,9 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, instrument}; use utils::lsn::Lsn; -use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf}; - use super::{importbucket_format, index_part_format}; +use crate::assert_u64_eq_usize::U64IsUsize; +use crate::config::PageServerConf; pub async fn new( conf: &'static PageServerConf, diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs index 310d97a6a9..ea7a41b25f 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -1,7 +1,6 @@ -use serde::{Deserialize, Serialize}; - #[cfg(feature = "testing")] use camino::Utf8PathBuf; +use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum Root { 
diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs index c5210f9a30..7c7a4de2fc 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs @@ -1,13 +1,12 @@ //! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate. use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt}; +use reqwest::Method; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::error; -use crate::config::PageServerConf; -use reqwest::Method; - use super::importbucket_format::Spec; +use crate::config::PageServerConf; pub struct Client { base_url: String, diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 6634d07a0d..e952df0845 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -1,22 +1,16 @@ -use crate::{ - is_temporary, - tenant::{ - ephemeral_file::is_ephemeral_file, - remote_timeline_client::{ - self, - index::{IndexPart, LayerFileMetadata}, - }, - storage_layer::LayerName, - }, -}; +use std::collections::{HashMap, hash_map}; +use std::str::FromStr; + use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use std::{ - collections::{hash_map, HashMap}, - str::FromStr, -}; use utils::lsn::Lsn; +use crate::is_temporary; +use crate::tenant::ephemeral_file::is_ephemeral_file; +use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; +use crate::tenant::remote_timeline_client::{self}; +use crate::tenant::storage_layer::LayerName; + /// Identified files in the timeline directory. 
pub(super) enum Discovered { /// The only one we care about diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 60e36a5d4d..1b489028dc 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,28 +1,23 @@ -use anyhow::{bail, ensure, Context}; +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::{Context, bail, ensure}; use itertools::Itertools; use pageserver_api::shard::TenantShardId; -use std::{collections::HashMap, sync::Arc}; use tracing::trace; -use utils::{ - id::TimelineId, - lsn::{AtomicLsn, Lsn}, -}; +use utils::id::TimelineId; +use utils::lsn::{AtomicLsn, Lsn}; -use crate::{ - config::PageServerConf, - context::RequestContext, - metrics::TimelineMetrics, - tenant::{ - layer_map::{BatchedUpdates, LayerMap}, - storage_layer::{ - AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, - PersistentLayerKey, ResidentLayer, - }, - }, +use super::{ReadableLayer, TimelineWriterState}; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::metrics::TimelineMetrics; +use crate::tenant::layer_map::{BatchedUpdates, LayerMap}; +use crate::tenant::storage_layer::{ + AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, + PersistentLayerKey, ReadableLayerWeak, ResidentLayer, }; -use super::TimelineWriterState; - /// Provides semantic APIs to manipulate the layer map. 
pub(crate) enum LayerManager { /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate @@ -42,6 +37,21 @@ impl Default for LayerManager { } impl LayerManager { + pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer { + match weak { + ReadableLayerWeak::PersistentLayer(desc) => { + ReadableLayer::PersistentLayer(self.get_from_desc(&desc)) + } + ReadableLayerWeak::InMemoryLayer(desc) => { + let inmem = self + .layer_map() + .expect("no concurrent shutdown") + .in_memory_layer(&desc); + ReadableLayer::InMemoryLayer(inmem) + } + } + } + pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { // The assumption for the `expect()` is that all code maintains the following invariant: // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. @@ -214,9 +224,7 @@ impl OpenLayerManager { trace!( "creating in-memory layer at {}/{} for record at {}", - timeline_id, - start_lsn, - lsn + timeline_id, start_lsn, lsn ); let new_layer = @@ -477,6 +485,25 @@ impl OpenLayerManager { mapping.remove(layer); layer.delete_on_drop(); } + + #[cfg(test)] + pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc) { + use pageserver_api::models::InMemoryLayerInfo; + + match layer.info() { + InMemoryLayerInfo::Open { .. } => { + assert!(self.layer_map.open_layer.is_none()); + self.layer_map.open_layer = Some(layer); + } + InMemoryLayerInfo::Frozen { lsn_start, .. 
} => { + if let Some(last) = self.layer_map.frozen_layers.back() { + assert!(last.get_lsn_range().end <= lsn_start); + } + + self.layer_map.frozen_layers.push_back(layer); + } + } + } } pub(crate) struct LayerFileManager(HashMap); diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index f4a4eea54a..397037ca9f 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -1,11 +1,10 @@ -use anyhow::Context; +use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; +use anyhow::Context; use once_cell::sync::OnceCell; use tokio_util::sync::CancellationToken; use utils::lsn::Lsn; -use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; - /// Internal structure to hold all data needed for logical size calculation. /// /// Calculation consists of two stages: diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 93e5a1100d..43ffaa6aab 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -2,11 +2,11 @@ use std::sync::Arc; use pageserver_api::models::{TenantState, TimelineState}; -use super::delete::{delete_local_timeline_directory, DeletionGuard}; use super::Timeline; +use super::delete::{DeletionGuard, delete_local_timeline_directory}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; -use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; +use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard}; use crate::tenant::{ DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, }; @@ -143,5 +143,12 @@ fn remove_timeline_from_tenant( .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 
'timelines' map"); + // Clear the compaction queue for this timeline + tenant + .scheduled_compaction_tasks + .lock() + .unwrap() + .remove(&timeline.timeline_id); + Arc::strong_count(&timeline) } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 3074463384..f66c0ffa0f 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -1,18 +1,21 @@ -use std::{collections::hash_map::Entry, fs, future::Future, sync::Arc}; +use std::collections::hash_map::Entry; +use std::fs; +use std::future::Future; +use std::sync::Arc; use anyhow::Context; use camino::Utf8PathBuf; use tracing::{error, info, info_span}; -use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; - -use crate::{ - context::RequestContext, - import_datadir, - span::debug_assert_current_span_has_tenant_and_timeline_id, - tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, -}; +use utils::fs_ext; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; use super::Timeline; +use crate::context::RequestContext; +use crate::import_datadir; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}; /// A timeline with some of its files on disk, being initialized. /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or @@ -128,7 +131,7 @@ impl<'t> UninitializedTimeline<'t> { // We do not call Self::abort here. Because we don't cleanly shut down our Timeline, [`Self::drop`] should // skip trying to delete the timeline directory too. 
anyhow::bail!( - "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" + "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" ) } Entry::Vacant(v) => { diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 67429bff98..4f80073cc3 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,17 +23,11 @@ mod connection_manager; mod walreceiver_connection; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::timeline::walreceiver::connection_manager::{ - connection_manager_loop_step, ConnectionManagerState, -}; - use std::future::Future; use std::num::NonZeroU64; use std::sync::Arc; use std::time::Duration; + use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; @@ -41,8 +35,13 @@ use tracing::*; use utils::postgres_client::PostgresClientProtocol; use self::connection_manager::ConnectionManagerStatus; - use super::Timeline; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::timeline::walreceiver::connection_manager::{ + ConnectionManagerState, connection_manager_loop_step, +}; #[derive(Clone)] pub struct WalReceiverConf { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 1955345315..df2663f6bb 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -9,45 +9,42 @@ //! then a (re)connection happens, if necessary. //! 
Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel. -use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; +use std::collections::HashMap; +use std::num::NonZeroU64; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::time::Duration; -use super::{TaskStateUpdate, WalReceiverConf}; +use anyhow::Context; +use chrono::{NaiveDateTime, Utc}; +use pageserver_api::models::TimelineState; +use postgres_connection::PgConnectionConfig; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, + TypedMessage, +}; +use storage_broker::{BrokerClientChannel, Code, Streaming}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::backoff::{ + DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff, +}; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; +use utils::postgres_client::{ + ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, +}; + +use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError}; +use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{ WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; use crate::task_mgr::TaskKind; -use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; -use anyhow::Context; -use chrono::{NaiveDateTime, Utc}; -use pageserver_api::models::TimelineState; - -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use storage_broker::proto::{ - FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, 
SafekeeperDiscoveryResponse, - SubscribeByFilterRequest, TypeSubscription, TypedMessage, -}; -use storage_broker::{BrokerClientChannel, Code, Streaming}; -use tokio_util::sync::CancellationToken; -use tracing::*; - -use postgres_connection::PgConnectionConfig; -use utils::backoff::{ - exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use utils::postgres_client::{ - wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol, -}; -use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, -}; - -use super::{ - walreceiver_connection::WalConnectionStatus, walreceiver_connection::WalReceiverError, - TaskEvent, TaskHandle, -}; +use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id}; pub(crate) struct Cancelled; @@ -349,7 +346,9 @@ async fn subscribe_for_timeline_updates( Err(e) => { // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error. 
- info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); + info!( + "Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}" + ); continue; } } @@ -512,11 +511,11 @@ impl ConnectionManagerState { fn spawn( &self, task: impl FnOnce( - tokio::sync::watch::Sender>, - CancellationToken, - ) -> Fut - + Send - + 'static, + tokio::sync::watch::Sender>, + CancellationToken, + ) -> Fut + + Send + + 'static, ) -> TaskHandle where Fut: std::future::Future> + Send, @@ -880,8 +879,7 @@ impl ConnectionManagerState { discovered_new_wal = if candidate_commit_lsn > current_commit_lsn { trace!( "New candidate has commit_lsn {}, higher than current_commit_lsn {}", - candidate_commit_lsn, - current_commit_lsn + candidate_commit_lsn, current_commit_lsn ); Some(NewCommittedWAL { lsn: candidate_commit_lsn, @@ -1048,7 +1046,9 @@ impl ConnectionManagerState { if !node_ids_to_remove.is_empty() { for node_id in node_ids_to_remove { - info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); + info!( + "Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections" + ); self.wal_connection_retries.remove(&node_id); WALRECEIVER_CANDIDATES_REMOVED.inc(); } @@ -1119,11 +1119,12 @@ impl ReconnectReason { #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL; use url::Host; + use super::*; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + fn dummy_broker_sk_timeline( commit_lsn: u64, safekeeper_connstr: &str, diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index bb34a181da..f41a9cfe82 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ 
b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -1,46 +1,48 @@ //! Actual Postgres connection handler to stream WAL to the server. -use std::{ - error::Error, - pin::pin, - str::FromStr, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::error::Error; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; -use postgres_ffi::WAL_SEGMENT_SIZE; -use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError}; -use postgres_protocol::message::backend::ReplicationMessage; -use postgres_types::PgLsn; -use tokio::{select, sync::watch, time}; -use tokio_postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; -use tokio_postgres::{replication::ReplicationStream, Client}; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, trace, warn, Instrument}; -use wal_decoder::{ - models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}, - wire_format::FromWireFormat, -}; - -use super::TaskStateUpdate; -use crate::{ - context::RequestContext, - metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - pgdatadir_mapping::DatadirModification, - task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, - tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, - walingest::WalIngest, -}; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; -use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; -use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; +use postgres_ffi::WAL_SEGMENT_SIZE; +use postgres_ffi::v14::xlog_utils::normalize_lsn; +use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; 
+use postgres_protocol::message::backend::ReplicationMessage; +use postgres_types::PgLsn; +use tokio::sync::watch; +use tokio::{select, time}; +use tokio_postgres::error::SqlState; +use tokio_postgres::replication::ReplicationStream; +use tokio_postgres::{Client, SimpleQueryMessage, SimpleQueryRow}; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, debug, error, info, trace, warn}; +use utils::critical; +use utils::id::NodeId; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; +use utils::sync::gate::GateError; +use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}; +use wal_decoder::wire_format::FromWireFormat; + +use super::TaskStateUpdate; +use crate::context::RequestContext; +use crate::metrics::{LIVE_CONNECTIONS, WAL_INGEST, WALRECEIVER_STARTED_CONNECTIONS}; +use crate::pgdatadir_mapping::DatadirModification; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::{ + Timeline, WalReceiverInfo, debug_assert_current_span_has_tenant_and_timeline_id, +}; +use crate::walingest::WalIngest; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -149,7 +151,9 @@ pub(super) async fn handle_walreceiver_connection( // Timing out to connect to a safekeeper node could happen long time, due to // many reasons that pageserver cannot control. // Do not produce an error, but make it visible, that timeouts happen by logging the `event. 
- info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open"); + info!( + "Timed out while waiting {connect_timeout:?} for walreceiver connection to open" + ); return Ok(()); } } @@ -166,7 +170,9 @@ pub(super) async fn handle_walreceiver_connection( node: safekeeper_node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { - warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); + warn!( + "Wal connection event listener dropped right after connection init, aborting the connection: {e}" + ); return Ok(()); } @@ -227,7 +233,9 @@ pub(super) async fn handle_walreceiver_connection( connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { - warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); + warn!( + "Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}" + ); return Ok(()); } @@ -254,7 +262,9 @@ pub(super) async fn handle_walreceiver_connection( // to the safekeepers. startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE); - info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..."); + info!( + "last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..." 
+ ); let query = format!("START_REPLICATION PHYSICAL {startpoint}"); @@ -626,7 +636,9 @@ pub(super) async fn handle_walreceiver_connection( let timestamp = keepalive.timestamp(); let reply_requested = keepalive.reply() != 0; - trace!("received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})"); + trace!( + "received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})" + ); if reply_requested { Some(last_rec_lsn) diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index d302205ffe..d5dc9666ce 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,21 +1,18 @@ use std::collections::{HashMap, HashSet, VecDeque}; use std::fmt::Debug; -use std::sync::atomic::AtomicU32; use std::sync::Arc; - -use super::remote_timeline_client::is_same_remote_layer_path; -use super::storage_layer::AsLayerDesc as _; -use super::storage_layer::LayerName; -use super::storage_layer::ResidentLayer; -use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::remote_timeline_client::index::IndexPart; -use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use utils::generation::Generation; -use utils::lsn::{AtomicLsn, Lsn}; +use std::sync::atomic::AtomicU32; use chrono::NaiveDateTime; use once_cell::sync::Lazy; use tracing::info; +use utils::generation::Generation; +use utils::lsn::{AtomicLsn, Lsn}; + +use super::remote_timeline_client::is_same_remote_layer_path; +use super::storage_layer::{AsLayerDesc as _, LayerName, ResidentLayer}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; /// Kill switch for upload queue reordering in case it causes problems. /// TODO: remove this once we have confidence in it. @@ -225,7 +222,7 @@ impl UploadQueueInitialized { // most one of them can be an index upload (enforced by can_bypass). 
.scan(&self.clean.0, |next_active_index, op| { let active_index = *next_active_index; - if let UploadOp::UploadMetadata { ref uploaded } = op { + if let UploadOp::UploadMetadata { uploaded } = op { *next_active_index = uploaded; // stash index for next operation after this } Some((op, active_index)) @@ -562,16 +559,18 @@ impl UploadOp { #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; - use crate::tenant::storage_layer::layer::local_layer_path; - use crate::tenant::storage_layer::Layer; - use crate::tenant::Timeline; - use crate::DEFAULT_PG_VERSION; - use itertools::Itertools as _; use std::str::FromStr as _; + + use itertools::Itertools as _; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::Timeline; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::Layer; + use crate::tenant::storage_layer::layer::local_layer_path; + /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq. 
#[track_caller] fn assert_same_op(a: &UploadOp, b: &UploadOp) { @@ -690,10 +689,22 @@ mod tests { let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let (barrier, _) = tokio::sync::watch::channel(()); // Enqueue non-conflicting upload, delete, and index before and after a barrier. @@ -757,10 +768,22 @@ mod tests { let tli = make_timeline(); // Enqueue a bunch of deletes, some with conflicting names. 
- let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::Delete(Delete { @@ -802,9 +825,21 @@ mod tests { let tli = make_timeline(); // Enqueue three versions of the same layer, with different file sizes. 
- let layer0a = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 1); - let layer0b = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 2); - let layer0c = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 3); + let layer0a = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 1, + ); + let layer0b = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 2, + ); + let layer0c = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 3, + ); let ops = [ UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None), @@ -836,8 +871,14 @@ mod tests { // Enqueue two layer uploads, with a delete of both layers in between them. These should be // scheduled one at a time, since deletes can't bypass uploads and vice versa. 
- let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -878,10 +919,22 @@ mod tests { // // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue // and run immediately. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + 
"300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -916,9 +969,18 @@ mod tests { let tli = make_timeline(); // Enqueue three different layer uploads. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -981,11 +1043,20 @@ mod tests { // Enqueue three uploads of the current empty index. 
let index = Box::new(queue.clean.0.clone()); - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index0 = index_with(&index, &layer0); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index1 = index_with(&index0, &layer1); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index2 = index_with(&index1, &layer2); let ops = [ @@ -1045,7 +1116,10 @@ mod tests { let tli = make_timeline(); // Create a layer to upload. - let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index_upload = index_with(&queue.clean.0, &layer); // Remove the layer reference in a new index, then delete the layer. @@ -1090,7 +1164,10 @@ mod tests { let tli = make_timeline(); // Create a layer to upload. 
- let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); // Upload the layer. Then dereference the layer, and upload/reference it again. let index_upload = index_with(&queue.clean.0, &layer); @@ -1138,10 +1215,22 @@ mod tests { let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); // Enqueue non-conflicting upload, delete, and index before and after a shutdown. let ops = [ @@ -1197,10 +1286,22 @@ mod tests { let tli = make_timeline(); // Enqueue a bunch of uploads. 
- let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 47fb4a276b..166917d674 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -27,8 +27,7 @@ use utils::vec_map::VecMap; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, VirtualFile}; +use crate::virtual_file::{self, IoBufferMut, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. 
#[derive(Copy, Clone, Debug)] @@ -139,7 +138,10 @@ impl VectoredBlob { bits => { let error = std::io::Error::new( std::io::ErrorKind::InvalidData, - format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end), + format!( + "Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", + self.meta.key, self.meta.lsn, self.start, self.end + ), ); Err(error) } @@ -677,13 +679,12 @@ impl StreamingVectoredReadPlanner { mod tests { use anyhow::Error; + use super::super::blob_io::tests::{random_array, write_maybe_compressed}; + use super::*; use crate::context::DownloadBehavior; use crate::page_cache::PAGE_SZ; use crate::task_mgr::TaskKind; - use super::super::blob_io::tests::{random_array, write_maybe_compressed}; - use super::*; - fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; assert_eq!(read.start % ALIGN, 0); @@ -960,7 +961,8 @@ mod tests { } async fn round_trip_test_compressed(blobs: &[Vec], compression: bool) -> Result<(), Error> { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let (_temp_dir, pathbuf, offsets) = write_maybe_compressed::(blobs, compression, &ctx).await?; diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 093a944777..29d1a31aaf 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -3,13 +3,15 @@ //! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the //! truth. 
-use anyhow::Context; use std::path::Path; + +use anyhow::Context; +use pageserver_api::models::PageserverUtilization; use utils::serde_percent::Percent; -use pageserver_api::models::PageserverUtilization; - -use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager}; +use crate::config::PageServerConf; +use crate::metrics::NODE_UTILIZATION_SCORE; +use crate::tenant::mgr::TenantManager; pub(crate) fn regenerate( conf: &PageServerConf, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index c966ad813f..cd3d897423 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -11,11 +11,13 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! -use crate::context::RequestContext; -use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; +use std::fs::File; +use std::io::{Error, ErrorKind, Seek, SeekFrom}; +use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; +#[cfg(target_os = "linux")] +use std::os::unix::fs::OpenOptionsExt; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; -use crate::page_cache::{PageWriteGuard, PAGE_SZ}; -use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; @@ -23,31 +25,29 @@ use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlig use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; -use pageserver_api::shard::TenantShardId; -use std::fs::File; -use std::io::{Error, ErrorKind, Seek, SeekFrom}; -#[cfg(target_os = "linux")] -use std::os::unix::fs::OpenOptionsExt; -use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; - -use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, 
OwnedFd, RawFd}; -use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; +pub use pageserver_api::models::virtual_file as api; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; +use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; -pub use pageserver_api::models::virtual_file as api; +use crate::assert_u64_eq_usize::UsizeIsU64; +use crate::context::RequestContext; +use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation}; +use crate::page_cache::{PAGE_SZ, PageWriteGuard}; pub(crate) mod io_engine; -pub use io_engine::feature_test as io_engine_feature_test; -pub use io_engine::io_engine_for_bench; -pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; +pub use io_engine::{ + FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test, + io_engine_for_bench, +}; mod metadata; mod open_options; -use self::owned_buffers_io::write::OwnedAsyncWriter; pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; +use self::owned_buffers_io::write::OwnedAsyncWriter; + pub(crate) mod owned_buffers_io { //! Abstractions for IO with owned buffers. //! 
@@ -120,7 +120,7 @@ impl VirtualFile { pub async fn open_with_options>( path: P, open_options: &OpenOptions, - ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + ctx: &RequestContext, ) -> Result { let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; Ok(VirtualFile { @@ -132,7 +132,7 @@ impl VirtualFile { pub async fn open_with_options_v2>( path: P, open_options: &OpenOptions, - ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + ctx: &RequestContext, ) -> Result { let file = match get_io_mode() { IoMode::Buffered => { @@ -303,13 +303,6 @@ pub struct VirtualFileInner { /// storing it here. pub path: Utf8PathBuf, open_options: OpenOptions, - - // These are strings becase we only use them for metrics, and those expect strings. - // It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into - // strings. 
- tenant_id: String, - shard_id: String, - timeline_id: String, } #[derive(Debug, PartialEq, Clone, Copy)] @@ -591,36 +584,16 @@ impl VirtualFileInner { pub async fn open_with_options>( path: P, open_options: &OpenOptions, - _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + _ctx: &RequestContext, ) -> Result { - let path_ref = path.as_ref(); - let path_str = path_ref.to_string(); - let parts = path_str.split('/').collect::>(); - let (tenant_id, shard_id, timeline_id) = - if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME { - let tenant_shard_part = parts[parts.len() - 4]; - let (tenant_id, shard_id) = match tenant_shard_part.parse::() { - Ok(tenant_shard_id) => ( - tenant_shard_id.tenant_id.to_string(), - format!("{}", tenant_shard_id.shard_slug()), - ), - Err(_) => { - // Malformed path: this ID is just for observability, so tolerate it - // and pass through - (tenant_shard_part.to_string(), "*".to_string()) - } - }; - (tenant_id, shard_id, parts[parts.len() - 2].to_string()) - } else { - ("*".to_string(), "*".to_string(), "*".to_string()) - }; + let path = path.as_ref(); let (handle, mut slot_guard) = get_open_files().find_victim_slot().await; // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case // where our caller doesn't get to use the returned VirtualFile before its // slot gets re-used by someone else. let file = observe_duration!(StorageIoOperation::Open, { - open_options.open(path_ref.as_std_path()).await? + open_options.open(path.as_std_path()).await? }); // Strip all options other than read and write. 
@@ -636,11 +609,8 @@ impl VirtualFileInner { let vfile = VirtualFileInner { handle: RwLock::new(handle), pos: 0, - path: path_ref.to_path_buf(), + path: path.to_owned(), open_options: reopen_options, - tenant_id, - shard_id, - timeline_id, }; // TODO: Under pressure, it's likely the slot will get re-used and @@ -943,7 +913,7 @@ impl VirtualFileInner { &self, buf: tokio_epoll_uring::Slice, offset: u64, - _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ctx: &RequestContext, ) -> (tokio_epoll_uring::Slice, Result) where Buf: tokio_epoll_uring::IoBufMut + Send, @@ -961,14 +931,7 @@ impl VirtualFileInner { let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await; let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at"); if let Ok(size) = res { - STORAGE_IO_SIZE - .with_label_values(&[ - "read", - &self.tenant_id, - &self.shard_id, - &self.timeline_id, - ]) - .add(size as i64); + ctx.io_size_metrics().read.add(size.into_u64()); } (buf, res) }) @@ -979,9 +942,9 @@ impl VirtualFileInner { &self, buf: FullSlice, offset: u64, - _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ctx: &RequestContext, ) -> (FullSlice, Result) { - let (slice, result) = self.write_at_inner(buf, offset, _ctx).await; + let (slice, result) = self.write_at_inner(buf, offset, ctx).await; let result = result.maybe_fatal_err("write_at"); (slice, result) } @@ -990,7 +953,7 @@ impl VirtualFileInner { &self, buf: FullSlice, offset: u64, - _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ctx: &RequestContext, ) -> (FullSlice, Result) { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -1000,14 +963,7 @@ impl VirtualFileInner { let ((_file_guard, buf), result) = io_engine::get().write_at(file_guard, offset, buf).await; if let Ok(size) = result { - 
STORAGE_IO_SIZE - .with_label_values(&[ - "write", - &self.tenant_id, - &self.shard_id, - &self.timeline_id, - ]) - .add(size as i64); + ctx.io_size_metrics().write.add(size.into_u64()); } (buf, result) }) @@ -1078,7 +1034,8 @@ where #[cfg(test)] mod test_read_exact_at_impl { - use std::{collections::VecDeque, sync::Arc}; + use std::collections::VecDeque; + use std::sync::Arc; use tokio_epoll_uring::{BoundedBuf, BoundedBufMut}; @@ -1342,9 +1299,8 @@ impl OwnedAsyncWriter for VirtualFile { buf: FullSlice, offset: u64, ctx: &RequestContext, - ) -> std::io::Result> { - let (buf, res) = VirtualFile::write_all_at(self, buf, offset, ctx).await; - res.map(|_| buf) + ) -> (FullSlice, std::io::Result<()>) { + VirtualFile::write_all_at(self, buf, offset, ctx).await } } @@ -1424,19 +1380,19 @@ static SYNC_MODE: AtomicU8 = AtomicU8::new(SyncMode::Sync as u8); #[cfg(test)] mod tests { - use crate::context::DownloadBehavior; - use crate::task_mgr::TaskKind; - - use super::*; - use owned_buffers_io::io_buf_ext::IoBufExt; - use owned_buffers_io::slice::SliceMutExt; - use rand::seq::SliceRandom; - use rand::thread_rng; - use rand::Rng; use std::io::Write; use std::os::unix::fs::FileExt; use std::sync::Arc; + use owned_buffers_io::io_buf_ext::IoBufExt; + use owned_buffers_io::slice::SliceMutExt; + use rand::seq::SliceRandom; + use rand::{Rng, thread_rng}; + + use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + enum MaybeVirtualFile { VirtualFile(VirtualFile), File(File), @@ -1591,7 +1547,8 @@ mod tests { where A: Adapter, { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; @@ -1718,7 +1675,8 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; - let ctx = 
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; @@ -1777,7 +1735,8 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_basic() { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1805,7 +1764,8 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_preexisting_tmp() { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); std::fs::create_dir_all(&testdir).unwrap(); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index ccde90ee1a..758dd6e377 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -80,7 +80,9 @@ pub(crate) fn get() -> IoEngine { Ok(v) => match v.parse::() { Ok(engine_kind) => engine_kind, Err(e) => { - panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") + panic!( + "invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}" + ) } }, Err(std::env::VarError::NotPresent) => { @@ -107,15 +109,12 @@ pub(crate) fn get() -> IoEngine { } } -use std::{ - os::unix::prelude::FileExt, - sync::atomic::{AtomicU8, Ordering}, -}; +use std::os::unix::prelude::FileExt; +use std::sync::atomic::{AtomicU8, Ordering}; -use super::{ - owned_buffers_io::{io_buf_ext::FullSlice, slice::SliceMutExt}, - 
FileGuard, Metadata, -}; +use super::owned_buffers_io::io_buf_ext::FullSlice; +use super::owned_buffers_io::slice::SliceMutExt; +use super::{FileGuard, Metadata}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs index c67215492f..ad17405b64 100644 --- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -5,18 +5,16 @@ //! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. //! See for more details. -use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; - -use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; -use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use tokio_epoll_uring::{System, SystemHandle}; - -use crate::virtual_file::on_fatal_io_error; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, error, info, info_span, warn}; +use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE}; +use crate::virtual_file::on_fatal_io_error; #[derive(Clone)] struct ThreadLocalState(Arc); @@ -194,7 +192,7 @@ impl std::ops::Deref for Handle { fn deref(&self) -> &Self::Target { self.0 - .0 + .0 .cell .get() .expect("must be already initialized when using this") diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 7f951270d1..e188b8649b 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,7 +1,9 @@ //! 
Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; +use std::os::fd::OwnedFd; +use std::path::Path; + use super::io_engine::IoEngine; -use std::{os::fd::OwnedFd, path::Path}; #[derive(Debug, Clone)] pub enum OpenOptions { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index a5c26cd746..090d2ece85 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -1,9 +1,9 @@ -use std::{ - ops::{Deref, Range, RangeBounds}, - sync::Arc, -}; +use std::ops::{Deref, Range, RangeBounds}; +use std::sync::Arc; -use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign}; +use super::alignment::Alignment; +use super::raw::RawAlignedBuffer; +use super::{AlignedBufferMut, ConstAlign}; /// An shared, immutable aligned buffer type. #[derive(Clone, Debug)] diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index d2f5e206bb..df5c911e50 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -1,13 +1,9 @@ -use std::{ - mem::MaybeUninit, - ops::{Deref, DerefMut}, -}; +use std::mem::MaybeUninit; +use std::ops::{Deref, DerefMut}; -use super::{ - alignment::{Alignment, ConstAlign}, - buffer::AlignedBuffer, - raw::RawAlignedBuffer, -}; +use super::alignment::{Alignment, ConstAlign}; +use super::buffer::AlignedBuffer; +use super::raw::RawAlignedBuffer; /// A mutable aligned buffer type. #[derive(Debug)] @@ -75,7 +71,8 @@ impl AlignedBufferMut { /// Force the length of the buffer to `new_len`. 
#[inline] unsafe fn set_len(&mut self, new_len: usize) { - self.raw.set_len(new_len) + // SAFETY: the caller is unsafe + unsafe { self.raw.set_len(new_len) } } #[inline] @@ -222,8 +219,10 @@ unsafe impl bytes::BufMut for AlignedBufferMut { panic_advance(cnt, remaining); } - // Addition will not overflow since the sum is at most the capacity. - self.set_len(len + cnt); + // SAFETY: Addition will not overflow since the sum is at most the capacity. + unsafe { + self.set_len(len + cnt); + } } #[inline] @@ -275,7 +274,10 @@ unsafe impl tokio_epoll_uring::IoBufMut for AlignedBufferMut { unsafe fn set_init(&mut self, init_len: usize) { if self.len() < init_len { - self.set_len(init_len); + // SAFETY: caller function is unsafe + unsafe { + self.set_len(init_len); + } } } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs index 6c26dec0db..97a6c4049a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs @@ -1,9 +1,7 @@ use core::slice; -use std::{ - alloc::{self, Layout}, - cmp, - mem::ManuallyDrop, -}; +use std::alloc::{self, Layout}; +use std::cmp; +use std::mem::ManuallyDrop; use super::alignment::{Alignment, ConstAlign}; diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index 525f447b6d..4c671c2652 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -1,11 +1,12 @@ //! See [`FullSlice`]. 
-use crate::virtual_file::{IoBuffer, IoBufferMut}; -use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; + +use bytes::{Bytes, BytesMut}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; use super::write::CheapCloneForRead; +use crate::virtual_file::{IoBuffer, IoBufferMut}; /// The true owned equivalent for Rust [`slice`]. Use this for the write path. /// diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs index 6100593663..9f4a05dd57 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/slice.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -1,7 +1,4 @@ -use tokio_epoll_uring::BoundedBuf; -use tokio_epoll_uring::BoundedBufMut; -use tokio_epoll_uring::IoBufMut; -use tokio_epoll_uring::Slice; +use tokio_epoll_uring::{BoundedBuf, BoundedBufMut, IoBufMut, Slice}; pub(crate) trait SliceMutExt { /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. @@ -35,10 +32,11 @@ where mod tests { use std::io::Read; - use super::*; use bytes::Buf; use tokio_epoll_uring::Slice; + use super::*; + #[test] fn test_slice_full_zeroed() { let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 7299d83703..a7e06c0a14 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,20 +1,14 @@ mod flush; use std::sync::Arc; +pub(crate) use flush::FlushControl; use flush::FlushHandle; use tokio_epoll_uring::IoBuf; -use crate::{ - context::RequestContext, - virtual_file::{IoBuffer, IoBufferMut}, -}; - -use super::{ - io_buf_aligned::IoBufAligned, - io_buf_ext::{FullSlice, IoBufExt}, -}; - -pub(crate) use flush::FlushControl; +use super::io_buf_aligned::IoBufAligned; +use super::io_buf_ext::{FullSlice, IoBufExt}; +use crate::context::RequestContext; 
+use crate::virtual_file::{IoBuffer, IoBufferMut}; pub(crate) trait CheapCloneForRead { /// Returns a cheap clone of the buffer. @@ -37,7 +31,7 @@ pub trait OwnedAsyncWriter { buf: FullSlice, offset: u64, ctx: &RequestContext, - ) -> impl std::future::Future>> + Send; + ) -> impl std::future::Future, std::io::Result<()>)> + Send; } /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch @@ -72,6 +66,7 @@ where buf_new: impl Fn() -> B, gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, + flush_task_span: tracing::Span, ) -> Self { Self { writer: writer.clone(), @@ -81,6 +76,7 @@ where buf_new(), gate_guard, ctx.attached_child(), + flush_task_span, ), bytes_submitted: 0, } @@ -275,12 +271,12 @@ mod tests { buf: FullSlice, offset: u64, _: &RequestContext, - ) -> std::io::Result> { + ) -> (FullSlice, std::io::Result<()>) { self.writes .lock() .unwrap() .push((Vec::from(&buf[..]), offset)); - Ok(buf) + (buf, Ok(())) } } @@ -299,6 +295,7 @@ mod tests { || IoBufferMut::with_capacity(2), gate.enter()?, ctx, + tracing::Span::none(), ); writer.write_buffered_borrowed(b"abc", ctx).await?; diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 9ce8b311bb..e3cf9be438 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -1,13 +1,16 @@ +use std::ops::ControlFlow; use std::sync::Arc; +use once_cell::sync::Lazy; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, info, info_span, warn}; use utils::sync::duplex; -use crate::{ - context::RequestContext, - virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice}, -}; - use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter}; +use crate::context::RequestContext; +use crate::virtual_file::MaybeFatalIo; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAligned; +use 
crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; /// A handle to the flush task. pub struct FlushHandle { @@ -120,6 +123,7 @@ where buf: B, gate_guard: utils::sync::gate::GateGuard, ctx: RequestContext, + span: tracing::Span, ) -> Self where B: Buffer + Send + 'static, @@ -127,11 +131,14 @@ where // It is fine to buffer up to only 1 message. We only 1 message in-flight at a time. let (front, back) = duplex::mpsc::channel(1); - let join_handle = tokio::spawn(async move { - FlushBackgroundTask::new(back, file, gate_guard, ctx) - .run(buf.flush()) - .await - }); + let join_handle = tokio::spawn( + async move { + FlushBackgroundTask::new(back, file, gate_guard, ctx) + .run(buf.flush()) + .await + } + .instrument(span), + ); FlushHandle { inner: Some(FlushHandleInner { @@ -238,6 +245,7 @@ where /// The passed in slice is immediately sent back to the flush handle through the duplex channel. async fn run(mut self, slice: FullSlice) -> std::io::Result> { // Sends the extra buffer back to the handle. + // TODO: can this ever await and or fail? I think not. self.channel.send(slice).await.map_err(|_| { std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early") })?; @@ -253,10 +261,47 @@ where } // Write slice to disk at `offset`. - let slice = self - .writer - .write_all_at(request.slice, request.offset, &self.ctx) - .await?; + // + // Error handling happens according to the current policy of crashing + // on fatal IO errors and retrying in place otherwise (deeming all other errors retryable). + // (The upper layers of the Pageserver write path are not equipped to retry write errors + // becasuse they often deallocate the buffers that were already written). + // + // TODO: cancellation sensitiity. + // Without it, if we hit a bug where retrying is never successful, + // then we can't shut down the timeline/tenant/pageserver cleanly because + // layers of the Pageserver write path are holding the gate open for EphemeralFile. 
+ // + // TODO: use utils::backoff::retry once async closures are actually usable + // + let mut slice_storage = Some(request.slice); + for attempt in 1.. { + let result = async { + if attempt > 1 { + info!("retrying flush"); + } + let slice = slice_storage.take().expect( + "likely previous invocation of this future didn't get polled to completion", + ); + let (slice, res) = self.writer.write_all_at(slice, request.offset, &self.ctx).await; + slice_storage = Some(slice); + let res = res.maybe_fatal_err("owned_buffers_io flush"); + let Err(err) = res else { + return ControlFlow::Break(()); + }; + warn!(%err, "error flushing buffered writer buffer to disk, retrying after backoff"); + static NO_CANCELLATION: Lazy = Lazy::new(CancellationToken::new); + utils::backoff::exponential_backoff(attempt, 1.0, 10.0, &NO_CANCELLATION).await; + ControlFlow::Continue(()) + } + .instrument(info_span!("flush_attempt", %attempt)) + .await; + match result { + ControlFlow::Break(()) => break, + ControlFlow::Continue(()) => continue, + } + } + let slice = slice_storage.expect("loop must have run at least once"); #[cfg(test)] { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 45c87353a7..18df065f76 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -22,39 +22,35 @@ //! bespoken Rust code. 
use std::collections::HashMap; -use std::sync::Arc; -use std::sync::OnceLock; -use std::time::Duration; -use std::time::Instant; -use std::time::SystemTime; +use std::sync::{Arc, OnceLock}; +use std::time::{Duration, Instant, SystemTime}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use bytes::{Buf, Bytes}; -use tracing::*; - -use crate::context::RequestContext; -use crate::metrics::WAL_INGEST; -use crate::pgdatadir_mapping::{DatadirModification, Version}; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::PageReconstructError; -use crate::tenant::Timeline; -use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::fsm_logical_to_physical; -use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::walrecord::*; -use postgres_ffi::TransactionId; -use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; +use postgres_ffi::{ + TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, + fsm_logical_to_physical, pg_constants, +}; +use tracing::*; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; use utils::rate_limit::RateLimit; use utils::{critical, failpoint_support}; use wal_decoder::models::*; +use crate::ZERO_PAGE; +use crate::context::RequestContext; +use crate::metrics::WAL_INGEST; +use crate::pgdatadir_mapping::{DatadirModification, Version}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::{PageReconstructError, Timeline}; + enum_pgversion! {CheckPoint, pgv::CheckPoint} impl CheckPoint { @@ -302,7 +298,9 @@ impl WalIngest { if xid > next_xid { // Wraparound occurred, must be from a prev epoch. 
if epoch == 0 { - bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"); + bail!( + "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}" + ); } epoch -= 1; } @@ -796,9 +794,7 @@ impl WalIngest { // Remove twophase file. see RemoveTwoPhaseFile() in postgres code trace!( "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", - xl_xid, - parsed.xid, - lsn, + xl_xid, parsed.xid, lsn, ); let xid: u64 = if modification.tline.pg_version >= 17 { @@ -1130,16 +1126,14 @@ impl WalIngest { let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", - xlog_checkpoint.oldestXid, - cp.oldestXid + xlog_checkpoint.oldestXid, cp.oldestXid ); if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { cp.oldestXid = xlog_checkpoint.oldestXid; } trace!( "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", - xlog_checkpoint.oldestActiveXid, - cp.oldestActiveXid + xlog_checkpoint.oldestActiveXid, cp.oldestActiveXid ); // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, @@ -1368,8 +1362,9 @@ impl WalIngest { // with zero pages. Logging is rate limited per pg version to // avoid skewing. 
if gap_blocks_filled > 0 { - use once_cell::sync::Lazy; use std::sync::Mutex; + + use once_cell::sync::Lazy; use utils::rate_limit::RateLimit; struct RateLimitPerPgVersion { @@ -1475,10 +1470,7 @@ impl WalIngest { if new_nblocks > old_nblocks { trace!( "extending SLRU {:?} seg {} from {} to {} blocks", - kind, - segno, - old_nblocks, - new_nblocks + kind, segno, old_nblocks, new_nblocks ); modification.put_slru_extend(kind, segno, new_nblocks)?; @@ -1517,13 +1509,13 @@ async fn get_relsize( #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::*; - use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; - use crate::tenant::storage_layer::IoConcurrency; use postgres_ffi::RELSEG_SIZE; + use super::*; use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::*; + use crate::tenant::remote_timeline_client::{INITDB_PATH, remote_initdb_archive_path}; + use crate::tenant::storage_layer::IoConcurrency; /// Arbitrary relation tag, for testing. const TESTREL_A: RelTag = RelTag { @@ -1606,10 +1598,12 @@ mod tests { .await?, false ); - assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) - .await - .is_err()); + assert!( + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .await + .is_err() + ); assert_eq!( tline .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) @@ -1997,10 +1991,12 @@ mod tests { .await?, false ); - assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) - .await - .is_err()); + assert!( + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .await + .is_err() + ); assert_eq!( tline @@ -2230,9 +2226,10 @@ mod tests { /// without waiting for unrelated steps. 
#[tokio::test] async fn test_ingest_real_wal() { - use crate::tenant::harness::*; - use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::WAL_SEGMENT_SIZE; + use postgres_ffi::waldecoder::WalStreamDecoder; + + use crate::tenant::harness::*; // Define test data path and constants. // diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 027a6eb7d7..22d8d83811 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -24,26 +24,27 @@ mod process; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; -use crate::config::PageServerConf; -use crate::metrics::{ - WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, - WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, -}; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; -use std::future::Future; -use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; use tracing::*; use utils::lsn::Lsn; use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; +use crate::config::PageServerConf; +use crate::metrics::{ + WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, +}; + /// The real implementation that uses a Postgres process to /// perform WAL replay. 
/// @@ -547,15 +548,18 @@ impl PostgresRedoManager { #[cfg(test)] mod tests { - use super::PostgresRedoManager; - use crate::config::PageServerConf; + use std::str::FromStr; + use bytes::Bytes; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; - use std::str::FromStr; use tracing::Instrument; - use utils::{id::TenantId, lsn::Lsn}; + use utils::id::TenantId; + use utils::lsn::Lsn; + + use super::PostgresRedoManager; + use crate::config::PageServerConf; #[tokio::test] async fn test_ping() { diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index d62e325310..61ae1eb970 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -4,13 +4,12 @@ use bytes::BytesMut; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::SlruKind; -use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, }; -use postgres_ffi::BLCKSZ; +use postgres_ffi::{BLCKSZ, pg_constants}; use tracing::*; use utils::lsn::Lsn; diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index bf30b92ea5..6d4a38d4ff 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -2,28 +2,28 @@ mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. 
mod protocol; -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - page_cache::PAGE_SZ, - span::debug_assert_current_span_has_tenant_id, -}; +use std::collections::VecDeque; +use std::process::{Command, Stdio}; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::time::Duration; + use anyhow::Context; use bytes::Bytes; use pageserver_api::record::NeonWalRecord; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use pageserver_api::reltag::RelTag; +use pageserver_api::shard::TenantShardId; use postgres_ffi::BLCKSZ; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - process::{Command, Stdio}, - time::Duration, -}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, poison::Poison}; +use tracing::{Instrument, debug, error, instrument}; +use utils::lsn::Lsn; +use utils::poison::Poison; + +use self::no_leak_child::NoLeakChild; +use crate::config::PageServerConf; +use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER, WalRedoKillCause}; +use crate::page_cache::PAGE_SZ; +use crate::span::debug_assert_current_span_has_tenant_id; pub struct WalRedoProcess { #[allow(dead_code)] @@ -136,7 +136,9 @@ impl WalRedoProcess { Ok(0) => break Ok(()), // eof Ok(num_bytes) => { let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); + if !output.contains("LOG:") { + error!(%output, "received output"); + } } Err(e) => { break Err(e); diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs index 1a0d7039df..9939fc4b36 100644 --- a/pageserver/src/walredo/process/no_leak_child.rs +++ b/pageserver/src/walredo/process/no_leak_child.rs @@ -1,19 +1,11 @@ -use tracing::instrument; -use tracing::{error, info}; - -use 
crate::metrics::WalRedoKillCause; -use crate::metrics::WAL_REDO_PROCESS_COUNTERS; - use std::io; -use std::process::Command; - -use std::ops::DerefMut; - -use std::ops::Deref; - -use std::process::Child; +use std::ops::{Deref, DerefMut}; +use std::process::{Child, Command}; use pageserver_api::shard::TenantShardId; +use tracing::{error, info, instrument}; + +use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WalRedoKillCause}; /// Wrapper type around `std::process::Child` which guarantees that the child /// will be killed and waited-for by this process before being dropped. diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index f6a577abfc..f13522e55b 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -1195,9 +1195,11 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, state = GET_STATE(entry, chunk_offs + i); if (state == PENDING) { SET_STATE(entry, chunk_offs + i, REQUESTED); - } else if (state != REQUESTED) { + } else if (state == UNAVAILABLE) { SET_STATE(entry, chunk_offs + i, PENDING); break; + } else if (state == AVAILABLE) { + break; } if (!sleeping) { @@ -1369,6 +1371,10 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->limit; break; + case 8: + key = "file_cache_chunk_size_pages"; + value = BLOCKS_PER_CHUNK; + break; default: SRF_RETURN_DONE(funcctx); } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index f5801b379b..637281fe4a 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -14,6 +14,10 @@ */ #include "postgres.h" +#include + +#include "libpq-int.h" + #include "access/xlog.h" #include "common/hashfn.h" #include "fmgr.h" @@ -61,6 +65,9 @@ int neon_protocol_version = 2; static int max_reconnect_attempts = 60; static int stripe_size; +static int pageserver_response_log_timeout = 10000; +static int pageserver_response_disconnect_timeout = 120000; /* 2 minutes */ + typedef struct { char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE]; @@ -129,6 
+136,11 @@ typedef struct uint64 nrequests_sent; uint64 nresponses_received; + /* State for the receive timeout mechanism in call_PQgetCopyData() */ + instr_time receive_start_time; /* when we started waiting */ + instr_time receive_last_log_time; /* when we last printed a log message for the wait */ + bool receive_logged; /* has the wait been logged */ + /*--- * WaitEventSet containing: * - WL_SOCKET_READABLE on 'conn' @@ -661,6 +673,9 @@ pageserver_connect(shardno_t shard_no, int elevel) shard->state = PS_Connected; shard->nrequests_sent = 0; shard->nresponses_received = 0; + INSTR_TIME_SET_ZERO(shard->receive_start_time); + INSTR_TIME_SET_ZERO(shard->receive_last_log_time); + shard->receive_logged = false; } /* FALLTHROUGH */ case PS_Connected: @@ -680,6 +695,33 @@ pageserver_connect(shardno_t shard_no, int elevel) Assert(false); } +static void +get_socket_stats(int socketfd, int *sndbuf, int *recvbuf) +{ + *sndbuf = -1; + *recvbuf = -1; + +#ifdef __linux__ + /* + * get kernel's send and recv queue size via ioctl + * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 + */ + if (socketfd != -1) + { + int ioctl_err; + + ioctl_err = ioctl(socketfd, SIOCOUTQ, sndbuf); + if (ioctl_err!= 0) { + *sndbuf = -errno; + } + ioctl_err = ioctl(socketfd, FIONREAD, recvbuf); + if (ioctl_err != 0) { + *recvbuf = -errno; + } + } +#endif +} + /* * A wrapper around PQgetCopyData that checks for interrupts while sleeping. */ @@ -690,26 +732,8 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer) PageServer *shard = &page_servers[shard_no]; PGconn *pageserver_conn = shard->conn; instr_time now, - start_ts, since_start, - last_log_ts, since_last_log; - bool logged = false; - - /* - * As a debugging aid, if we don't get a response for a long time, print a - * log message. - * - * 10 s is a very generous threshold, normally we expect a response in a - * few milliseconds. 
We have metrics to track latencies in normal ranges, - * but in the cases that take exceptionally long, it's useful to log the - * exact timestamps. - */ -#define LOG_INTERVAL_MS INT64CONST(10 * 1000) - - INSTR_TIME_SET_CURRENT(now); - start_ts = last_log_ts = now; - INSTR_TIME_SET_ZERO(since_last_log); retry: ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); @@ -718,11 +742,36 @@ retry: { WaitEvent occurred_event; int noccurred; + double log_timeout, + disconnect_timeout; long timeout; - timeout = Max(0, LOG_INTERVAL_MS - INSTR_TIME_GET_MILLISEC(since_last_log)); + /* + * Calculate time elapsed since the start, and since the last progress + * log message. On first call, remember the start time. + */ + INSTR_TIME_SET_CURRENT(now); + if (INSTR_TIME_IS_ZERO(shard->receive_start_time)) + { + shard->receive_start_time = now; + INSTR_TIME_SET_ZERO(since_start); + shard->receive_last_log_time = now; + INSTR_TIME_SET_ZERO(since_last_log); + shard->receive_logged = false; + } + else + { + since_start = now; + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + since_last_log = now; + INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); + } + + /* Sleep until the log or disconnect timeout is reached. 
*/ + log_timeout = Max(0, (double) pageserver_response_log_timeout - INSTR_TIME_GET_MILLISEC(since_last_log)); + disconnect_timeout = Max(0, (double) pageserver_response_disconnect_timeout - INSTR_TIME_GET_MILLISEC(since_start)); + timeout = (long) ceil(Min(log_timeout, disconnect_timeout)); - /* Sleep until there's something to do */ noccurred = WaitEventSetWait(shard->wes_read, timeout, &occurred_event, 1, WAIT_EVENT_NEON_PS_READ); ResetLatch(MyLatch); @@ -740,49 +789,62 @@ retry: pfree(msg); return -1; } + goto retry; + } + + /* Timeout was reached, or we were interrupted for some other reason */ + INSTR_TIME_SET_CURRENT(now); + since_last_log = now; + INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); + since_start = now; + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + + /* + * As a debugging aid, if we don't get a response to a pageserver request + * for a long time, print a log message. + * + * The default neon.pageserver_response_log_timeout value, 10 s, is + * very generous. Normally we expect a response in a few + * milliseconds. We have metrics to track latencies in normal ranges, + * but in the cases that take exceptionally long, it's useful to log + * the exact timestamps. + */ + if (INSTR_TIME_GET_MILLISEC(since_last_log) >= pageserver_response_log_timeout) + { + int sndbuf; + int recvbuf; + + get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); + + neon_shard_log(shard_no, LOG, + "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d) (conn start=%d end=%d)", + INSTR_TIME_GET_DOUBLE(since_start), + shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf, + pageserver_conn->inStart, pageserver_conn->inEnd); + shard->receive_last_log_time = now; + shard->receive_logged = true; } /* - * Print a message to the log if a long time has passed with no - * response. 
+ * If an even longer time has passed without receiving a response from + * the pageserver, disconnect. That triggers a reconnection attempt + * in the caller. + * + * If this happens, the pageserver is likely dead and isn't coming + * back, or there's some kind of a network glitch and the connection + * is permanently gone. Without this, if the pageserver or the network + * connection is dead, it could take a very long time (15 minutes or + * more) until the TCP keepalive timeout notices that. Even if we + * would in fact get a response if we just waited a little longer, + * there's a good chance that we'll get the response sooner by + * reconnecting. */ - INSTR_TIME_SET_CURRENT(now); - since_last_log = now; - INSTR_TIME_SUBTRACT(since_last_log, last_log_ts); - if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS) + if (INSTR_TIME_GET_MILLISEC(since_start) >= pageserver_response_disconnect_timeout) { - int sndbuf = -1; - int recvbuf = -1; -#ifdef __linux__ - int socketfd; -#endif - - since_start = now; - INSTR_TIME_SUBTRACT(since_start, start_ts); - -#ifdef __linux__ - /* - * get kernel's send and recv queue size via ioctl - * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 - */ - socketfd = PQsocket(pageserver_conn); - if (socketfd != -1) { - int ioctl_err; - ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf); - if (ioctl_err!= 0) { - sndbuf = -errno; - } - ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf); - if (ioctl_err != 0) { - recvbuf = -errno; - } - } -#endif - neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", - INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); - last_log_ts = now; - logged = true; + neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting", + 
INSTR_TIME_GET_DOUBLE(since_start)); + pageserver_disconnect(shard_no); + return -1; } goto retry; @@ -792,14 +854,18 @@ retry: * If we logged earlier that the response is taking a long time, log * another message when the response is finally received. */ - if (logged) + if (shard->receive_logged) { INSTR_TIME_SET_CURRENT(now); since_start = now; - INSTR_TIME_SUBTRACT(since_start, start_ts); - neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s", + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + neon_shard_log(shard_no, LOG, + "received response from pageserver after %0.3f s", INSTR_TIME_GET_DOUBLE(since_start)); } + INSTR_TIME_SET_ZERO(shard->receive_start_time); + INSTR_TIME_SET_ZERO(shard->receive_last_log_time); + shard->receive_logged = false; return ret; } @@ -973,9 +1039,17 @@ pageserver_receive(shardno_t shard_no) pfree(msg); } } + else if (rc == -1 && shard->state == PS_Disconnected) + { + /* If the state is 'Disconnected', the disconnection message was already logged */ + resp = NULL; + } else if (rc == -1) { - neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", msg); + pfree(msg); pageserver_disconnect(shard_no); resp = NULL; } @@ -1028,6 +1102,10 @@ pageserver_try_receive(shardno_t shard_no) { neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); pageserver_disconnect(shard_no); + /* + * Malformed responses from PageServer are a reason to raise + * errors and cancel transactions. 
+ */ PG_RE_THROW(); } PG_END_TRY(); @@ -1051,7 +1129,8 @@ pageserver_try_receive(shardno_t shard_no) char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(shard_no); - neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: could not read COPY data: %s", msg); + resp = NULL; } else { @@ -1250,6 +1329,16 @@ pg_init_libpagestore(void) PGC_USERSET, 0, /* no flags required */ NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); + DefineCustomIntVariable("neon.readahead_getpage_pull_timeout", + "readahead response pull timeout", + "Time between active tries to pull data from the " + "PageStream connection when we have pages which " + "were read ahead but not yet received.", + &readahead_getpage_pull_timeout_ms, + 0, 0, 5 * 60 * 1000, + PGC_USERSET, + GUC_UNIT_MS, + NULL, NULL, NULL); DefineCustomIntVariable("neon.protocol_version", "Version of compute<->page server protocol", NULL, @@ -1261,6 +1350,26 @@ pg_init_libpagestore(void) 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.pageserver_response_log_timeout", + "pageserver response log timeout", + "If the pageserver doesn't respond to a request within this timeout, " + "a message is printed to the log.", + &pageserver_response_log_timeout, + 10000, 100, INT_MAX, + PGC_SUSET, + GUC_UNIT_MS, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.pageserver_response_disconnect_timeout", + "pageserver response diconnect timeout", + "If the pageserver doesn't respond to a request within this timeout, " + "disconnect and reconnect.", + &pageserver_response_disconnect_timeout, + 120000, 100, INT_MAX, + PGC_SUSET, + GUC_UNIT_MS, + NULL, NULL, NULL); + relsize_hash_init(); if (page_server != NULL) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 700a942284..0f226cc9e2 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -12,6 +12,7 @@ #include "fmgr.h" 
#include "miscadmin.h" +#include "pgstat.h" #include "access/subtrans.h" #include "access/twophase.h" #include "access/xlog.h" @@ -410,6 +411,16 @@ ReportSearchPath(void) } } +#if PG_VERSION_NUM < 150000 +/* + * PG14 uses separate backend for stats collector having no access to shared memory. + * As far as AUX mechanism requires access to shared memory, persisting pgstat.stat file + * is not supported in PG14. And so there is no definition of neon_pgstat_file_size_limit + * variable, so we have to declare it here. + */ +static int neon_pgstat_file_size_limit; +#endif + void _PG_init(void) { @@ -426,6 +437,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); + pagestore_smgr_init(); Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitUnstableExtensionsSupport(); @@ -445,6 +457,15 @@ _PG_init(void) PGC_SIGHUP, 0, NULL, NULL, NULL); + DefineCustomBoolVariable( + "neon.disable_wal_prevlink_checks", + "Disable validation of prev link in WAL records", + NULL, + &disable_wal_prev_lsn_checks, + false, + PGC_SIGHUP, + 0, + NULL, NULL, NULL); DefineCustomBoolVariable( "neon.allow_replica_misconfig", @@ -467,6 +488,15 @@ _PG_init(void) 0, NULL, NULL, NULL); + DefineCustomIntVariable("neon.pgstat_file_size_limit", + "Maximal size of pgstat.stat file saved in Neon storage", + "Zero value disables persisting pgstat.stat file", + &neon_pgstat_file_size_limit, + 0, 0, 1000000, /* disabled by default */ + PGC_SIGHUP, + GUC_UNIT_KB, + NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 912e09c3d3..c9beb8c318 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -22,6 +22,8 @@ extern char *neon_tenant; extern char *wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; extern int wal_acceptor_connection_timeout; +extern int readahead_getpage_pull_timeout_ms; +extern bool 
disable_wal_prev_lsn_checks; #if PG_MAJORVERSION_NUM >= 17 extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; @@ -49,6 +51,7 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); +extern void pagestore_smgr_init(void); extern uint64 BackpressureThrottlingTime(void); extern void SetNeonCurrentClusterSize(uint64 size); diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 1fb4ed9522..1fad44bd58 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -51,6 +51,26 @@ HexDecodeString(uint8 *result, char *input, int nbytes) return true; } +/* -------------------------------- + * pq_getmsgint16 - get a binary 2-byte int from a message buffer + * -------------------------------- + */ +uint16 +pq_getmsgint16(StringInfo msg) +{ + return pq_getmsgint(msg, 2); +} + +/* -------------------------------- + * pq_getmsgint32 - get a binary 4-byte int from a message buffer + * -------------------------------- + */ +uint32 +pq_getmsgint32(StringInfo msg) +{ + return pq_getmsgint(msg, 4); +} + /* -------------------------------- * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order * -------------------------------- diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 89683714f1..7480ac28cc 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -8,6 +8,8 @@ #endif bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint16 pq_getmsgint16(StringInfo msg); +uint32 pq_getmsgint32(StringInfo msg); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 9faab1e4f0..475697f9c0 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -209,7 +209,11 @@ typedef struct NeonResponse *(*receive) (shardno_t shard_no); /* * Try get the next response from the TCP 
buffers, if any. - * Returns NULL when the data is not yet available. + * Returns NULL when the data is not yet available. + * + * This will raise errors only for malformed responses (we can't put them + * back into connection). All other error conditions are soft errors and + * return NULL as "no response available". */ NeonResponse *(*try_receive) (shardno_t shard_no); /* diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 4a79acd777..ae92be4577 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -65,15 +65,21 @@ #include "storage/fsm_internals.h" #include "storage/md.h" #include "storage/smgr.h" +#include "utils/timeout.h" +#include "bitmap.h" +#include "neon.h" #include "neon_perf_counters.h" #include "pagestore_client.h" -#include "bitmap.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" #endif +#if PG_VERSION_NUM < 160000 +typedef PGAlignedBlock PGIOAlignedBlock; +#endif + /* * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API * calls to md.c, and *also* do the calls to the Page Server. On every @@ -123,6 +129,45 @@ static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); static uint32 local_request_counter; #define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) +/* + * Various settings related to prompt (fast) handling of PageStream responses + * at any CHECK_FOR_INTERRUPTS point. + */ +int readahead_getpage_pull_timeout_ms = 0; +static int PS_TIMEOUT_ID = 0; +static bool timeout_set = false; +static bool timeout_signaled = false; + +/* + * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want + * that to handle any getpage responses if we're already working on the + * backlog of those, as we'd hit issues with determining which prefetch slot + * we just got a response for. 
+ * + * To protect against that, we have this variable that's set whenever we start + * receiving data for prefetch slots, so that we don't get confused. + * + * Note that in certain error cases during readpage we may leak r_r_g=true, + * which results in a failure to pick up further responses until we first + * actively try to receive new getpage responses. + */ +static bool readpage_reentrant_guard = false; + +static void reconfigure_timeout_if_needed(void); +static void pagestore_timeout_handler(void); + +#define START_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = true; \ + } while (false) + +#define END_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = false; \ + if (unlikely(timeout_signaled && !InterruptPending)) \ + InterruptPending = true; \ + } while (false) + /* * Prefetch implementation: * @@ -221,7 +266,6 @@ typedef struct PrfHashEntry #define SH_DEFINE #define SH_DECLARE #include "lib/simplehash.h" -#include "neon.h" /* * PrefetchState maintains the state of (prefetch) getPage@LSN requests. @@ -407,17 +451,26 @@ compact_prefetch_buffers(void) } /* - * If there might be responses still in the TCP buffer, then - * we should try to use those, so as to reduce any TCP backpressure - * on the OS/PS side. + * If there might be responses still in the TCP buffer, then we should try to + * use those, to reduce any TCP backpressure on the OS/PS side. * * This procedure handles that. * - * Note that this is only valid as long as the only pipelined - * operations in the TCP buffer are getPage@Lsn requests. + * Note that this works because we don't pipeline non-getPage requests. + * + * NOTE: This procedure is not allowed to throw errors that should be handled + * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS + * point inside and outside PostgreSQL. + * + * This still does throw errors when it receives malformed responses from PS. 
+ * + * When we're not called from CHECK_FOR_INTERRUPTS (indicated by + * IsHandlingInterrupts) we also report we've ended prefetch receive work, + * just in case state tracking was lost due to an error in the sync getPage + * response code. */ static void -prefetch_pump_state(void) +prefetch_pump_state(bool IsHandlingInterrupts) { while (MyPState->ring_receive != MyPState->ring_flush) { @@ -466,6 +519,12 @@ prefetch_pump_state(void) } } } + + /* We never pump the prefetch state while handling other pages */ + if (!IsHandlingInterrupts) + END_PREFETCH_RECEIVE_WORK(); + + reconfigure_timeout_if_needed(); } void @@ -581,8 +640,8 @@ readahead_buffer_resize(int newsize, void *extra) /* * Make sure that there are no responses still in the buffer. * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. + * This function may indirectly update MyPState->pfs_hash; which invalidates + * any active pointers into the hash table. 
*/ static void consume_prefetch_responses(void) @@ -639,6 +698,7 @@ static bool prefetch_wait_for(uint64 ring_index) { PrefetchRequest *entry; + bool result = true; if (MyPState->ring_flush <= ring_index && MyPState->ring_unused > MyPState->ring_flush) @@ -652,13 +712,21 @@ prefetch_wait_for(uint64 ring_index) while (MyPState->ring_receive <= ring_index) { + START_PREFETCH_RECEIVE_WORK(); entry = GetPrfSlot(MyPState->ring_receive); Assert(entry->status == PRFS_REQUESTED); if (!prefetch_read(entry)) - return false; + { + result = false; + break; + } + + END_PREFETCH_RECEIVE_WORK(); + CHECK_FOR_INTERRUPTS(); } - return true; + + return result; } /* @@ -962,11 +1030,25 @@ prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, n if (!neon_prefetch_response_usable(&lsns[i], slot)) continue; + /* + * Ignore errors + */ + if (slot->response->tag != T_NeonGetPageResponse) + { + if (slot->response->tag != T_NeonErrorResponse) + { + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); + } + continue; + } memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); prefetch_set_unused(ring_index); BITMAP_SET(mask, i); hits += 1; + inc_getpage_wait(0); } } pgBufferUsage.prefetch.hits += hits; @@ -1315,6 +1397,12 @@ page_server_request(void const *req) page_server->disconnect(shard_no); MyNeonCounters->pageserver_open_requests = 0; + /* + * We know for sure we're not working on any prefetch pages after + * this. 
+ */ + END_PREFETCH_RECEIVE_WORK(); + PG_RE_THROW(); } PG_END_TRY(); @@ -1719,7 +1807,7 @@ static XLogRecPtr log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, Page page, bool page_std) { - PGAlignedBlock copied_buffer; + PGIOAlignedBlock copied_buffer; memcpy(copied_buffer.data, page, BLCKSZ); return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std); @@ -1736,7 +1824,7 @@ static XLogRecPtr log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, BlockNumber nblocks, Page *pages, bool page_std) { - PGAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; + PGIOAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; BlockNumber blknos[XLR_MAX_BLOCK_ID]; Page pageptrs[XLR_MAX_BLOCK_ID]; int nregistered = 0; @@ -1774,7 +1862,7 @@ log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, static bool PageIsEmptyHeapPage(char *buffer) { - PGAlignedBlock empty_page; + PGIOAlignedBlock empty_page; PageInit((Page) empty_page.data, BLCKSZ, 0); @@ -2763,7 +2851,7 @@ static void neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, int nblocks, bool skipFsync) { - const PGAlignedBlock buffer = {0}; + const PGIOAlignedBlock buffer = {0}; int remblocks = nblocks; XLogRecPtr lsn = 0; @@ -2810,6 +2898,11 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, relpath(reln->smgr_rlocator, forkNum), InvalidBlockNumber))); +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); +#endif + /* Don't log any pages if we're not allowed to do so. 
*/ if (!XLogInsertAllowed()) return; @@ -2942,7 +3035,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MyPState->ring_last <= ring_index); } - prefetch_pump_state(); + prefetch_pump_state(false); return false; } @@ -2985,7 +3078,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); - prefetch_pump_state(); + prefetch_pump_state(false); return false; } @@ -3029,7 +3122,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3277,7 +3370,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } /* Try to read PS results if they are available */ - prefetch_pump_state(); + prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); @@ -3299,21 +3392,22 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
*/ - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; - char mdbuf[BLCKSZ]; - char mdbuf_masked[BLCKSZ]; + PGIOAlignedBlock mdbuf; + PGIOAlignedBlock mdbuf_masked; + XLogRecPtr request_lsn = request_lsns.request_lsn; - mdread(reln, forkNum, blkno, mdbuf); + mdread(reln, forkNum, blkno, mdbuf.data); memcpy(pageserver_masked, buffer, BLCKSZ); - memcpy(mdbuf_masked, mdbuf, BLCKSZ); + memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); - if (PageIsNew((Page) mdbuf)) + if (PageIsNew((Page) mdbuf.data)) { if (!PageIsNew((Page) pageserver_masked)) { @@ -3332,41 +3426,41 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf)); + hexdump_page(mdbuf.data)); } - else if (PageGetSpecialSize(mdbuf) == 0) + else if (PageGetSpecialSize(mdbuf.data) == 0) { /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } - else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + else if (PageGetSpecialSize(mdbuf.data) == MAXALIGN(sizeof(BTPageOpaqueData))) { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + if (((BTPageOpaqueData *) 
PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID) { /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } @@ -3410,7 +3504,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nblocks, PG_IOV_MAX); /* Try to read PS results if they are available */ - prefetch_pump_state(); + prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks); @@ -3455,80 +3549,88 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
*/ - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL - if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; - char mdbuf[BLCKSZ]; - char mdbuf_masked[BLCKSZ]; + PGIOAlignedBlock mdbuf; + PGIOAlignedBlock mdbuf_masked; + XLogRecPtr request_lsn = request_lsns->request_lsn; for (int i = 0; i < nblocks; i++) { + BlockNumber blkno = blocknum + i; + if (!BITMAP_ISSET(read, i)) + continue; + #if PG_MAJORVERSION_NUM >= 17 - mdreadv(reln, forkNum, blkno + i, &mdbuf, 1); + { + void* mdbuffers[1] = { mdbuf.data }; + mdreadv(reln, forknum, blkno, mdbuffers, 1); + } #else - mdread(reln, forkNum, blkno + i, mdbuf); + mdread(reln, forknum, blkno, mdbuf.data); #endif - memcpy(pageserver_masked, buffer, BLCKSZ); - memcpy(mdbuf_masked, mdbuf, BLCKSZ); + memcpy(pageserver_masked, buffers[i], BLCKSZ); + memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); - if (PageIsNew((Page) mdbuf)) + if (PageIsNew((Page) mdbuf.data)) { if (!PageIsNew((Page) pageserver_masked)) { neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(buffer)); + hexdump_page(buffers[i])); } } - else if (PageIsNew((Page) buffer)) + else if (PageIsNew((Page) buffers[i])) { neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf)); + hexdump_page(mdbuf.data)); } - else if (PageGetSpecialSize(mdbuf) == 0) + else if (PageGetSpecialSize(mdbuf.data) == 0) { /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno); 
RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } - else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + else if (PageGetSpecialSize(mdbuf.data) == MAXALIGN(sizeof(BTPageOpaqueData))) { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID) { /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } @@ -3580,6 +3682,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo switch (reln->smgr_relpersistence) { case 0: +#ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. 
Check if the relation exists locally */ if (mdexists(reln, forknum)) { @@ -3598,6 +3701,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo */ return; } +#endif break; case RELPERSISTENCE_PERMANENT: @@ -3625,7 +3729,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3648,6 +3752,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, switch (reln->smgr_relpersistence) { case 0: +#ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. Check if the relation exists locally */ if (mdexists(reln, forknum)) { @@ -3663,6 +3768,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, */ return; } +#endif break; case RELPERSISTENCE_PERMANENT: @@ -3680,11 +3786,11 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) - mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); #endif } @@ -3971,7 +4077,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -4070,8 +4176,10 @@ neon_start_unlogged_build(SMgrRelation reln) * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? 
*/ - if (!IsParallelWorker()) +#ifndef DEBUG_COMPARE_LOCAL + if (!IsParallelWorker()) mdcreate(reln, MAIN_FORKNUM, false); +#endif } /* @@ -4146,8 +4254,10 @@ neon_end_unlogged_build(SMgrRelation reln) forget_cached_relsize(InfoFromNInfoB(rinfob), forknum); mdclose(reln, forknum); +#ifndef DEBUG_COMPARE_LOCAL /* use isRedo == true, so that we drop it immediately */ mdunlink(rinfob, forknum, true); +#endif } } @@ -4272,6 +4382,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf } pfree(resp); + reconfigure_timeout_if_needed(); return n_blocks; } @@ -4307,6 +4418,7 @@ AtEOXact_neon(XactEvent event, void *arg) } break; } + reconfigure_timeout_if_needed(); } static const struct f_smgr neon_smgr = @@ -4563,3 +4675,94 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) } return no_redo_needed; } + +static void +reconfigure_timeout_if_needed(void) +{ + bool needs_set = MyPState->ring_receive != MyPState->ring_unused && + readahead_getpage_pull_timeout_ms > 0; + + if (needs_set != timeout_set) + { + /* The background writer doens't (shouldn't) read any pages */ + Assert(!AmBackgroundWriterProcess()); + /* The checkpointer doens't (shouldn't) read any pages */ + Assert(!AmCheckpointerProcess()); + + if (unlikely(PS_TIMEOUT_ID == 0)) + { + PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler); + } + + if (needs_set) + { +#if PG_MAJORVERSION_NUM <= 14 + enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms); +#else + enable_timeout_every( + PS_TIMEOUT_ID, + TimestampTzPlusMilliseconds(GetCurrentTimestamp(), + readahead_getpage_pull_timeout_ms), + readahead_getpage_pull_timeout_ms + ); +#endif + timeout_set = true; + } + else + { + Assert(timeout_set); + disable_timeout(PS_TIMEOUT_ID, false); + timeout_set = false; + } + } +} + +static void +pagestore_timeout_handler(void) +{ +#if PG_MAJORVERSION_NUM <= 14 + /* + * PG14: Setting a repeating timeout is not possible, so we signal here + * 
that the timeout has already been reset, and by telling the system + * that system will re-schedule it later if we need to. + */ + timeout_set = false; +#endif + timeout_signaled = true; + InterruptPending = true; +} + +static process_interrupts_callback_t prev_interrupt_cb; + +/* + * Process new data received in our active PageStream sockets. + * + * This relies on the invariant that all pipelined yet-to-be-received requests + * are getPage requests managed by MyPState. This is currently true, any + * modification will probably require some stuff to make it work again. + */ +static bool +pagestore_smgr_processinterrupts(void) +{ + if (timeout_signaled) + { + if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) + prefetch_pump_state(true); + + timeout_signaled = false; + reconfigure_timeout_if_needed(); + } + + if (!prev_interrupt_cb) + return false; + + return prev_interrupt_cb(); +} + + +void +pagestore_smgr_init(void) +{ + prev_interrupt_cb = ProcessInterruptsCallback; + ProcessInterruptsCallback = pagestore_smgr_processinterrupts; +} diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 7472fd6afc..0336d63e8d 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -57,10 +57,11 @@ static void SendProposerGreeting(Safekeeper *sk); static void RecvAcceptorGreeting(Safekeeper *sk); static void SendVoteRequest(Safekeeper *sk); static void RecvVoteResponse(Safekeeper *sk); +static bool VotesCollected(WalProposer *wp); static void HandleElectedProposer(WalProposer *wp); static term_t GetHighestTerm(TermHistory *th); -static term_t GetEpoch(Safekeeper *sk); -static void DetermineEpochStartLsn(WalProposer *wp); +static term_t GetLastLogTerm(Safekeeper *sk); +static void ProcessPropStartPos(WalProposer *wp); static void SendProposerElected(Safekeeper *sk); static void StartStreaming(Safekeeper *sk); static void SendMessageToNode(Safekeeper *sk); @@ -70,6 +71,7 @@ static bool SendAppendRequests(Safekeeper *sk); static bool 
RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); +static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version); static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); @@ -81,6 +83,9 @@ static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); static void UpdateDonorShmem(WalProposer *wp); +static char *MembershipConfigurationToString(MembershipConfiguration *mconf); +static void MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst); +static void MembershipConfigurationFree(MembershipConfiguration *mconf); WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) @@ -93,8 +98,34 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp = palloc0(sizeof(WalProposer)); wp->config = config; wp->api = api; + wp->state = WPS_COLLECTING_TERMS; - for (host = wp->config->safekeepers_list; host != NULL && *host != '\0'; host = sep) + wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list); + + /* + * If safekeepers list starts with g# parse generation number followed by + * : + */ + if (strncmp(wp->config->safekeepers_list, "g#", 2) == 0) + { + char *endptr; + + errno = 0; + wp->safekeepers_generation = strtoul(wp->config->safekeepers_list + 2, &endptr, 10); + if (errno != 0) + { + wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m"); + } + /* Skip past : to the first hostname. 
*/ + host = endptr + 1; + } + else + { + host = wp->config->safekeepers_list; + } + wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation); + + for (; host != NULL && *host != '\0'; host = sep) { port = strchr(host, ':'); if (port == NULL) @@ -137,25 +168,21 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } wp->quorum = wp->n_safekeepers / 2 + 1; + if (wp->config->proto_version != 2 && wp->config->proto_version != 3) + wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version); + wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version); + /* Fill the greeting package */ - wp->greetRequest.tag = 'g'; - wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION; - wp->greetRequest.pgVersion = PG_VERSION_NUM; - wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId)); - wp->greetRequest.systemId = wp->config->systemId; - if (!wp->config->neon_timeline) - wp_log(FATAL, "neon.timeline_id is not provided"); - if (*wp->config->neon_timeline != '\0' && - !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16)) - wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline); + wp->greetRequest.pam.tag = 'g'; if (!wp->config->neon_tenant) wp_log(FATAL, "neon.tenant_id is not provided"); - if (*wp->config->neon_tenant != '\0' && - !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16)) - wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant); - - wp->greetRequest.timeline = wp->config->pgTimeline; - wp->greetRequest.walSegSize = wp->config->wal_segment_size; + wp->greetRequest.tenant_id = wp->config->neon_tenant; + if (!wp->config->neon_timeline) + wp_log(FATAL, "neon.timeline_id is not provided"); + wp->greetRequest.timeline_id = wp->config->neon_timeline; + wp->greetRequest.pg_version = PG_VERSION_NUM; + wp->greetRequest.system_id = wp->config->systemId; + 
wp->greetRequest.wal_seg_size = wp->config->wal_segment_size; wp->api.init_event_set(wp); @@ -165,12 +192,14 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) void WalProposerFree(WalProposer *wp) { + MembershipConfigurationFree(&wp->mconf); for (int i = 0; i < wp->n_safekeepers; i++) { Safekeeper *sk = &wp->safekeeper[i]; Assert(sk->outbuf.data != NULL); pfree(sk->outbuf.data); + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; @@ -182,6 +211,12 @@ WalProposerFree(WalProposer *wp) pfree(wp); } +static bool +WalProposerGenerationsEnabled(WalProposer *wp) +{ + return wp->safekeepers_generation != 0; +} + /* * Create new AppendRequest message and start sending it. This function is * called from walsender every time the new WAL is available. @@ -308,6 +343,7 @@ ShutdownConnection(Safekeeper *sk) sk->state = SS_OFFLINE; sk->streamingAt = InvalidXLogRecPtr; + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; @@ -484,7 +520,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) * nodes are transferred from SS_VOTING to sending actual vote * requests. */ - case SS_VOTING: + case SS_WAIT_VOTING: wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); @@ -513,7 +549,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) /* * Idle state for waiting votes from quorum. */ - case SS_IDLE: + case SS_WAIT_ELECTED: wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); @@ -599,10 +635,17 @@ SendStartWALPush(Safekeeper *sk) { WalProposer *wp = sk->wp; - if (!wp->api.conn_send_query(sk, "START_WAL_PUSH")) + /* Forbid implicit timeline creation if generations are enabled. 
*/ + char *allow_timeline_creation = WalProposerGenerationsEnabled(wp) ? "false" : "true"; +#define CMD_LEN 512 + char cmd[CMD_LEN]; + + + snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d', allow_timeline_creation '%s')", wp->config->proto_version, allow_timeline_creation); + if (!wp->api.conn_send_query(sk, cmd)) { - wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s", + cmd, sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; } @@ -658,23 +701,42 @@ RecvStartWALPushResult(Safekeeper *sk) /* * Start handshake: first of all send information about the - * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for + * walproposer. After sending, we wait on SS_HANDSHAKE_RECV for * a response to finish the handshake. */ static void SendProposerGreeting(Safekeeper *sk) { + WalProposer *wp = sk->wp; + char *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf); + + wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml); + pfree(mconf_toml); + + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest, + &sk->outbuf, wp->config->proto_version); + /* * On failure, logging & resetting the connection is handled. We just need * to handle the control flow. */ - BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV); + BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV); +} + +/* + * Have we received greeting from enough (quorum) safekeepers to start voting? 
+ */ +static bool +TermsCollected(WalProposer *wp) +{ + return wp->n_connected >= wp->quorum; } static void RecvAcceptorGreeting(Safekeeper *sk) { WalProposer *wp = sk->wp; + char *mconf_toml; /* * If our reading doesn't immediately succeed, any necessary error @@ -685,10 +747,25 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term); + mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = %lu, mconf = %s, term=" UINT64_FORMAT, + sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term); + pfree(mconf_toml); + + /* + * Adopt mconf of safekeepers if it is higher. TODO: mconf change should + * restart wp if it started voting. + */ + if (sk->greetResponse.mconf.generation > wp->mconf.generation) + { + MembershipConfigurationFree(&wp->mconf); + MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf); + /* full conf was just logged above */ + wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation); + } /* Protocol is all good, move to voting. */ - sk->state = SS_VOTING; + sk->state = SS_WAIT_VOTING; /* * Note: it would be better to track the counter on per safekeeper basis, @@ -696,23 +773,21 @@ RecvAcceptorGreeting(Safekeeper *sk) * as is for now. */ ++wp->n_connected; - if (wp->n_connected <= wp->quorum) + if (wp->state == WPS_COLLECTING_TERMS) { /* We're still collecting terms from the majority. */ wp->propTerm = Max(sk->greetResponse.term, wp->propTerm); /* Quorum is acquried, prepare the vote request. 
*/ - if (wp->n_connected == wp->quorum) + if (TermsCollected(wp)) { wp->propTerm++; wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); - wp->voteRequest = (VoteRequest) - { - .tag = 'v', - .term = wp->propTerm - }; - memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN); + wp->state = WPS_CAMPAIGN; + wp->voteRequest.pam.tag = 'v'; + wp->voteRequest.generation = wp->mconf.generation; + wp->voteRequest.term = wp->propTerm; } } else if (sk->greetResponse.term > wp->propTerm) @@ -724,12 +799,10 @@ RecvAcceptorGreeting(Safekeeper *sk) } /* - * Check if we have quorum. If there aren't enough safekeepers, wait and - * do nothing. We'll eventually get a task when the election starts. - * - * If we do have quorum, we can start an election. + * If we have quorum, start (or just send vote request to newly connected + * node) election, otherwise wait until we have more greetings. */ - if (wp->n_connected < wp->quorum) + if (wp->state == WPS_COLLECTING_TERMS) { /* * SS_VOTING is an idle state; read-ready indicates the connection @@ -744,11 +817,7 @@ RecvAcceptorGreeting(Safekeeper *sk) */ for (int j = 0; j < wp->n_safekeepers; j++) { - /* - * Remember: SS_VOTING indicates that the safekeeper is - * participating in voting, but hasn't sent anything yet. 
- */ - if (wp->safekeeper[j].state == SS_VOTING) + if (wp->safekeeper[j].state == SS_WAIT_VOTING) SendVoteRequest(&wp->safekeeper[j]); } } @@ -759,12 +828,14 @@ SendVoteRequest(Safekeeper *sk) { WalProposer *wp = sk->wp; - /* We have quorum for voting, send our vote request */ - wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); - /* On failure, logging & resetting is handled */ - if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT)) - return; + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest, + &sk->outbuf, wp->config->proto_version); + /* We have quorum for voting, send our vote request */ + wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port, + wp->voteRequest.generation, wp->voteRequest.term); + /* On failure, logging & resetting is handled */ + BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT); /* If successful, wait for read-ready with SS_WAIT_VERDICT */ } @@ -773,16 +844,19 @@ RecvVoteResponse(Safekeeper *sk) { WalProposer *wp = sk->wp; + Assert(wp->state >= WPS_CAMPAIGN); + sk->voteResponse.apm.tag = 'v'; if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) return; wp_log(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, + sk->voteResponse.voteGiven, + GetHighestTerm(&sk->voteResponse.termHistory), LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + 
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn)); /* * In case of acceptor rejecting our vote, bail out, but only if either it @@ -790,7 +864,7 @@ RecvVoteResponse(Safekeeper *sk) * we are not elected yet and thus need the vote. */ if ((!sk->voteResponse.voteGiven) && - (sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum)) + (sk->voteResponse.term > wp->propTerm || wp->state == WPS_CAMPAIGN)) { wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", sk->host, sk->port, @@ -798,38 +872,83 @@ RecvVoteResponse(Safekeeper *sk) } Assert(sk->voteResponse.term == wp->propTerm); - /* Handshake completed, do we have quorum? */ + /* ready for elected message */ + sk->state = SS_WAIT_ELECTED; + wp->n_votes++; - if (wp->n_votes < wp->quorum) + /* Are we already elected? */ + if (wp->state == WPS_CAMPAIGN) { - sk->state = SS_IDLE; /* can't do much yet, no quorum */ - } - else if (wp->n_votes > wp->quorum) - { - /* already elected, start streaming */ - SendProposerElected(sk); + /* no; check if this vote makes us elected */ + if (VotesCollected(wp)) + { + wp->state = WPS_ELECTED; + HandleElectedProposer(wp); + } + else + { + /* can't do much yet, no quorum */ + return; + } } else { - sk->state = SS_IDLE; - /* Idle state waits for read-ready events */ - wp->api.update_event_set(sk, WL_SOCKET_READABLE); - - HandleElectedProposer(sk->wp); + Assert(wp->state == WPS_ELECTED); + /* send elected only to this sk */ + SendProposerElected(sk); } } +/* + * Checks if enough votes has been collected to get elected and if that's the + * case finds the highest vote, setting donor, donorLastLogTerm, + * propTermStartLsn fields. Also sets truncateLsn. 
+ */ +static bool +VotesCollected(WalProposer *wp) +{ + int n_ready = 0; + + /* assumed to be called only when not elected yet */ + Assert(wp->state == WPS_CAMPAIGN); + + wp->propTermStartLsn = InvalidXLogRecPtr; + wp->donorLastLogTerm = 0; + wp->truncateLsn = InvalidXLogRecPtr; + + for (int i = 0; i < wp->n_safekeepers; i++) + { + if (wp->safekeeper[i].state == SS_WAIT_ELECTED) + { + n_ready++; + + if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm || + (GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm && + wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn)) + { + wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]); + wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn; + wp->donor = i; + } + wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); + } + } + + return n_ready >= wp->quorum; +} + /* * Called once a majority of acceptors have voted for us and current proposer * has been elected. * - * Sends ProposerElected message to all acceptors in SS_IDLE state and starts + * Sends ProposerElected message to all acceptors in SS_WAIT_ELECTED state and starts * replication from walsender. */ static void HandleElectedProposer(WalProposer *wp) { - DetermineEpochStartLsn(wp); + ProcessPropStartPos(wp); + Assert(wp->propTermStartLsn != InvalidXLogRecPtr); /* * Synchronously download WAL from the most advanced safekeeper. We do @@ -841,40 +960,24 @@ HandleElectedProposer(WalProposer *wp) wp_log(FATAL, "failed to download WAL for logical replicaiton"); } - /* - * Zero propEpochStartLsn means majority of safekeepers doesn't have any - * WAL, timeline was just created. Compute bumps it to basebackup LSN, - * otherwise we must be sync-safekeepers and we have nothing to do then. - * - * Proceeding is not only pointless but harmful, because we'd give - * safekeepers term history starting with 0/0. 
These hacks will go away once - * we disable implicit timeline creation on safekeepers and create it with - * non zero LSN from the start. - */ - if (wp->propEpochStartLsn == InvalidXLogRecPtr) - { - Assert(wp->config->syncSafekeepers); - wp_log(LOG, "elected with zero propEpochStartLsn in sync-safekeepers, exiting"); - wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); - } - - if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) + if (wp->truncateLsn == wp->propTermStartLsn && wp->config->syncSafekeepers) { /* Sync is not needed: just exit */ - wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); + wp->api.finish_sync_safekeepers(wp, wp->propTermStartLsn); /* unreachable */ } for (int i = 0; i < wp->n_safekeepers; i++) { - if (wp->safekeeper[i].state == SS_IDLE) + if (wp->safekeeper[i].state == SS_WAIT_ELECTED) SendProposerElected(&wp->safekeeper[i]); } /* * The proposer has been elected, and there will be no quorum waiting - * after this point. There will be no safekeeper with state SS_IDLE also, - * because that state is used only for quorum waiting. + * after this point. There will be no safekeeper with state + * SS_WAIT_ELECTED also, because that state is used only for quorum + * waiting. 
*/ if (wp->config->syncSafekeepers) @@ -891,7 +994,7 @@ HandleElectedProposer(WalProposer *wp) return; } - wp->api.start_streaming(wp, wp->propEpochStartLsn); + wp->api.start_streaming(wp, wp->propTermStartLsn); /* Should not return here */ } @@ -904,7 +1007,7 @@ GetHighestTerm(TermHistory *th) /* safekeeper's epoch is the term of the highest entry in the log */ static term_t -GetEpoch(Safekeeper *sk) +GetLastLogTerm(Safekeeper *sk) { return GetHighestTerm(&sk->voteResponse.termHistory); } @@ -925,98 +1028,52 @@ SkipXLogPageHeader(WalProposer *wp, XLogRecPtr lsn) } /* - * Called after majority of acceptors gave votes, it calculates the most - * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since - * which we'll write WAL in our term. - * - * Sets truncateLsn along the way (though it is not of much use at this point -- - * only for skipping recovery). + * Called after quorum gave votes and proposer starting position (highest vote + * term + flush LSN) -- is determined (VotesCollected true), this function + * adopts it: pushes LSN to shmem, sets wp term history, verifies that the + * basebackup matches. 
*/ static void -DetermineEpochStartLsn(WalProposer *wp) +ProcessPropStartPos(WalProposer *wp) { TermHistory *dth; - int n_ready = 0; WalproposerShmemState *walprop_shared; - wp->propEpochStartLsn = InvalidXLogRecPtr; - wp->donorEpoch = 0; - wp->truncateLsn = InvalidXLogRecPtr; - wp->timelineStartLsn = InvalidXLogRecPtr; - - for (int i = 0; i < wp->n_safekeepers; i++) - { - if (wp->safekeeper[i].state == SS_IDLE) - { - n_ready++; - - if (GetEpoch(&wp->safekeeper[i]) > wp->donorEpoch || - (GetEpoch(&wp->safekeeper[i]) == wp->donorEpoch && - wp->safekeeper[i].voteResponse.flushLsn > wp->propEpochStartLsn)) - { - wp->donorEpoch = GetEpoch(&wp->safekeeper[i]); - wp->propEpochStartLsn = wp->safekeeper[i].voteResponse.flushLsn; - wp->donor = i; - } - wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); - - if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) - { - /* timelineStartLsn should be the same everywhere or unknown */ - if (wp->timelineStartLsn != InvalidXLogRecPtr && - wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn) - { - wp_log(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(wp->timelineStartLsn), - LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); - } - wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn; - } - } - } - - if (n_ready < wp->quorum) - { - /* - * This is a rare case that can be triggered if safekeeper has voted - * and disconnected. In this case, its state will not be SS_IDLE and - * its vote cannot be used, because we clean up `voteResponse` in - * `ShutdownConnection`. - */ - wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready); - } + /* must have collected votes */ + Assert(wp->state == WPS_ELECTED); /* - * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are - * bootstrapping and nothing was committed yet. 
Start streaming then from - * the basebackup LSN. - */ - if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers) - { - wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp); - if (wp->timelineStartLsn == InvalidXLogRecPtr) - { - wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp); - } - wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); - } - pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn); - - /* - * Safekeepers are setting truncateLsn after timelineStartLsn is known, so - * it should never be zero at this point, if we know timelineStartLsn. + * If propTermStartLsn is 0, it means flushLsn is 0 everywhere, we are + * bootstrapping and nothing was committed yet. Start streaming from the + * basebackup LSN then. * - * timelineStartLsn can be zero only on the first syncSafekeepers run. + * In case of sync-safekeepers just exit: proceeding is not only pointless + * but harmful, because we'd give safekeepers term history starting with + * 0/0. These hacks will go away once we disable implicit timeline + * creation on safekeepers and create it with non zero LSN from the start. 
*/ - Assert((wp->truncateLsn != InvalidXLogRecPtr) || - (wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn)); + if (wp->propTermStartLsn == InvalidXLogRecPtr) + { + if (!wp->config->syncSafekeepers) + { + wp->propTermStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp); + wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propTermStartLsn)); + } + else + { + wp_log(LOG, "elected with zero propTermStartLsn in sync-safekeepers, exiting"); + wp->api.finish_sync_safekeepers(wp, wp->propTermStartLsn); + } + } + pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propTermStartLsn); + + Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers); /* - * We will be generating WAL since propEpochStartLsn, so we should set + * We will be generating WAL since propTermStartLsn, so we should set * availableLsn to mark this LSN as the latest available position. */ - wp->availableLsn = wp->propEpochStartLsn; + wp->availableLsn = wp->propTermStartLsn; /* * Proposer's term history is the donor's + its own entry. 
@@ -1027,12 +1084,12 @@ DetermineEpochStartLsn(WalProposer *wp) if (dth->n_entries > 0) memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; - wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn; + wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn; wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", wp->quorum, wp->propTerm, - LSN_FORMAT_ARGS(wp->propEpochStartLsn), + LSN_FORMAT_ARGS(wp->propTermStartLsn), wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, LSN_FORMAT_ARGS(wp->truncateLsn)); @@ -1050,13 +1107,14 @@ DetermineEpochStartLsn(WalProposer *wp) * Safekeepers don't skip header as they need continious stream of * data, so correct LSN for comparison. */ - if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp)) + if (SkipXLogPageHeader(wp, wp->propTermStartLsn) != wp->api.get_redo_start_lsn(wp)) { /* - * However, allow to proceed if last_log_term on the node which gave - * the highest vote (i.e. point where we are going to start writing) - * actually had been won by me; plain restart of walproposer not - * intervened by concurrent compute which wrote WAL is ok. + * However, allow to proceed if last_log_term on the node which + * gave the highest vote (i.e. point where we are going to start + * writing) actually had been won by me; plain restart of + * walproposer not intervened by concurrent compute which wrote + * WAL is ok. * * This avoids compute crash after manual term_bump. 
*/ @@ -1070,8 +1128,8 @@ DetermineEpochStartLsn(WalProposer *wp) */ disable_core_dump(); wp_log(PANIC, - "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", - LSN_FORMAT_ARGS(wp->propEpochStartLsn), + "collected propTermStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(wp->propTermStartLsn), LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } @@ -1126,14 +1184,8 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u", - sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); - - /* - * wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline - * is created manually (test_s3_wal_replay) - */ - Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u", + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries); } else { @@ -1158,29 +1210,19 @@ SendProposerElected(Safekeeper *sk) Assert(sk->startStreamingAt <= wp->availableLsn); - msg.tag = 'e'; + msg.apm.tag = 'e'; + msg.generation = wp->mconf.generation; msg.term = wp->propTerm; msg.startStreamingAt = sk->startStreamingAt; msg.termHistory = &wp->propTermHistory; - msg.timelineStartLsn = wp->timelineStartLsn; lastCommonTerm = idx >= 0 ? 
wp->propTermHistory.entries[idx].term : 0; wp_log(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); - - resetStringInfo(&sk->outbuf); - pq_sendint64_le(&sk->outbuf, msg.tag); - pq_sendint64_le(&sk->outbuf, msg.term); - pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); - pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); - for (int i = 0; i < msg.termHistory->n_entries; i++) - { - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); - } - pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); + "sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", + sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), + lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port); + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version); if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) return; @@ -1246,14 +1288,13 @@ static void PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { Assert(endLsn >= beginLsn); - req->tag = 'a'; + req->apm.tag = 'a'; + req->generation = wp->mconf.generation; req->term = wp->propTerm; - req->epochStartLsn = wp->propEpochStartLsn; req->beginLsn = beginLsn; req->endLsn = endLsn; req->commitLsn = wp->commitLsn; req->truncateLsn = wp->truncateLsn; - req->proposerId = wp->greetRequest.proposerId; } /* @@ -1354,7 +1395,8 @@ SendAppendRequests(Safekeeper *sk) 
resetStringInfo(&sk->outbuf); /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); + PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version); + /* prepare for reading WAL into the outbuf */ enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); sk->active_state = SS_ACTIVE_READ_WAL; } @@ -1367,14 +1409,17 @@ SendAppendRequests(Safekeeper *sk) req = &sk->appendRequest; req_len = req->endLsn - req->beginLsn; - /* We send zero sized AppenRequests as heartbeats; don't wal_read for these. */ + /* + * We send zero sized AppenRequests as heartbeats; don't wal_read + * for these. + */ if (req_len > 0) { switch (wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req_len, - &errmsg)) + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req_len, + &errmsg)) { case NEON_WALREAD_SUCCESS: break; @@ -1382,7 +1427,7 @@ SendAppendRequests(Safekeeper *sk) return true; case NEON_WALREAD_ERROR: wp_log(WARNING, "WAL reading for node %s:%s failed: %s", - sk->host, sk->port, errmsg); + sk->host, sk->port, errmsg); ShutdownConnection(sk); return false; default: @@ -1470,11 +1515,11 @@ RecvAppendResponses(Safekeeper *sk) * Term has changed to higher one, probably another compute is * running. If this is the case we could PANIC as well because * likely it inserted some data and our basebackup is unsuitable - * anymore. However, we also bump term manually (term_bump endpoint) - * on safekeepers for migration purposes, in this case we do want - * compute to stay alive. So restart walproposer with FATAL instead - * of panicking; if basebackup is spoiled next election will notice - * this. + * anymore. However, we also bump term manually (term_bump + * endpoint) on safekeepers for migration purposes, in this case + * we do want compute to stay alive. 
So restart walproposer with + * FATAL instead of panicking; if basebackup is spoiled next + * election will notice this. */ wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, @@ -1509,7 +1554,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese for (i = 0; i < nkeys; i++) { - const char *key = pq_getmsgstring(reply_message); + const char *key = pq_getmsgrawstring(reply_message); unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) @@ -1595,7 +1640,7 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) * Like in Raft, we aren't allowed to commit entries from previous * terms, so ignore reported LSN until it gets to epochStartLsn. */ - responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propEpochStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0; + responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0; } qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn); @@ -1628,10 +1673,10 @@ UpdateDonorShmem(WalProposer *wp) * about its position immediately after election before any feedbacks are * sent. 
*/ - if (wp->safekeeper[wp->donor].state >= SS_IDLE) + if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED) { donor = &wp->safekeeper[wp->donor]; - donor_lsn = wp->propEpochStartLsn; + donor_lsn = wp->propTermStartLsn; } /* @@ -1720,7 +1765,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) for (int i = 0; i < wp->n_safekeepers; i++) { Safekeeper *sk = &wp->safekeeper[i]; - bool synced = sk->appendResponse.commitLsn >= wp->propEpochStartLsn; + bool synced = sk->appendResponse.commitLsn >= wp->propTermStartLsn; /* alive safekeeper which is not synced yet; wait for it */ if (sk->state != SS_OFFLINE && !synced) @@ -1744,12 +1789,220 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) */ BroadcastAppendRequest(wp); - wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); + wp->api.finish_sync_safekeepers(wp, wp->propTermStartLsn); /* unreachable */ } } } +/* Serialize MembershipConfiguration into buf. */ +static void +MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf) +{ + uint32 i; + + pq_sendint32(buf, mconf->generation); + + pq_sendint32(buf, mconf->members.len); + for (i = 0; i < mconf->members.len; i++) + { + pq_sendint64(buf, mconf->members.m[i].node_id); + pq_send_ascii_string(buf, mconf->members.m[i].host); + pq_sendint16(buf, mconf->members.m[i].port); + } + + /* + * There is no special mark for absent new_members; zero members in + * invalid, so zero len means absent. 
+ */ + pq_sendint32(buf, mconf->new_members.len); + for (i = 0; i < mconf->new_members.len; i++) + { + pq_sendint64(buf, mconf->new_members.m[i].node_id); + pq_send_ascii_string(buf, mconf->new_members.m[i].host); + pq_sendint16(buf, mconf->new_members.m[i].port); + } +} + +/* Serialize proposer -> acceptor message into buf using specified version */ +static void +PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version) +{ + /* both version are supported currently until we fully migrate to 3 */ + Assert(proto_version == 3 || proto_version == 2); + + resetStringInfo(buf); + + if (proto_version == 3) + { + /* + * v2 sends structs for some messages as is, so commonly send tag only + * for v3 + */ + pq_sendint8(buf, msg->tag); + + switch (msg->tag) + { + case 'g': + { + ProposerGreeting *m = (ProposerGreeting *) msg; + + pq_send_ascii_string(buf, m->tenant_id); + pq_send_ascii_string(buf, m->timeline_id); + MembershipConfigurationSerialize(&m->mconf, buf); + pq_sendint32(buf, m->pg_version); + pq_sendint64(buf, m->system_id); + pq_sendint32(buf, m->wal_seg_size); + break; + } + case 'v': + { + VoteRequest *m = (VoteRequest *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + break; + + } + case 'e': + { + ProposerElected *m = (ProposerElected *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + pq_sendint64(buf, m->startStreamingAt); + pq_sendint32(buf, m->termHistory->n_entries); + for (uint32 i = 0; i < m->termHistory->n_entries; i++) + { + pq_sendint64(buf, m->termHistory->entries[i].term); + pq_sendint64(buf, m->termHistory->entries[i].lsn); + } + break; + } + case 'a': + { + /* + * Note: this serializes only AppendRequestHeader, caller + * is expected to append WAL data later. 
+ */ + AppendRequestHeader *m = (AppendRequestHeader *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + pq_sendint64(buf, m->beginLsn); + pq_sendint64(buf, m->endLsn); + pq_sendint64(buf, m->commitLsn); + pq_sendint64(buf, m->truncateLsn); + break; + } + default: + wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); + } + return; + } + + if (proto_version == 2) + { + switch (msg->tag) + { + case 'g': + { + /* v2 sent struct as is */ + ProposerGreeting *m = (ProposerGreeting *) msg; + ProposerGreetingV2 greetRequestV2; + + /* Fill also v2 struct. */ + greetRequestV2.tag = 'g'; + greetRequestV2.protocolVersion = proto_version; + greetRequestV2.pgVersion = m->pg_version; + + /* + * v3 removed this field because it's easier to pass as + * libq or START_WAL_PUSH options + */ + memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId)); + greetRequestV2.systemId = wp->config->systemId; + if (*m->timeline_id != '\0' && + !HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16)) + wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id); + if (*m->tenant_id != '\0' && + !HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16)) + wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id); + + greetRequestV2.timeline = wp->config->pgTimeline; + greetRequestV2.walSegSize = wp->config->wal_segment_size; + + pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2)); + break; + } + case 'v': + { + /* v2 sent struct as is */ + VoteRequest *m = (VoteRequest *) msg; + VoteRequestV2 voteRequestV2; + + voteRequestV2.tag = m->pam.tag; + voteRequestV2.term = m->term; + /* removed field */ + memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId)); + pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2)); + break; + } + case 'e': + { + ProposerElected *m = (ProposerElected *) msg; + + pq_sendint64_le(buf, m->apm.tag); + pq_sendint64_le(buf, m->term); + 
pq_sendint64_le(buf, m->startStreamingAt); + pq_sendint32_le(buf, m->termHistory->n_entries); + for (int i = 0; i < m->termHistory->n_entries; i++) + { + pq_sendint64_le(buf, m->termHistory->entries[i].term); + pq_sendint64_le(buf, m->termHistory->entries[i].lsn); + } + + /* + * Removed timeline_start_lsn. Still send it as a valid + * value until safekeepers taking it from term history are + * deployed. + */ + pq_sendint64_le(buf, m->termHistory->entries[0].lsn); + break; + } + case 'a': + + /* + * Note: this serializes only AppendRequestHeader, caller is + * expected to append WAL data later. + */ + { + /* v2 sent struct as is */ + AppendRequestHeader *m = (AppendRequestHeader *) msg; + AppendRequestHeaderV2 appendRequestHeaderV2; + + appendRequestHeaderV2.tag = m->apm.tag; + appendRequestHeaderV2.term = m->term; + appendRequestHeaderV2.epochStartLsn = 0; /* removed field */ + appendRequestHeaderV2.beginLsn = m->beginLsn; + appendRequestHeaderV2.endLsn = m->endLsn; + appendRequestHeaderV2.commitLsn = m->commitLsn; + appendRequestHeaderV2.truncateLsn = m->truncateLsn; + /* removed field */ + memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId)); + + pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2)); + break; + } + + default: + wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); + } + return; + } + wp_log(FATAL, "unexpected proto_version %d", proto_version); +} + /* * Try to read CopyData message from i'th safekeeper, resetting connection on * failure. @@ -1779,6 +2032,37 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) return false; } +/* Deserialize membership configuration from buf to mconf. 
*/ +static void +MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf) +{ + uint32 i; + + mconf->generation = pq_getmsgint32(buf); + mconf->members.len = pq_getmsgint32(buf); + mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len); + for (i = 0; i < mconf->members.len; i++) + { + const char *buf_host; + + mconf->members.m[i].node_id = pq_getmsgint64(buf); + buf_host = pq_getmsgrawstring(buf); + strlcpy(mconf->members.m[i].host, buf_host, sizeof(mconf->members.m[i].host)); + mconf->members.m[i].port = pq_getmsgint16(buf); + } + mconf->new_members.len = pq_getmsgint32(buf); + mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len); + for (i = 0; i < mconf->new_members.len; i++) + { + const char *buf_host; + + mconf->new_members.m[i].node_id = pq_getmsgint64(buf); + buf_host = pq_getmsgrawstring(buf); + strlcpy(mconf->new_members.m[i].host, buf_host, sizeof(mconf->new_members.m[i].host)); + mconf->new_members.m[i].port = pq_getmsgint16(buf); + } +} + /* * Read next message with known type into provided struct, by reading a CopyData * block from the safekeeper's postgres connection, returning whether the read @@ -1787,6 +2071,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) * If the read needs more polling, we return 'false' and keep the state * unmodified, waiting until it becomes read-ready to try again. If it fully * failed, a warning is emitted and the connection is reset. + * + * Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields. 
*/ static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) @@ -1795,82 +2081,154 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) char *buf; int buf_size; - uint64 tag; + uint8 tag; StringInfoData s; if (!(AsyncRead(sk, &buf, &buf_size))) return false; + sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); /* parse it */ s.data = buf; s.len = buf_size; + s.maxlen = buf_size; s.cursor = 0; - tag = pq_getmsgint64_le(&s); - if (tag != anymsg->tag) + if (wp->config->proto_version == 3) { - wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk)); - ResetConnection(sk); - return false; - } - sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); - switch (tag) - { - case 'g': - { - AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->nodeId = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } - - case 'v': - { - VoteResponse *msg = (VoteResponse *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->voteGiven = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->truncateLsn = pq_getmsgint64_le(&s); - msg->termHistory.n_entries = pq_getmsgint32_le(&s); - msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); - for (int i = 0; i < msg->termHistory.n_entries; i++) + tag = pq_getmsgbyte(&s); + if (tag != anymsg->tag) + { + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); + ResetConnection(sk); + return false; + } + switch (tag) + { + case 'g': { - msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); - msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->nodeId = pq_getmsgint64(&s); + MembershipConfigurationDeserialize(&msg->mconf, &s); + msg->term = pq_getmsgint64(&s); + 
pq_getmsgend(&s); + return true; } - msg->timelineStartLsn = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; - case 'a': - { - AppendResponse *msg = (AppendResponse *) anymsg; + msg->generation = pq_getmsgint32(&s); + msg->term = pq_getmsgint64(&s); + msg->voteGiven = pq_getmsgbyte(&s); + msg->flushLsn = pq_getmsgint64(&s); + msg->truncateLsn = pq_getmsgint64(&s); + msg->termHistory.n_entries = pq_getmsgint32(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (uint32 i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64(&s); + } + pq_getmsgend(&s); + return true; + } + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->commitLsn = pq_getmsgint64_le(&s); - msg->hs.ts = pq_getmsgint64_le(&s); - msg->hs.xmin.value = pq_getmsgint64_le(&s); - msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (s.len > s.cursor) - ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); - else - msg->ps_feedback.present = false; - pq_getmsgend(&s); - return true; - } - - default: - { - Assert(false); - return false; - } + msg->generation = pq_getmsgint32(&s); + msg->term = pq_getmsgint64(&s); + msg->flushLsn = pq_getmsgint64(&s); + msg->commitLsn = pq_getmsgint64(&s); + msg->hs.ts = pq_getmsgint64(&s); + msg->hs.xmin.value = pq_getmsgint64(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64(&s); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; + pq_getmsgend(&s); + return true; + } + default: + { + wp_log(FATAL, "unexpected message tag %c to read", (char) tag); + return false; + } + } } + else if (wp->config->proto_version == 2) + { + tag = pq_getmsgint64_le(&s); + if (tag != 
anymsg->tag) + { + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); + ResetConnection(sk); + return false; + } + switch (tag) + { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + pq_getmsgint64_le(&s); /* timelineStartLsn */ + pq_getmsgend(&s); + return true; + } + + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; + pq_getmsgend(&s); + return true; + } + + default: + { + wp_log(FATAL, "unexpected message tag %c to read", (char) tag); + return false; + } + } + } + wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version); + return false; /* keep the compiler quiet */ } /* @@ -2037,7 +2395,7 @@ FormatSafekeeperState(Safekeeper *sk) case SS_HANDSHAKE_RECV: return_val = "handshake (receiving)"; break; - case SS_VOTING: + case SS_WAIT_VOTING: return_val = "voting"; break; case 
SS_WAIT_VERDICT: @@ -2046,7 +2404,7 @@ FormatSafekeeperState(Safekeeper *sk) case SS_SEND_ELECTED_FLUSH: return_val = "send-announcement-flush"; break; - case SS_IDLE: + case SS_WAIT_ELECTED: return_val = "idle"; break; case SS_ACTIVE: @@ -2135,8 +2493,8 @@ SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_even * Idle states use read-readiness as a sign that the connection * has been disconnected. */ - case SS_VOTING: - case SS_IDLE: + case SS_WAIT_VOTING: + case SS_WAIT_ELECTED: *sk_events = WL_SOCKET_READABLE; return; @@ -2246,3 +2604,57 @@ FormatEvents(WalProposer *wp, uint32 events) return (char *) &return_str; } + +/* Dump mconf as toml for observability / debugging. Result is palloc'ed. */ +static char * +MembershipConfigurationToString(MembershipConfiguration *mconf) +{ + StringInfoData s; + uint32 i; + + initStringInfo(&s); + appendStringInfo(&s, "{gen = %u", mconf->generation); + appendStringInfoString(&s, ", members = ["); + for (i = 0; i < mconf->members.len; i++) + { + if (i > 0) + appendStringInfoString(&s, ", "); + appendStringInfo(&s, "{node_id = %lu", mconf->members.m[i].node_id); + appendStringInfo(&s, ", host = %s", mconf->members.m[i].host); + appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port); + } + appendStringInfo(&s, "], new_members = ["); + for (i = 0; i < mconf->new_members.len; i++) + { + if (i > 0) + appendStringInfoString(&s, ", "); + appendStringInfo(&s, "{node_id = %lu", mconf->new_members.m[i].node_id); + appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host); + appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port); + } + appendStringInfoString(&s, "]}"); + return s.data; +} + +static void +MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst) +{ + dst->generation = src->generation; + dst->members.len = src->members.len; + dst->members.m = palloc0(sizeof(SafekeeperId) * dst->members.len); + memcpy(dst->members.m, src->members.m, 
sizeof(SafekeeperId) * dst->members.len); + dst->new_members.len = src->new_members.len; + dst->new_members.m = palloc0(sizeof(SafekeeperId) * dst->new_members.len); + memcpy(dst->new_members.m, src->new_members.m, sizeof(SafekeeperId) * dst->new_members.len); +} + +static void +MembershipConfigurationFree(MembershipConfiguration *mconf) +{ + if (mconf->members.m) + pfree(mconf->members.m); + mconf->members.m = NULL; + if (mconf->new_members.m) + pfree(mconf->new_members.m); + mconf->new_members.m = NULL; +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index d8c44f8182..d116bce806 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -12,9 +12,6 @@ #include "neon_walreader.h" #include "pagestore_client.h" -#define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 2 - #define MAX_SAFEKEEPERS 32 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL * message */ @@ -76,12 +73,12 @@ typedef enum * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a * quorum of handshakes. */ - SS_VOTING, + SS_WAIT_VOTING, /* * Already sent voting information, waiting to receive confirmation from - * the node. After receiving, moves to SS_IDLE, if the quorum isn't - * reached yet. + * the node. After receiving, moves to SS_WAIT_ELECTED, if the quorum + * isn't reached yet. */ SS_WAIT_VERDICT, @@ -94,7 +91,7 @@ typedef enum * * Moves to SS_ACTIVE only by call to StartStreaming. */ - SS_IDLE, + SS_WAIT_ELECTED, /* * Active phase, when we acquired quorum and have WAL to send or feedback @@ -143,12 +140,74 @@ typedef uint64 term_t; /* neon storage node id */ typedef uint64 NNodeId; +/* + * Number uniquely identifying safekeeper membership configuration. + * This and following structs pair ones in membership.rs. + */ +typedef uint32 Generation; + +typedef struct SafekeeperId +{ + NNodeId node_id; + char host[MAXCONNINFO]; + uint16 port; +} SafekeeperId; + +/* Set of safekeepers. 
*/ +typedef struct MemberSet +{ + uint32 len; /* number of members */ + SafekeeperId *m; /* ids themselves */ +} MemberSet; + +/* + * Timeline safekeeper membership configuration as sent in the + * protocol. + */ +typedef struct MembershipConfiguration +{ + Generation generation; + MemberSet members; + /* Has 0 n_members in non joint conf. */ + MemberSet new_members; +} MembershipConfiguration; + /* * Proposer <-> Acceptor messaging. */ +typedef struct ProposerAcceptorMessage +{ + uint8 tag; +} ProposerAcceptorMessage; + /* Initial Proposer -> Acceptor message */ typedef struct ProposerGreeting +{ + ProposerAcceptorMessage pam; /* message tag */ + + /* + * tenant/timeline ids as C strings with standard hex notation for ease of + * printing. In principle they are not strictly needed as ttid is also + * passed as libpq options. + */ + char *tenant_id; + char *timeline_id; + /* Full conf is carried to allow safekeeper switch */ + MembershipConfiguration mconf; + + /* + * pg_version and wal_seg_size are used for timeline creation until we + * fully migrate to doing externally. systemId is only used as a sanity + * cross check. + */ + uint32 pg_version; /* in PG_VERSION_NUM format */ + uint64 system_id; /* Postgres system identifier. */ + uint32 wal_seg_size; +} ProposerGreeting; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct ProposerGreetingV2 { uint64 tag; /* message tag */ uint32 protocolVersion; /* proposer-safekeeper protocol version */ @@ -159,32 +218,42 @@ typedef struct ProposerGreeting uint8 tenant_id[16]; TimeLineID timeline; uint32 walSegSize; -} ProposerGreeting; +} ProposerGreetingV2; typedef struct AcceptorProposerMessage { - uint64 tag; + uint8 tag; } AcceptorProposerMessage; /* - * Acceptor -> Proposer initial response: the highest term acceptor voted for. + * Acceptor -> Proposer initial response: the highest term acceptor voted for, + * its node id and configuration. 
*/ typedef struct AcceptorGreeting { AcceptorProposerMessage apm; - term_t term; NNodeId nodeId; + MembershipConfiguration mconf; + term_t term; } AcceptorGreeting; /* * Proposer -> Acceptor vote request. */ typedef struct VoteRequest +{ + ProposerAcceptorMessage pam; /* message tag */ + Generation generation; /* membership conf generation */ + term_t term; +} VoteRequest; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct VoteRequestV2 { uint64 tag; term_t term; pg_uuid_t proposerId; /* for monitoring/debugging */ -} VoteRequest; +} VoteRequestV2; /* Element of term switching chain. */ typedef struct TermSwitchEntry @@ -203,8 +272,15 @@ typedef struct TermHistory typedef struct VoteResponse { AcceptorProposerMessage apm; + + /* + * Membership conf generation. It's not strictly required because on + * mismatch safekeeper is expected to ERROR the connection, but let's + * sanity check it. + */ + Generation generation; term_t term; - uint64 voteGiven; + uint8 voteGiven; /* * Safekeeper flush_lsn (end of WAL) + history of term switches allow @@ -214,7 +290,6 @@ typedef struct VoteResponse XLogRecPtr truncateLsn; /* minimal LSN which may be needed for* * recovery of some safekeeper */ TermHistory termHistory; - XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ } VoteResponse; /* @@ -223,20 +298,37 @@ typedef struct VoteResponse */ typedef struct ProposerElected { - uint64 tag; + AcceptorProposerMessage apm; + Generation generation; /* membership conf generation */ term_t term; /* proposer will send since this point */ XLogRecPtr startStreamingAt; /* history of term switches up to this proposer */ TermHistory *termHistory; - /* timeline globally starts at this LSN */ - XLogRecPtr timelineStartLsn; } ProposerElected; /* * Header of request with WAL message sent from proposer to safekeeper. 
*/ typedef struct AppendRequestHeader +{ + AcceptorProposerMessage apm; + Generation generation; /* membership conf generation */ + term_t term; /* term of the proposer */ + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + + /* + * minimal LSN which may be needed for recovery of some safekeeper (end + * lsn + 1 of last chunk streamed to everyone) + */ + XLogRecPtr truncateLsn; + /* in the AppendRequest message, WAL data follows */ +} AppendRequestHeader; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct AppendRequestHeaderV2 { uint64 tag; term_t term; /* term of the proposer */ @@ -256,7 +348,8 @@ typedef struct AppendRequestHeader */ XLogRecPtr truncateLsn; pg_uuid_t proposerId; /* for monitoring/debugging */ -} AppendRequestHeader; + /* in the AppendRequest message, WAL data follows */ +} AppendRequestHeaderV2; /* * Hot standby feedback received from replica @@ -309,6 +402,13 @@ typedef struct AppendResponse { AcceptorProposerMessage apm; + /* + * Membership conf generation. It's not strictly required because on + * mismatch safekeeper is expected to ERROR the connection, but let's + * sanity check it. + */ + Generation generation; + /* * Current term of the safekeeper; if it is higher than proposer's, the * compute is out of date. @@ -644,11 +744,22 @@ typedef struct WalProposerConfig /* Will be passed to safekeepers in greet request. */ TimeLineID pgTimeline; + int proto_version; + #ifdef WALPROPOSER_LIB void *callback_data; #endif } WalProposerConfig; +typedef enum +{ + /* collecting greetings to determine term to campaign for */ + WPS_COLLECTING_TERMS, + /* campaing started, waiting for votes */ + WPS_CAMPAIGN, + /* successfully elected */ + WPS_ELECTED, +} WalProposerState; /* * WAL proposer state. 
@@ -656,11 +767,29 @@ typedef struct WalProposerConfig typedef struct WalProposer { WalProposerConfig *config; - int n_safekeepers; + WalProposerState state; + /* Current walproposer membership configuration */ + MembershipConfiguration mconf; /* (n_safekeepers / 2) + 1 */ int quorum; + /* + * Generation of the membership conf of which safekeepers[] are presumably + * members. To make cplane life a bit easier and have more control in + * tests with which sks walproposer gets connected neon.safekeepers GUC + * doesn't provide full mconf, only the list of endpoints to connect to. + * We still would like to know generation associated with it because 1) we + * need some handle to enforce using generations in walproposer, and + * non-zero value of this serves the purpose; 2) currently we don't do + * that, but in theory walproposer can update list of safekeepers to + * connect to upon receiving mconf from safekeepers, and generation number + * must be checked to see which list is newer. + */ + Generation safekeepers_generation; + /* Number of occupied slots in safekeepers[] */ + int n_safekeepers; + /* Safekeepers walproposer is connecting to. 
*/ Safekeeper safekeeper[MAX_SAFEKEEPERS]; /* WAL has been generated up to this point */ @@ -670,6 +799,7 @@ typedef struct WalProposer XLogRecPtr commitLsn; ProposerGreeting greetRequest; + ProposerGreetingV2 greetRequestV2; /* Vote request for safekeeper */ VoteRequest voteRequest; @@ -693,10 +823,10 @@ typedef struct WalProposer TermHistory propTermHistory; /* epoch start lsn of the proposer */ - XLogRecPtr propEpochStartLsn; + XLogRecPtr propTermStartLsn; /* Most advanced acceptor epoch */ - term_t donorEpoch; + term_t donorLastLogTerm; /* Most advanced acceptor */ int donor; diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c index 35d984c52e..a986160224 100644 --- a/pgxn/neon/walproposer_compat.c +++ b/pgxn/neon/walproposer_compat.c @@ -117,14 +117,13 @@ pq_getmsgbytes(StringInfo msg, int datalen) } /* -------------------------------- - * pq_getmsgstring - get a null-terminated text string (with conversion) + * pq_getmsgrawstring - get a null-terminated text string - NO conversion * - * May return a pointer directly into the message buffer, or a pointer - * to a palloc'd conversion result. + * Returns a pointer directly into the message buffer. 
* -------------------------------- */ const char * -pq_getmsgstring(StringInfo msg) +pq_getmsgrawstring(StringInfo msg) { char *str; int slen; @@ -155,6 +154,45 @@ pq_getmsgend(StringInfo msg) ExceptionalCondition("invalid msg format", __FILE__, __LINE__); } +/* -------------------------------- + * pq_sendbytes - append raw data to a StringInfo buffer + * -------------------------------- + */ +void +pq_sendbytes(StringInfo buf, const void *data, int datalen) +{ + /* use variant that maintains a trailing null-byte, out of caution */ + appendBinaryStringInfo(buf, data, datalen); +} + +/* -------------------------------- + * pq_send_ascii_string - append a null-terminated text string (without conversion) + * + * This function intentionally bypasses encoding conversion, instead just + * silently replacing any non-7-bit-ASCII characters with question marks. + * It is used only when we are having trouble sending an error message to + * the client with normal localization and encoding conversion. The caller + * should already have taken measures to ensure the string is just ASCII; + * the extra work here is just to make certain we don't send a badly encoded + * string to the client (which might or might not be robust about that). + * + * NB: passed text string must be null-terminated, and so is the data + * sent to the frontend. + * -------------------------------- + */ +void +pq_send_ascii_string(StringInfo buf, const char *str) +{ + while (*str) + { + char ch = *str++; + + if (IS_HIGHBIT_SET(ch)) + ch = '?'; + appendStringInfoCharMacro(buf, ch); + } + appendStringInfoChar(buf, '\0'); +} /* * Produce a C-string representation of a TimestampTz. 
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 86444084ff..9c34c90002 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -59,9 +59,11 @@ #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" +/* GUCs */ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; +int safekeeper_proto_version = 2; /* Set to true in the walproposer bgw. */ static bool am_walproposer; @@ -126,6 +128,7 @@ init_walprop_config(bool syncSafekeepers) else walprop_config.systemId = 0; walprop_config.pgTimeline = walprop_pg_get_timeline_id(); + walprop_config.proto_version = safekeeper_proto_version; } /* @@ -219,25 +222,37 @@ nwp_register_gucs(void) PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); + + DefineCustomIntVariable( + "neon.safekeeper_proto_version", + "Version of compute <-> safekeeper protocol.", + "Used while migrating from 2 to 3.", + &safekeeper_proto_version, + 2, 0, INT_MAX, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); } static int split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) { - int n_safekeepers = 0; - char *curr_sk = safekeepers_list; + int n_safekeepers = 0; + char *curr_sk = safekeepers_list; for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma) { - if (++n_safekeepers >= MAX_SAFEKEEPERS) { + if (++n_safekeepers >= MAX_SAFEKEEPERS) + { wpg_log(FATAL, "too many safekeepers"); } coma = strchr(coma, ','); - safekeepers[n_safekeepers-1] = curr_sk; + safekeepers[n_safekeepers - 1] = curr_sk; - if (coma != NULL) { + if (coma != NULL) + { *coma++ = '\0'; } } @@ -252,10 +267,10 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) static bool safekeepers_cmp(char *old, char *new) { - char *safekeepers_old[MAX_SAFEKEEPERS]; - char *safekeepers_new[MAX_SAFEKEEPERS]; - int len_old = 0; - int len_new = 0; + char *safekeepers_old[MAX_SAFEKEEPERS]; + char *safekeepers_new[MAX_SAFEKEEPERS]; + int len_old = 0; + int 
len_new = 0; len_old = split_safekeepers_list(old, safekeepers_old); len_new = split_safekeepers_list(new, safekeepers_new); @@ -292,7 +307,8 @@ assign_neon_safekeepers(const char *newval, void *extra) if (!am_walproposer) return; - if (!newval) { + if (!newval) + { /* should never happen */ wpg_log(FATAL, "neon.safekeepers is empty"); } @@ -301,11 +317,11 @@ assign_neon_safekeepers(const char *newval, void *extra) newval_copy = pstrdup(newval); oldval = pstrdup(wal_acceptors_list); - /* + /* * TODO: restarting through FATAL is stupid and introduces 1s delay before - * next bgw start. We should refactor walproposer to allow graceful exit and - * thus remove this delay. - * XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder. + * next bgw start. We should refactor walproposer to allow graceful exit + * and thus remove this delay. XXX: If you change anything here, sync with + * test_safekeepers_reconfigure_reorder. */ if (!safekeepers_cmp(oldval, newval_copy)) { @@ -454,7 +470,8 @@ backpressure_throttling_impl(void) memcpy(new_status, old_status, len); snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag); set_ps_display(new_status); - new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */ + new_status[len] = '\0'; /* truncate off " backpressure ..." 
to later + * reset the ps */ elog(DEBUG2, "backpressure throttling: lag %lu", lag); start = GetCurrentTimestamp(); @@ -621,7 +638,7 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) wpg_log(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = wp->greetRequest.timeline; + cmd.timeline = wp->config->pgTimeline; cmd.startpoint = startpos; StartProposerReplication(wp, &cmd); } @@ -1479,7 +1496,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); - sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, log_prefix); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix); if (sk->xlogreader == NULL) wpg_log(FATAL, "failed to allocate xlog reader"); } @@ -1963,10 +1980,11 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) FullTransactionId xmin = hsFeedback.xmin; FullTransactionId catalog_xmin = hsFeedback.catalog_xmin; FullTransactionId next_xid = ReadNextFullTransactionId(); + /* - * Page server is updating nextXid in checkpoint each 1024 transactions, - * so feedback xmin can be actually larger then nextXid and - * function TransactionIdInRecentPast return false in this case, + * Page server is updating nextXid in checkpoint each 1024 + * transactions, so feedback xmin can be actually larger then nextXid + * and function TransactionIdInRecentPast return false in this case, * preventing update of slot's xmin. 
*/ if (FullTransactionIdPrecedes(next_xid, xmin)) diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index a0fe3822cc..81198d6c8d 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -32,6 +32,8 @@ extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); extern bool GetDonorShmem(XLogRecPtr *donor_lsn); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); +bool disable_wal_prev_lsn_checks = false; + static XLogRecPtr NeonWALReadWaitForWAL(XLogRecPtr loc) { @@ -82,6 +84,8 @@ NeonWALPageRead( if (flushptr < targetPagePtr + reqLen) return -1; + xlogreader->skip_lsn_checks = disable_wal_prev_lsn_checks; + /* Read at most XLOG_BLCKSZ bytes */ if (targetPagePtr + XLOG_BLCKSZ <= flushptr) count = XLOG_BLCKSZ; diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index 74cd5ac601..75b9ab4464 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -32,7 +32,7 @@ #include "inmem_smgr.h" -/* Size of the in-memory smgr */ +/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, so assume that 64 will be enough */ #define MAX_PAGES 64 /* If more than WARN_PAGES are used, print a warning in the log */ @@ -174,10 +174,7 @@ static void inmem_zeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync) { - char buffer[BLCKSZ] = {0}; - - for (int i = 0; i < nblocks; i++) - inmem_extend(reln, forknum, blocknum + i, buffer, skipFsync); + /* Do nothing: inmem_read will return zero page in any case */ } #endif @@ -285,12 +282,12 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * WARN_PAGES, print a warning so that we get alerted and get to * investigate why we're accessing so many buffers. */ - elog(used_pages >= WARN_PAGES ? 
WARNING : DEBUG1, - "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - blocknum, - used_pages); + if (used_pages >= WARN_PAGES) + ereport(WARNING, (errmsg("inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + blocknum, + used_pages), errbacktrace())); if (used_pages == MAX_PAGES) elog(ERROR, "Inmem storage overflow"); diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index 619b7255ae..4673de778c 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -142,7 +142,7 @@ static BufferTag target_redo_tag; static XLogReaderState *reader_state; -#define TRACE LOG +#define TRACE DEBUG1 #ifdef HAVE_LIBSECCOMP @@ -194,6 +194,7 @@ static PgSeccompRule allowed_syscalls[] = * is stored in MyProcPid anyway. */ PG_SCMP_ALLOW(getpid), + PG_SCMP_ALLOW(futex), /* needed for errbacktrace */ /* Enable those for a proper shutdown. */ #if 0 @@ -253,7 +254,7 @@ WalRedoMain(int argc, char *argv[]) * which is super strange but that's not something we can solve * for here. ¯\_(-_-)_/¯ */ - SetConfigOption("log_min_messages", "FATAL", PGC_SUSET, PGC_S_OVERRIDE); + SetConfigOption("log_min_messages", "WARNING", PGC_SUSET, PGC_S_OVERRIDE); SetConfigOption("client_min_messages", "ERROR", PGC_SUSET, PGC_S_OVERRIDE); diff --git a/poetry.lock b/poetry.lock index ba3b0535e4..7c84b2969b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1414,14 +1414,14 @@ files = [ [[package]] name = "jinja2" -version = "3.1.5" +version = "3.1.6" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, - {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, + {file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"}, + {file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"}, ] [package.dependencies] @@ -1491,14 +1491,38 @@ files = [ [[package]] name = "jsonnet" -version = "0.20.0" -description = "Python bindings for Jsonnet - The data templating language" +version = "0.21.0rc2" +description = "Python bindings for Jsonnet - The data templating language " optional = false python-versions = "*" groups = ["main"] -markers = "python_version < \"3.13\"" files = [ - {file = "jsonnet-0.20.0.tar.gz", hash = "sha256:7e770c7bf3a366b97b650a39430450f77612e74406731eb75c5bd59f3f104d4f"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8779ac6820fee44ef736df2baedc3ae93e8cd5d672ee105015c2a47fe627a727"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:99affe8c71e2551465064a8039bb3d1cba27a0b73b2b9ff1b652e06f17d4ea8b"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a9dffb9aa01013d100ddfb7230d1eeb80f2a8eef712b1825a60cad57106d8bd"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cca6c95f2879dcab52650b7aa09a4e82a139b084931b1f6f8c840f834fecc08a"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-win_amd64.whl", hash = "sha256:016d6afdb302a6d00bf3bce6a0c3d9c093b992e33f9bc67c64a868035892258e"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e893ab2c9bf10d8ec9e9b0cee8961879c88d0619cc6d8f75ea284a78e06ae32b"}, + {file = 
"jsonnet-0.21.0rc2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c06b353cd3daa2781e6cd308e05f2f116396376994bcb5f59aaadbc6a752c7f2"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eb2bc8e62b73101329072da322f7e2a1bdb3ac530b94669128d1b480e311e55"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:113766fd0c25620807bcf04d4c739f461c971a4f0e4aece9ba62b4e762de9598"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-win_amd64.whl", hash = "sha256:8dab208c2c2760be60f87d1ceb8b28c86b51ed0e31129a7d90cd5fe890b41225"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:95f5b9dd26a41d6f258d1baa8d22e557051beeed8c52a6202584f1becca9dcb5"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cecc6d76e2b377260fae0a060097c113e6ac361b8f739903ea7f3f5f64cdebdf"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aaa2d18224af7e63872ef4a101e93962505456cf5f5439c3cfc25dad6845f8b1"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2a9063f811554487ed552445e964aeec969cafb266b965029c8d6b091ce47950"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-win_amd64.whl", hash = "sha256:80d171182c169761f744ba50068a4ad35d48e52b91d25bf4c7bb9a72f0a04f71"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3657938f87cb6bc6da20ca631d437b5faf469ca060a7c7def9c8fd2f25a5e06"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3dcebc30cb991b58bc416ee05e9387004d04716d5c0b89714ff042bd069af5c8"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ac52c95482df3ed93c908468ca2f40d4825b6baba284b395ddc47bd663b8c3a"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b34450823a7a1861de892fef9f29de1b4c19e1a79e27d81ffe7e57646cc89d6"}, + {file = 
"jsonnet-0.21.0rc2-cp313-cp313-win_amd64.whl", hash = "sha256:573fd2580e46f4875ec505f1732f9e804b7063cba790342ed6fdafe9a6b30556"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:871ca1411de3626499bda60b330d37f85a592918f99ba4809089bbb8d4f5bfe4"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5d33b25a9c5bf9099100b9b16cb385a2876d891fbe639ee9d476fc75c861903a"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2bac374565c7f89a4675f19fd2b624ed1376519267f4e444f49b6fc0368f6e5"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:fab7bbd88f9159f88a7350701a97bda24de9e3b9eef14c2501ba8b9224160d60"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-win_amd64.whl", hash = "sha256:ed71ffba0fd233a1bca7b0f7be79730792c5383e562a9dc7da152478d9ee1612"}, + {file = "jsonnet-0.21.0rc2.tar.gz", hash = "sha256:2b83ec4b5a771c3732e0972be23a71f042ad2940db6918d3a52aade69bc394fb"}, ] [[package]] @@ -3820,4 +3844,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "9711c5479c867fa614ce3d352f1bbc63dba1cb2376d347f96fbeda6f512ee308" +content-hash = "715fc8c896dcfa1b15054deeddcdec557ef93af91b26e1c8e4688fe4dbef5296" diff --git a/pre-commit.py b/pre-commit.py index c9567e0c50..09139459d5 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -29,12 +29,12 @@ def colorify( return f"{color.value}{s}{NC}" -def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: - cmd = "rustfmt --edition=2021" +def cargo_fmt(fix_inplace: bool = False, no_color: bool = False) -> str: + cmd = "cargo fmt" if not fix_inplace: cmd += " --check" if no_color: - cmd += " --color=never" + cmd += " -- --color=never" return cmd @@ -61,14 +61,23 @@ def get_commit_files() -> list[str]: return files.decode().splitlines() -def check(name: str, suffix: str, cmd: str, changed_files: list[str], no_color: bool = False): +def check( + name: str, 
+ suffix: str, + cmd: str, + changed_files: list[str], + no_color: bool = False, + append_files_to_cmd: bool = True, +): print(f"Checking: {name} ", end="") applicable_files = list(filter(lambda fname: fname.strip().endswith(suffix), changed_files)) if not applicable_files: print(colorify("[NOT APPLICABLE]", Color.CYAN, no_color)) return - cmd = f'{cmd} {" ".join(applicable_files)}' + if append_files_to_cmd: + cmd = f"{cmd} {' '.join(applicable_files)}" + res = subprocess.run(cmd.split(), capture_output=True) if res.returncode != 0: print(colorify("[FAILED]", Color.RED, no_color)) @@ -100,15 +109,13 @@ if __name__ == "__main__": args = parser.parse_args() files = get_commit_files() - # we use rustfmt here because cargo fmt does not accept list of files - # it internally gathers project files and feeds them to rustfmt - # so because we want to check only files included in the commit we use rustfmt directly check( - name="rustfmt", + name="cargo fmt", suffix=".rs", - cmd=rustfmt(fix_inplace=args.fix_inplace, no_color=args.no_color), + cmd=cargo_fmt(fix_inplace=args.fix_inplace, no_color=args.no_color), changed_files=files, no_color=args.no_color, + append_files_to_cmd=False, ) check( name="ruff check", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5964b76ecf..b6e3f03a81 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -53,7 +53,7 @@ measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry = { workspace = true, features = ["trace"] } -papaya = "0.1.8" +papaya = "0.2.0" parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 9c3a3772cd..7a6dceb194 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -35,6 +35,7 @@ impl LocalBackend { endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), project_id: 
ProjectIdTag::get_interner().get_or_intern("local"), branch_id: BranchIdTag::get_interner().get_or_intern("local"), + compute_id: "local".into(), cold_start_info: ColdStartInfo::WarmCached, }, }, diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index dedd225cba..ee7f6ffcd7 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, bail, ensure}; +use arc_swap::ArcSwapOption; use camino::{Utf8Path, Utf8PathBuf}; use clap::Parser; use compute_api::spec::LocalProxySpec; @@ -27,6 +28,7 @@ use crate::config::{ }; use crate::control_plane::locks::ApiLocks; use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use crate::ext::TaskExt; use crate::http::health_server::AppMetrics; use crate::intern::RoleNameInt; use crate::metrics::{Metrics, ThreadPoolMetrics}; @@ -190,7 +192,11 @@ pub async fn run() -> anyhow::Result<()> { // 2. The config file is written but the signal hook is not yet received // 3. local_proxy completes startup but has no config loaded, despite there being a registerd config. 
refresh_config_notify.notify_one(); - tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); + tokio::spawn(refresh_config_loop( + config, + args.config_path, + refresh_config_notify, + )); maintenance_tasks.spawn(crate::http::health_server::task_main( metrics_listener, @@ -269,7 +275,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig }; Ok(Box::leak(Box::new(ProxyConfig { - tls_config: None, + tls_config: ArcSwapOption::from(None), metric_collection: None, http_config, authentication_config: AuthenticationConfig { @@ -311,14 +317,16 @@ enum RefreshConfigError { Parse(#[from] serde_json::Error), #[error(transparent)] Validate(anyhow::Error), + #[error(transparent)] + Tls(anyhow::Error), } -async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { +async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc) { let mut init = true; loop { rx.notified().await; - match refresh_config_inner(&path).await { + match refresh_config_inner(config, &path).await { Ok(()) => {} // don't log for file not found errors if this is the first time we are checking // for computes that don't use local_proxy, this is not an error. 
@@ -327,6 +335,9 @@ async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { { debug!(error=?e, ?path, "could not read config file"); } + Err(RefreshConfigError::Tls(e)) => { + error!(error=?e, ?path, "could not read TLS certificates"); + } Err(e) => { error!(error=?e, ?path, "could not read config file"); } @@ -336,7 +347,10 @@ async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { } } -async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> { +async fn refresh_config_inner( + config: &ProxyConfig, + path: &Utf8Path, +) -> Result<(), RefreshConfigError> { let bytes = tokio::fs::read(&path).await?; let data: LocalProxySpec = serde_json::from_slice(&bytes)?; @@ -406,5 +420,20 @@ async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> info!("successfully loaded new config"); JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); + if let Some(tls_config) = data.tls { + let tls_config = tokio::task::spawn_blocking(move || { + crate::tls::server_config::configure_tls( + &tls_config.key_path, + &tls_config.cert_path, + None, + false, + ) + }) + .await + .propagate_task_panic() + .map_err(RefreshConfigError::Tls)?; + config.tls_config.store(Some(Arc::new(tls_config))); + } + Ok(()) } diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index eec0bf8f99..feca5ccf88 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::bail; +use arc_swap::ArcSwapOption; use futures::future::Either; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; @@ -563,6 +564,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (None, None) => None, _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; + let tls_config = ArcSwapOption::from(tls_config.map(Arc::new)); let backup_metric_collection_config = config::MetricBackupCollectionConfig { 
remote_storage_config: args.metric_backup_collection_remote_storage.clone(), diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 5447a4a4c0..26254beecf 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,3 +1,4 @@ +use std::fmt::Debug; use std::io; use std::net::SocketAddr; use std::time::Duration; @@ -10,7 +11,7 @@ use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; -use tokio::net::TcpStream; +use tokio::net::{TcpStream, lookup_host}; use tracing::{debug, error, info, warn}; use crate::auth::backend::ComputeUserInfo; @@ -180,21 +181,19 @@ impl ConnCfg { use postgres_client::config::Host; // wrap TcpStream::connect with timeout - let connect_with_timeout = |host, port| { - tokio::time::timeout(timeout, TcpStream::connect((host, port))).map( - move |res| match res { - Ok(tcpstream_connect_res) => tcpstream_connect_res, - Err(_) => Err(io::Error::new( - io::ErrorKind::TimedOut, - format!("exceeded connection timeout {timeout:?}"), - )), - }, - ) + let connect_with_timeout = |addrs| { + tokio::time::timeout(timeout, TcpStream::connect(addrs)).map(move |res| match res { + Ok(tcpstream_connect_res) => tcpstream_connect_res, + Err(_) => Err(io::Error::new( + io::ErrorKind::TimedOut, + format!("exceeded connection timeout {timeout:?}"), + )), + }) }; - let connect_once = |host, port| { - debug!("trying to connect to compute node at {host}:{port}"); - connect_with_timeout(host, port).and_then(|stream| async { + let connect_once = |addrs| { + debug!("trying to connect to compute node at {addrs:?}"); + connect_with_timeout(addrs).and_then(|stream| async { let socket_addr = stream.peer_addr()?; let socket = socket2::SockRef::from(&stream); // Disable Nagle's algorithm to not introduce latency between @@ -216,7 +215,12 @@ impl ConnCfg { Host::Tcp(host) => host.as_str(), }; - match connect_once(host, port).await { + let addrs = match 
self.0.get_host_addr() { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => lookup_host((host, port)).await?.collect(), + }; + + match connect_once(&*addrs).await { Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)), Err(err) => { warn!("couldn't connect to compute node at {host}:{port}: {err}"); @@ -277,13 +281,16 @@ impl ConnCfg { } = connection; tracing::Span::current().record("pid", tracing::field::display(process_id)); + tracing::Span::current().record("compute_id", tracing::field::display(&aux.compute_id)); let stream = stream.into_inner(); // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) info!( cold_start_info = ctx.cold_start_info().as_str(), - "connected to compute node at {host} ({socket_addr}) sslmode={:?}", - self.0.get_ssl_mode() + "connected to compute node at {host} ({socket_addr}) sslmode={:?}, latency={}, query_id={}", + self.0.get_ssl_mode(), + ctx.get_proxy_latency(), + ctx.get_testodrome_id().unwrap_or_default(), ); // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. 
diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1bcd22e98f..ad398c122c 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Ok, bail, ensure}; +use arc_swap::ArcSwapOption; use clap::ValueEnum; use remote_storage::RemoteStorageConfig; @@ -17,7 +18,7 @@ pub use crate::tls::server_config::{TlsConfig, configure_tls}; use crate::types::Host; pub struct ProxyConfig { - pub tls_config: Option, + pub tls_config: ArcSwapOption, pub metric_collection: Option, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 4662860b3f..1156545f34 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -177,7 +177,8 @@ pub(crate) async fn handle_client( let proto = ctx.protocol(); let request_gauge = metrics.connection_requests.guard(proto); - let tls = config.tls_config.as_ref(); + let tls = config.tls_config.load(); + let tls = tls.as_deref(); let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 74b48a1bea..7c1a6206c1 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -17,7 +17,8 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::error::ErrorKind; use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ - ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, + ConnectOutcome, InvalidEndpointsGroup, LatencyAccumulated, LatencyTimer, Metrics, Protocol, + Waiting, }; use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra}; use crate::types::{DbName, EndpointId, RoleName}; @@ -55,12 +56,14 @@ struct RequestContextInner { dbname: Option, user: Option, application: Option, + user_agent: Option, 
error_kind: Option, pub(crate) auth_method: Option, jwt_issuer: Option, success: bool, pub(crate) cold_start_info: ColdStartInfo, pg_options: Option, + testodrome_query_id: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. @@ -100,6 +103,7 @@ impl Clone for RequestContext { dbname: inner.dbname.clone(), user: inner.user.clone(), application: inner.application.clone(), + user_agent: inner.user_agent.clone(), error_kind: inner.error_kind, auth_method: inner.auth_method.clone(), jwt_issuer: inner.jwt_issuer.clone(), @@ -107,6 +111,7 @@ impl Clone for RequestContext { rejected: inner.rejected, cold_start_info: inner.cold_start_info, pg_options: inner.pg_options.clone(), + testodrome_query_id: inner.testodrome_query_id.clone(), sender: None, disconnect_sender: None, @@ -149,6 +154,7 @@ impl RequestContext { dbname: None, user: None, application: None, + user_agent: None, error_kind: None, auth_method: None, jwt_issuer: None, @@ -156,6 +162,7 @@ impl RequestContext { rejected: None, cold_start_info: ColdStartInfo::Unknown, pg_options: None, + testodrome_query_id: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), @@ -206,6 +213,19 @@ impl RequestContext { this.set_dbname(dbname.into()); } + // Try to get testodrome_query_id directly from parameters + if let Some(options_str) = options.get("options") { + // If not found directly, try to extract it from the options string + for option in options_str.split_whitespace() { + if option.starts_with("neon_query_id:") { + if let Some(value) = option.strip_prefix("neon_query_id:") { + this.set_testodrome_id(value.to_string()); + break; + } + } + } + } + this.pg_options = Some(options); } @@ -245,6 +265,20 @@ impl RequestContext { .set_user(user); } + pub(crate) fn set_user_agent(&self, user_agent: Option) { + self.0 + .try_lock() + .expect("should not deadlock") + 
.set_user_agent(user_agent); + } + + pub(crate) fn set_testodrome_id(&self, query_id: String) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_testodrome_id(query_id); + } + pub(crate) fn set_auth_method(&self, auth_method: AuthMethod) { let mut this = self.0.try_lock().expect("should not deadlock"); this.auth_method = Some(auth_method); @@ -336,6 +370,22 @@ impl RequestContext { } } + pub(crate) fn get_proxy_latency(&self) -> LatencyAccumulated { + self.0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .accumulated() + } + + pub(crate) fn get_testodrome_id(&self) -> Option { + self.0 + .try_lock() + .expect("should not deadlock") + .testodrome_query_id + .clone() + } + pub(crate) fn success(&self) { self.0 .try_lock() @@ -384,6 +434,10 @@ impl RequestContextInner { } } + fn set_user_agent(&mut self, user_agent: Option) { + self.user_agent = user_agent; + } + fn set_dbname(&mut self, dbname: DbName) { self.dbname = Some(dbname); } @@ -393,6 +447,10 @@ impl RequestContextInner { self.user = Some(user); } + fn set_testodrome_id(&mut self, query_id: String) { + self.testodrome_query_id = Some(query_id); + } + fn has_private_peer_addr(&self) -> bool { match self.conn_info.addr.ip() { IpAddr::V4(ip) => ip.is_private(), diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index f029327266..bfab5f34f9 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -82,6 +82,7 @@ pub(crate) struct RequestData { peer_addr: String, username: Option, application_name: Option, + user_agent: Option, endpoint_id: Option, database: Option, project: Option, @@ -128,6 +129,7 @@ impl From<&RequestContextInner> for RequestData { timestamp: value.first_packet.naive_utc(), username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), + user_agent: value.user_agent.as_deref().map(String::from), endpoint_id: value.endpoint_id.as_deref().map(String::from), database: 
value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), @@ -522,6 +524,7 @@ mod tests { .unwrap() .naive_utc(), application_name: Some("test".to_owned()), + user_agent: Some("test-user-agent".to_owned()), username: Some(hex::encode(rng.r#gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.r#gen::<[u8; 16]>())), database: Some(hex::encode(rng.r#gen::<[u8; 16]>())), @@ -610,15 +613,15 @@ mod tests { assert_eq!( file_stats, [ - (1313105, 3, 6000), - (1313094, 3, 6000), - (1313153, 3, 6000), - (1313110, 3, 6000), - (1313246, 3, 6000), - (1313083, 3, 6000), - (1312877, 3, 6000), - (1313112, 3, 6000), - (438020, 1, 2000) + (1313953, 3, 6000), + (1313942, 3, 6000), + (1314001, 3, 6000), + (1313958, 3, 6000), + (1314094, 3, 6000), + (1313931, 3, 6000), + (1313725, 3, 6000), + (1313960, 3, 6000), + (438318, 1, 2000) ] ); @@ -650,11 +653,11 @@ mod tests { assert_eq!( file_stats, [ - (1204324, 5, 10000), - (1204048, 5, 10000), - (1204349, 5, 10000), - (1204334, 5, 10000), - (1204588, 5, 10000) + (1205810, 5, 10000), + (1205534, 5, 10000), + (1205835, 5, 10000), + (1205820, 5, 10000), + (1206074, 5, 10000) ] ); @@ -679,15 +682,15 @@ mod tests { assert_eq!( file_stats, [ - (1313105, 3, 6000), - (1313094, 3, 6000), - (1313153, 3, 6000), - (1313110, 3, 6000), - (1313246, 3, 6000), - (1313083, 3, 6000), - (1312877, 3, 6000), - (1313112, 3, 6000), - (438020, 1, 2000) + (1313953, 3, 6000), + (1313942, 3, 6000), + (1314001, 3, 6000), + (1313958, 3, 6000), + (1314094, 3, 6000), + (1313931, 3, 6000), + (1313725, 3, 6000), + (1313960, 3, 6000), + (438318, 1, 2000) ] ); @@ -724,7 +727,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658014, 2, 3001), (657728, 2, 3000), (657524, 2, 2999)] + [(658584, 2, 3001), (658298, 2, 3000), (658094, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs 
b/proxy/src/control_plane/client/cplane_proxy_v1.rs index 977fcf4727..2765aaa462 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -1,5 +1,7 @@ //! Production console backend. +use std::net::IpAddr; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -274,11 +276,27 @@ impl NeonControlPlaneClient { Some(x) => x, }; + let host_addr = IpAddr::from_str(host).ok(); + + let ssl_mode = match &body.server_name { + Some(_) => SslMode::Require, + None => SslMode::Disable, + }; + let host_name = match body.server_name { + Some(host) => host, + None => host.to_owned(), + }; + // Don't set anything but host and port! This config will be cached. // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). - let mut config = compute::ConnCfg::new(host.to_owned(), port); - config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + let mut config = compute::ConnCfg::new(host_name, port); + + if let Some(addr) = host_addr { + config.set_host_addr(addr); + } + + config.ssl_mode(ssl_mode); let node = NodeInfo { config, diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 7da5464aa5..ee722e839e 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -1,5 +1,6 @@ //! Mock console backend which relies on a user-provided postgres instance. 
+use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; @@ -167,10 +168,22 @@ impl MockControlPlane { } async fn do_wake_compute(&self) -> Result { - let mut config = compute::ConnCfg::new( - self.endpoint.host_str().unwrap_or("localhost").to_owned(), - self.endpoint.port().unwrap_or(5432), - ); + let port = self.endpoint.port().unwrap_or(5432); + let mut config = match self.endpoint.host_str() { + None => { + let mut config = compute::ConnCfg::new("localhost".to_string(), port); + config.set_host_addr(IpAddr::V4(Ipv4Addr::LOCALHOST)); + config + } + Some(host) => { + let mut config = compute::ConnCfg::new(host.to_string(), port); + if let Ok(addr) = IpAddr::from_str(host) { + config.set_host_addr(addr); + } + config + } + }; + config.ssl_mode(postgres_client::config::SslMode::Disable); let node = NodeInfo { @@ -179,6 +192,7 @@ impl MockControlPlane { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, }; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 8d6b2e96f5..ec4554eab5 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -2,6 +2,7 @@ use std::fmt::{self, Display}; use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; +use smol_str::SmolStr; use crate::auth::IpPattern; use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; @@ -239,6 +240,7 @@ pub(crate) struct GetEndpointAccessControl { #[derive(Debug, Deserialize)] pub(crate) struct WakeCompute { pub(crate) address: Box, + pub(crate) server_name: Option, pub(crate) aux: MetricsAuxInfo, } @@ -312,6 +314,9 @@ pub(crate) struct MetricsAuxInfo { pub(crate) endpoint_id: EndpointIdInt, pub(crate) project_id: ProjectIdInt, pub(crate) branch_id: BranchIdInt, 
+ // note: we don't use interned strings for compute IDs. + // they churn too quickly and we have no way to clean up interned strings. + pub(crate) compute_id: SmolStr, #[serde(default)] pub(crate) cold_start_info: ColdStartInfo, } @@ -378,6 +383,7 @@ mod tests { "endpoint_id": "endpoint", "project_id": "project", "branch_id": "branch", + "compute_id": "compute", "cold_start_info": "unknown", }) } diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3c34918d84..454fe81357 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,9 +1,11 @@ use std::cell::{Cell, RefCell}; use std::collections::HashMap; use std::hash::BuildHasher; -use std::{env, io}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::{array, env, fmt, io}; use chrono::{DateTime, Utc}; +use indexmap::IndexSet; use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; @@ -17,6 +19,7 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields}; use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; use tracing_subscriber::registry::{LookupSpan, SpanRef}; +use try_lock::TryLock; /// Initialize logging and OpenTelemetry tracing and exporter. /// @@ -43,16 +46,17 @@ pub async fn init() -> anyhow::Result { .expect("this should be a valid filter directive"), ); - let otlp_layer = tracing_utils::init_tracing("proxy").await; + let otlp_layer = + tracing_utils::init_tracing("proxy", tracing_utils::ExportConfig::default()).await; let json_log_layer = if logfmt == LogFormat::Json { - Some(JsonLoggingLayer { - clock: RealClock, - skipped_field_indices: papaya::HashMap::default(), - writer: StderrWriter { + Some(JsonLoggingLayer::new( + RealClock, + StderrWriter { stderr: std::io::stderr(), }, - }) + ["request_id", "session_id", "conn_id"], + )) } else { None }; @@ -191,13 +195,39 @@ thread_local! { } /// Implements tracing layer to handle events specific to logging. 
-struct JsonLoggingLayer { +struct JsonLoggingLayer { clock: C, skipped_field_indices: papaya::HashMap, + callsite_ids: papaya::HashMap, writer: W, + // We use a const generic and arrays to bypass one heap allocation. + extract_fields: IndexSet<&'static str>, + _marker: std::marker::PhantomData<[&'static str; F]>, } -impl Layer for JsonLoggingLayer +impl JsonLoggingLayer { + fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self { + JsonLoggingLayer { + clock, + skipped_field_indices: papaya::HashMap::default(), + callsite_ids: papaya::HashMap::default(), + writer, + extract_fields: IndexSet::from_iter(extract_fields), + _marker: std::marker::PhantomData, + } + } + + #[inline] + fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId { + *self + .callsite_ids + .pin() + .get_or_insert_with(cs, CallsiteId::next) + } +} + +impl Layer + for JsonLoggingLayer where S: Subscriber + for<'a> LookupSpan<'a>, { @@ -211,7 +241,14 @@ where let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| { if entered.get() { let mut formatter = EventFormatter::new(); - formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + formatter.format::( + now, + event, + &ctx, + &self.skipped_field_indices, + &self.callsite_ids, + &self.extract_fields, + )?; self.writer.make_writer().write_all(formatter.buffer()) } else { entered.set(true); @@ -219,7 +256,14 @@ where EVENT_FORMATTER.with_borrow_mut(move |formatter| { formatter.reset(); - formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + formatter.format::( + now, + event, + &ctx, + &self.skipped_field_indices, + &self.callsite_ids, + &self.extract_fields, + )?; self.writer.make_writer().write_all(formatter.buffer()) }) } @@ -246,10 +290,13 @@ where let span = ctx.span(id).expect("span must exist"); let fields = SpanFields::default(); fields.record_fields(attrs); + // This could deadlock when there's a panic somewhere in the tracing // event handling and a read or write guard is still 
held. This includes // the OTel subscriber. - span.extensions_mut().insert(fields); + let mut exts = span.extensions_mut(); + + exts.insert(fields); } fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { @@ -265,6 +312,7 @@ where /// wins. fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest { if !metadata.is_event() { + self.callsite_id(metadata.callsite()); // Must not be never because we wouldn't get trace and span data. return Interest::always(); } @@ -297,6 +345,26 @@ where } } +#[derive(Copy, Clone, Debug, Default)] +#[repr(transparent)] +struct CallsiteId(u32); + +impl CallsiteId { + #[inline] + fn next() -> Self { + // Start at 1 to reserve 0 for default. + static COUNTER: AtomicU32 = AtomicU32::new(1); + CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed)) + } +} + +impl fmt::Display for CallsiteId { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + /// Stores span field values recorded during the spans lifetime. #[derive(Default)] struct SpanFields { @@ -448,12 +516,14 @@ impl EventFormatter { self.logline_buffer.clear(); } - fn format( + fn format( &mut self, now: DateTime, event: &Event<'_>, ctx: &Context<'_, S>, skipped_field_indices: &papaya::HashMap, + callsite_ids: &papaya::HashMap, + extract_fields: &IndexSet<&'static str>, ) -> io::Result<()> where S: Subscriber + for<'a> LookupSpan<'a>, @@ -485,6 +555,7 @@ impl EventFormatter { event.record(&mut message_extractor); let mut serializer = message_extractor.into_serializer()?; + // Direct message fields. let mut fields_present = FieldsPresent(false, skipped_field_indices); event.record(&mut fields_present); if fields_present.0 { @@ -494,7 +565,16 @@ impl EventFormatter { )?; } + let spans = SerializableSpans { + ctx, + callsite_ids, + extract: ExtractedSpanFields::<'_, F>::new(extract_fields), + }; + serializer.serialize_entry("spans", &spans)?; + + // TODO: thread-local cache? 
let pid = std::process::id(); + // Skip adding pid 1 to reduce noise for services running in containers. if pid != 1 { serializer.serialize_entry("process_id", &pid)?; } @@ -514,6 +594,7 @@ impl EventFormatter { serializer.serialize_entry("target", meta.target())?; + // Skip adding module if it's the same as target. if let Some(module) = meta.module_path() { if module != meta.target() { serializer.serialize_entry("module", module)?; @@ -540,7 +621,10 @@ impl EventFormatter { } } - serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?; + if spans.extract.has_values() { + // TODO: add fields from event, too? + serializer.serialize_entry("extract", &spans.extract)?; + } serializer.end() }; @@ -818,15 +902,20 @@ impl tracing::field::Visit for MessageFieldSkipper< } } -/// Serializes the span stack from root to leaf (parent of event) enumerated -/// inside an object where the keys are just the number padded with zeroes -/// to retain sorting order. -// The object is necessary because Loki cannot flatten arrays. -struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>) +/// Serializes the span stack from root to leaf (parent of event) as object +/// with the span names as keys. To prevent collision we append a numberic value +/// to the name. Also, collects any span fields we're interested in. Last one +/// wins. 
+struct SerializableSpans<'a, 'ctx, Span, const F: usize> where - Span: Subscriber + for<'lookup> LookupSpan<'lookup>; + Span: Subscriber + for<'lookup> LookupSpan<'lookup>, +{ + ctx: &'a Context<'ctx, Span>, + callsite_ids: &'a papaya::HashMap, + extract: ExtractedSpanFields<'a, F>, +} -impl serde::ser::Serialize for SerializableSpanStack<'_, '_, Span> +impl serde::ser::Serialize for SerializableSpans<'_, '_, Span, F> where Span: Subscriber + for<'lookup> LookupSpan<'lookup>, { @@ -836,9 +925,24 @@ where { let mut serializer = serializer.serialize_map(None)?; - if let Some(leaf_span) = self.0.lookup_current() { - for (i, span) in leaf_span.scope().from_root().enumerate() { - serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?; + if let Some(leaf_span) = self.ctx.lookup_current() { + for span in leaf_span.scope().from_root() { + // Append a numeric callsite ID to the span name to keep the name unique + // in the JSON object. + let cid = self + .callsite_ids + .pin() + .get(&span.metadata().callsite()) + .copied() + .unwrap_or_default(); + + // Loki turns the # into an underscore during field name concatenation. + serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?; + + serializer.serialize_value(&SerializableSpanFields { + span: &span, + extract: &self.extract, + })?; } } @@ -846,28 +950,79 @@ where } } -/// Serializes a single span. Include the span ID, name and its fields as -/// recorded up to this point. -struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>) -where - Span: for<'lookup> LookupSpan<'lookup>; - -impl serde::ser::Serialize for SerializableSpan<'_, '_, Span> +/// Serializes the span fields as object. 
+struct SerializableSpanFields<'a, 'span, Span, const F: usize> where Span: for<'lookup> LookupSpan<'lookup>, { - fn serialize(&self, serializer: Ser) -> Result + span: &'a SpanRef<'span, Span>, + extract: &'a ExtractedSpanFields<'a, F>, +} + +impl serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F> +where + Span: for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: S) -> Result where - Ser: serde::ser::Serializer, + S: serde::ser::Serializer, { let mut serializer = serializer.serialize_map(None)?; - // TODO: the span ID is probably only useful for debugging tracing. - serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?; - serializer.serialize_entry("span_name", self.0.metadata().name())?; - let ext = self.0.extensions(); + let ext = self.span.extensions(); if let Some(data) = ext.get::() { - for (key, value) in &data.fields.pin() { + for (name, value) in &data.fields.pin() { + serializer.serialize_entry(name, value)?; + // TODO: replace clone with reference, if possible. + self.extract.set(name, value.clone()); + } + } + + serializer.end() + } +} + +struct ExtractedSpanFields<'a, const F: usize> { + names: &'a IndexSet<&'static str>, + // TODO: replace TryLock with something local thread and interior mutability. + // serde API doesn't let us use `mut`. 
+ values: TryLock<([Option; F], bool)>, +} + +impl<'a, const F: usize> ExtractedSpanFields<'a, F> { + fn new(names: &'a IndexSet<&'static str>) -> Self { + ExtractedSpanFields { + names, + values: TryLock::new((array::from_fn(|_| Option::default()), false)), + } + } + + #[inline] + fn set(&self, name: &'static str, value: serde_json::Value) { + if let Some((index, _)) = self.names.get_full(name) { + let mut fields = self.values.try_lock().expect("thread-local use"); + fields.0[index] = Some(value); + fields.1 = true; + } + } + + #[inline] + fn has_values(&self) -> bool { + self.values.try_lock().expect("thread-local use").1 + } +} + +impl serde::ser::Serialize for ExtractedSpanFields<'_, F> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + + let values = self.values.try_lock().expect("thread-local use"); + for (i, value) in values.0.iter().enumerate() { + if let Some(value) = value { + let key = self.names[i]; serializer.serialize_entry(key, value)?; } } @@ -879,6 +1034,7 @@ where #[cfg(test)] #[allow(clippy::unwrap_used)] mod tests { + use std::marker::PhantomData; use std::sync::{Arc, Mutex, MutexGuard}; use assert_json_diff::assert_json_eq; @@ -927,14 +1083,17 @@ mod tests { let log_layer = JsonLoggingLayer { clock: clock.clone(), skipped_field_indices: papaya::HashMap::default(), + callsite_ids: papaya::HashMap::default(), writer: buffer.clone(), + extract_fields: IndexSet::from_iter(["x"]), + _marker: PhantomData::<[&'static str; 1]>, }; let registry = tracing_subscriber::Registry::default().with(log_layer); tracing::subscriber::with_default(registry, || { - info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| { - info_span!("span2").in_scope(|| { + info_span!("some_span", x = 24).in_scope(|| { + info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| { tracing::error!( a = 1, a = 2, @@ -960,16 +1119,16 @@ mod tests { "a": 3, }, "spans": { - "00":{ - 
"span_id": "0000000000000001", - "span_name": "span1", - "x": 42, + "some_span#1":{ + "x": 24, }, - "01": { - "span_id": "0000000000000002", - "span_name": "span2", + "some_span#2": { + "x": 42, } }, + "extract": { + "x": 42, + }, "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(), "target": "proxy::logging::tests", "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(), diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index db1f096de1..e5fc0b724b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -30,7 +30,16 @@ pub struct Metrics { static SELF: OnceLock = OnceLock::new(); impl Metrics { pub fn install(thread_pool: Arc) { - SELF.set(Metrics::new(thread_pool)) + let mut metrics = Metrics::new(thread_pool); + + metrics.proxy.errors_total.init_all_dense(); + metrics.proxy.redis_errors_total.init_all_dense(); + metrics.proxy.redis_events_count.init_all_dense(); + metrics.proxy.retries_metric.init_all_dense(); + metrics.proxy.invalid_endpoints_total.init_all_dense(); + metrics.proxy.connection_failures_total.init_all_dense(); + + SELF.set(metrics) .ok() .expect("proxy metrics must not be installed more than once"); } @@ -394,21 +403,34 @@ pub enum RedisMsgKind { HDel, } -#[derive(Default)] -struct Accumulated { +#[derive(Default, Clone)] +pub struct LatencyAccumulated { cplane: time::Duration, client: time::Duration, compute: time::Duration, retry: time::Duration, } +impl std::fmt::Display for LatencyAccumulated { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "client: {}, cplane: {}, compute: {}, retry: {}", + self.client.as_micros(), + self.cplane.as_micros(), + self.compute.as_micros(), + self.retry.as_micros() + ) + } +} + pub struct LatencyTimer { // time since the stopwatch was started start: time::Instant, // time since the stopwatch was stopped stop: Option, // accumulated time on the stopwatch - accumulated: Accumulated, + accumulated: 
LatencyAccumulated, // label data protocol: Protocol, cold_start_info: ColdStartInfo, @@ -422,7 +444,7 @@ impl LatencyTimer { Self { start: time::Instant::now(), stop: None, - accumulated: Accumulated::default(), + accumulated: LatencyAccumulated::default(), protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified @@ -435,7 +457,7 @@ impl LatencyTimer { Self { start: time::Instant::now(), stop: None, - accumulated: Accumulated::default(), + accumulated: LatencyAccumulated::default(), protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified @@ -465,6 +487,10 @@ impl LatencyTimer { // success self.outcome = ConnectOutcome::Success; } + + pub fn accumulated(&self) -> LatencyAccumulated { + self.accumulated.clone() + } } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] @@ -511,7 +537,7 @@ impl Drop for LatencyTimer { duration.saturating_sub(accumulated_total).as_secs_f64(), ); - // Exclude client cplane, compue communication from the accumulated time. + // Exclude client, cplane, compute communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; metric.observe( @@ -524,7 +550,7 @@ impl Drop for LatencyTimer { duration.saturating_sub(accumulated_total).as_secs_f64(), ); - // Exclude client cplane, compue, retry communication from the accumulated time. + // Exclude client, cplane, compute, retry communication from the accumulated time. 
let accumulated_total = self.accumulated.client + self.accumulated.cplane + self.accumulated.compute diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index b8b39fa121..e013fbbe2e 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -81,7 +81,10 @@ impl ConnectMechanism for TcpMechanism<'_> { type ConnectError = compute::ConnectionError; type Error = compute::ConnectionError; - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + compute_id = tracing::field::Empty + ))] async fn connect_once( &self, ctx: &RequestContext, diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 955f754497..2582e4c069 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -114,7 +114,7 @@ pub(crate) async fn handshake( let mut read_buf = read_buf.reader(); let mut res = Ok(()); - let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config()) + let accept = tokio_rustls::TlsAcceptor::from(tls.pg_config.clone()) .accept_with(raw, |session| { // push the early data to the tls session while !read_buf.get_ref().is_empty() { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 0c6d352600..2e7d332a8b 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -278,7 +278,8 @@ pub(crate) async fn handle_client( let proto = ctx.protocol(); let request_gauge = metrics.connection_requests.guard(proto); - let tls = config.tls_config.as_ref(); + let tls = config.tls_config.load(); + let tls = tls.as_deref(); let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 23b9897155..c100b8d716 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -10,7 +10,7 @@ use 
crate::config::ComputeConfig; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; use crate::stream::Stream; -use crate::usage_metrics::{Ids, MetricCounterRecorder, TrafficDirection, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] @@ -24,7 +24,6 @@ pub(crate) async fn proxy_pass( let usage_tx = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, - direction: TrafficDirection::Egress, private_link_id, }); @@ -47,6 +46,7 @@ pub(crate) async fn proxy_pass( |cnt| { // Number of bytes the client sent to the compute node (inbound). metrics.get_metric(m_recv).inc_by(cnt as u64); + usage_tx.record_ingress(cnt as u64); }, ); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 171f539b1e..2c3e70138d 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -96,16 +96,18 @@ fn generate_tls_config<'a>( .with_safe_default_protocol_versions() .context("ring should support the default protocol versions")? .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone_key())? 
- .into(); + .with_single_cert(vec![cert.clone()], key.clone_key())?; let mut cert_resolver = CertResolver::new(); cert_resolver.add_cert(key, vec![cert], true)?; let common_names = cert_resolver.get_common_names(); + let config = Arc::new(config); + TlsConfig { - config, + http_config: config.clone(), + pg_config: config, common_names, cert_resolver: Arc::new(cert_resolver), } @@ -555,6 +557,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, }; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 72029102e0..e40aa024a8 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,4 +1,5 @@ use std::io; +use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; use std::time::Duration; @@ -6,11 +7,15 @@ use async_trait::async_trait; use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; use jose_jwk::jose_b64; +use postgres_client::config::SslMode; use rand::rngs::OsRng; +use rustls::pki_types::{DnsName, ServerName}; use tokio::net::{TcpStream, lookup_host}; +use tokio_rustls::TlsConnector; use tracing::field::display; use tracing::{debug, info}; +use super::AsyncRW; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client}; @@ -190,7 +195,11 @@ impl PoolingBackend { // Wake up the destination if needed. Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. 
- #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + compute_id = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_compute( &self, ctx: &RequestContext, @@ -229,7 +238,10 @@ impl PoolingBackend { } // Wake up the destination if needed - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + compute_id = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_local_proxy( &self, ctx: &RequestContext, @@ -276,7 +288,10 @@ impl PoolingBackend { /// # Panics /// /// Panics if called with a non-local_proxy backend. - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_local_postgres( &self, ctx: &RequestContext, @@ -552,6 +567,15 @@ impl ConnectMechanism for TokioMechanism { let (client, connection) = permit.release_result(res)?; tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); + tracing::Span::current().record( + "compute_id", + tracing::field::display(&node_info.aux.compute_id), + ); + + if let Some(query_id) = ctx.get_testodrome_id() { + info!("latency={}, query_id={}", ctx.get_proxy_latency(), query_id); + } + Ok(poll_client( self.pool.clone(), ctx, @@ -587,16 +611,32 @@ impl ConnectMechanism for HyperMechanism { node_info: &CachedNodeInfo, config: &ComputeConfig, ) -> Result { + let host_addr = node_info.config.get_host_addr(); let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + let tls = if node_info.config.get_ssl_mode() == SslMode::Disable { + None + } else { + Some(&config.tls) + }; + let port = 
node_info.config.get_port(); - let res = connect_http2(&host, port, config.timeout).await; + let res = connect_http2(host_addr, &host, port, config.timeout, tls).await; drop(pause); let (client, connection) = permit.release_result(res)?; + tracing::Span::current().record( + "compute_id", + tracing::field::display(&node_info.aux.compute_id), + ); + + if let Some(query_id) = ctx.get_testodrome_id() { + info!("latency={}, query_id={}", ctx.get_proxy_latency(), query_id); + } + Ok(poll_http2_client( self.pool.clone(), ctx, @@ -612,18 +652,22 @@ impl ConnectMechanism for HyperMechanism { } async fn connect_http2( + host_addr: Option, host: &str, port: u16, timeout: Duration, + tls: Option<&Arc>, ) -> Result<(http_conn_pool::Send, http_conn_pool::Connect), LocalProxyConnError> { - // assumption: host is an ip address so this should not actually perform any requests. - // todo: add that assumption as a guarantee in the control-plane API. - let mut addrs = lookup_host((host, port)) - .await - .map_err(LocalProxyConnError::Io)?; - + let addrs = match host_addr { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => lookup_host((host, port)) + .await + .map_err(LocalProxyConnError::Io)? + .collect(), + }; let mut last_err = None; + let mut addrs = addrs.into_iter(); let stream = loop { let Some(addr) = addrs.next() else { return Err(last_err.unwrap_or_else(|| { @@ -651,6 +695,20 @@ async fn connect_http2( } }; + let stream = if let Some(tls) = tls { + let host = DnsName::try_from(host) + .map_err(io::Error::other) + .map_err(LocalProxyConnError::Io)? 
+ .to_owned(); + let stream = TlsConnector::from(tls.clone()) + .connect(ServerName::DnsName(host), stream) + .await + .map_err(LocalProxyConnError::Io)?; + Box::pin(stream) as AsyncRW + } else { + Box::pin(stream) as AsyncRW + }; + let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) .timer(TokioTimer::new()) .keep_alive_interval(Duration::from_secs(20)) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 6a9089fc2a..516d474a11 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -221,6 +221,7 @@ mod tests { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, conn_id: uuid::Uuid::new_v4(), diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 933204994b..77b548cc43 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -22,7 +22,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::protocol2::ConnectionInfoExtra; use crate::types::{DbName, EndpointCacheKey, RoleName}; -use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; #[derive(Debug, Clone)] pub(crate) struct ConnInfo { @@ -639,11 +639,7 @@ impl Client { (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn metrics( - &self, - direction: TrafficDirection, - ctx: &RequestContext, - ) -> Arc { + pub(crate) fn metrics(&self, ctx: &RequestContext) -> Arc { let aux = &self .inner .as_ref() @@ -659,7 +655,6 @@ impl Client { USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, - direction, private_link_id, 
}) } diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 338a79b4b3..1c6574e57e 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -6,9 +6,9 @@ use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use smol_str::ToSmolStr; -use tokio::net::TcpStream; use tracing::{Instrument, debug, error, info, info_span}; +use super::AsyncRW; use super::backend::HttpConnError; use super::conn_pool_lib::{ ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, ConnPoolEntry, @@ -19,11 +19,10 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::protocol2::ConnectionInfoExtra; use crate::types::EndpointCacheKey; -use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; pub(crate) type Send = http2::SendRequest; -pub(crate) type Connect = - http2::Connection, hyper::body::Incoming, TokioExecutor>; +pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] pub(crate) struct ClientDataHttp(); @@ -266,11 +265,7 @@ impl Client { Self { inner } } - pub(crate) fn metrics( - &self, - direction: TrafficDirection, - ctx: &RequestContext, - ) -> Arc { + pub(crate) fn metrics(&self, ctx: &RequestContext) -> Arc { let aux = &self.inner.aux; let private_link_id = match ctx.extra() { @@ -282,7 +277,6 @@ impl Client { USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, - direction, private_link_id, }) } diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 8426a0810e..c958d077fc 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -35,6 +35,7 @@ use super::conn_pool_lib::{ Client, ClientDataEnum, 
ClientInnerCommon, ClientInnerExt, ConnInfo, DbUserConn, EndpointConnPool, }; +use super::sql_over_http::SqlOverHttpError; use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; @@ -274,18 +275,23 @@ pub(crate) fn poll_client( } impl ClientInnerCommon { - pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { + pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), SqlOverHttpError> { if let ClientDataEnum::Local(local_data) = &mut self.data { local_data.jti += 1; let token = resign_jwt(&local_data.key, payload, local_data.jti)?; - // discard all cannot run in a transaction. must be executed alone. - self.inner.batch_execute("discard all").await?; + self.inner + .discard_all() + .await + .map_err(SqlOverHttpError::InternalPostgres)?; // initiates the auth session // this is safe from query injections as the jwt format free of any escape characters. let query = format!("select auth.jwt_session_init('{token}')"); - self.inner.batch_execute(&query).await?; + self.inner + .batch_execute(&query) + .await + .map_err(SqlOverHttpError::InternalPostgres)?; let pid = self.inner.get_process_id(); info!(pid, jti = local_data.jti, "user session state init"); diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index dd0fb9c5b4..00164d631a 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -19,6 +19,7 @@ use std::pin::{Pin, pin}; use std::sync::Arc; use anyhow::Context; +use arc_swap::ArcSwapOption; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; @@ -117,18 +118,7 @@ pub async fn task_main( auth_backend, endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); - let tls_acceptor: Arc = match config.tls_config.as_ref() { - Some(config) => { - let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config()); - // prefer http2, but support 
http/1.1 - tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; - Arc::new(tls_server_config) - } - None => { - warn!("TLS config is missing"); - Arc::new(NoTls) - } - }; + let tls_acceptor: Arc = Arc::new(&config.tls_config); let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` @@ -216,22 +206,20 @@ pub(crate) type AsyncRW = Pin>; #[async_trait] trait MaybeTlsAcceptor: Send + Sync + 'static { - async fn accept(self: Arc, conn: ChainRW) -> std::io::Result; + async fn accept(&self, conn: ChainRW) -> std::io::Result; } #[async_trait] -impl MaybeTlsAcceptor for rustls::ServerConfig { - async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { - Ok(Box::pin(TlsAcceptor::from(self).accept(conn).await?)) - } -} - -struct NoTls; - -#[async_trait] -impl MaybeTlsAcceptor for NoTls { - async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { - Ok(Box::pin(conn)) +impl MaybeTlsAcceptor for &'static ArcSwapOption { + async fn accept(&self, conn: ChainRW) -> std::io::Result { + match &*self.load() { + Some(config) => Ok(Box::pin( + TlsAcceptor::from(config.http_config.clone()) + .accept(conn) + .await?, + )), + None => Ok(Box::pin(conn)), + } } } @@ -438,6 +426,23 @@ async fn request_handler( &config.region, ); + ctx.set_user_agent( + request + .headers() + .get(hyper::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .map(Into::into), + ); + + let testodrome_id = request + .headers() + .get("X-Neon-Query-ID") + .map(|value| value.to_str().unwrap_or_default().to_string()); + + if let Some(query_id) = testodrome_id { + ctx.set_testodrome_id(query_id); + } + let span = ctx.span(); info!(parent: &span, "performing websocket upgrade"); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 8babfb5cd2..10e378a18d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -42,7 +42,7 
@@ use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; -use crate::usage_metrics::{MetricCounter, MetricCounterRecorder, TrafficDirection}; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -228,6 +228,13 @@ fn get_conn_info( } } + ctx.set_user_agent( + headers + .get(hyper::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .map(Into::into), + ); + let user_info = ComputeUserInfo { endpoint, user: username, @@ -405,8 +412,12 @@ pub(crate) enum SqlOverHttpError { ResponseTooLarge(usize), #[error("invalid isolation level")] InvalidIsolationLevel, + /// for queries our customers choose to run #[error("{0}")] - Postgres(#[from] postgres_client::Error), + Postgres(#[source] postgres_client::Error), + /// for queries we choose to run + #[error("{0}")] + InternalPostgres(#[source] postgres_client::Error), #[error("{0}")] JsonConversion(#[from] JsonConversionError), #[error("{0}")] @@ -422,6 +433,13 @@ impl ReportableError for SqlOverHttpError { SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User, SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, SqlOverHttpError::Postgres(p) => p.get_error_kind(), + SqlOverHttpError::InternalPostgres(p) => { + if p.as_db_error().is_some() { + ErrorKind::Service + } else { + ErrorKind::Compute + } + } SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres, SqlOverHttpError::Cancelled(c) => c.get_error_kind(), } @@ -437,6 +455,7 @@ impl UserFacingError for SqlOverHttpError { SqlOverHttpError::ResponseTooLarge(_) => self.to_string(), SqlOverHttpError::InvalidIsolationLevel => self.to_string(), SqlOverHttpError::Postgres(p) => p.to_string(), + SqlOverHttpError::InternalPostgres(p) => p.to_string(), SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(), 
SqlOverHttpError::Cancelled(_) => self.to_string(), } @@ -455,6 +474,7 @@ impl HttpCodeError for SqlOverHttpError { SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE, SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST, SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::InternalPostgres(_) => StatusCode::INTERNAL_SERVER_ERROR, SqlOverHttpError::JsonConversion(_) => StatusCode::INTERNAL_SERVER_ERROR, SqlOverHttpError::Cancelled(_) => StatusCode::INTERNAL_SERVER_ERROR, } @@ -594,7 +614,9 @@ async fn handle_inner( &config.authentication_config, ctx, request.headers(), - config.tls_config.as_ref(), + // todo: race condition? + // we're unlikely to change the common names. + config.tls_config.load().as_deref(), )?; info!( user = conn_info.conn_info.user_info.user.as_str(), @@ -641,6 +663,7 @@ async fn handle_db_inner( let parsed_headers = HttpHeaders::try_parse(headers)?; + let mut request_len = 0; let fetch_and_process_request = Box::pin( async { let body = read_body_with_limit( @@ -649,6 +672,8 @@ async fn handle_db_inner( ) .await?; + request_len = body.len(); + Metrics::get() .proxy .http_conn_content_length_bytes @@ -664,16 +689,14 @@ async fn handle_db_inner( let authenticate_and_connect = Box::pin( async { let keys = match auth { - AuthData::Password(pw) => { - backend - .authenticate_with_password(ctx, &conn_info.user_info, &pw) - .await? - } - AuthData::Jwt(jwt) => { - backend - .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) - .await? 
- } + AuthData::Password(pw) => backend + .authenticate_with_password(ctx, &conn_info.user_info, &pw) + .await + .map_err(HttpConnError::AuthError)?, + AuthData::Jwt(jwt) => backend + .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) + .await + .map_err(HttpConnError::AuthError)?, }; let client = match keys.keys { @@ -696,7 +719,7 @@ async fn handle_db_inner( // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else ctx.success(); - Ok::<_, HttpConnError>(client) + Ok::<_, SqlOverHttpError>(client) } .map_err(SqlOverHttpError::from), ); @@ -745,7 +768,7 @@ async fn handle_db_inner( } }; - let metrics = client.metrics(TrafficDirection::Egress, ctx); + let metrics = client.metrics(ctx); let len = json_output.len(); let response = response @@ -761,6 +784,8 @@ async fn handle_db_inner( // count the egress bytes - we miss the TLS and header overhead but oh well... // moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); + metrics.record_ingress(request_len as u64); + Metrics::get() .proxy .http_conn_content_length_bytes @@ -818,7 +843,7 @@ async fn handle_auth_broker_inner( .expect("all headers and params received via hyper should be valid for request"); // todo: map body to count egress - let _metrics = client.metrics(TrafficDirection::Egress, ctx); + let _metrics = client.metrics(ctx); Ok(client .inner @@ -842,7 +867,13 @@ impl QueryData { let cancel_token = inner.cancel_token(); let res = match select( - pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)), + pin!(query_to_json( + config, + &mut *inner, + self, + &mut 0, + parsed_headers + )), pin!(cancel.cancelled()), ) .await @@ -926,16 +957,20 @@ impl BatchQueryData { builder = builder.deferrable(true); } - let transaction = builder.start().await.inspect_err(|_| { - // if we cannot start a transaction, we should return immediately - // and not return to the pool. 
connection is clearly broken - discard.discard(); - })?; + let mut transaction = builder + .start() + .await + .inspect_err(|_| { + // if we cannot start a transaction, we should return immediately + // and not return to the pool. connection is clearly broken + discard.discard(); + }) + .map_err(SqlOverHttpError::Postgres)?; let json_output = match query_batch( config, cancel.child_token(), - &transaction, + &mut transaction, self, parsed_headers, ) @@ -943,11 +978,15 @@ impl BatchQueryData { { Ok(json_output) => { info!("commit"); - let status = transaction.commit().await.inspect_err(|_| { - // if we cannot commit - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - })?; + let status = transaction + .commit() + .await + .inspect_err(|_| { + // if we cannot commit - for now don't return connection to pool + // TODO: get a query status from the error + discard.discard(); + }) + .map_err(SqlOverHttpError::Postgres)?; discard.check_idle(status); json_output } @@ -962,11 +1001,15 @@ impl BatchQueryData { } Err(err) => { info!("rollback"); - let status = transaction.rollback().await.inspect_err(|_| { - // if we cannot rollback - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - })?; + let status = transaction + .rollback() + .await + .inspect_err(|_| { + // if we cannot rollback - for now don't return connection to pool + // TODO: get a query status from the error + discard.discard(); + }) + .map_err(SqlOverHttpError::Postgres)?; discard.check_idle(status); return Err(err); } @@ -979,7 +1022,7 @@ impl BatchQueryData { async fn query_batch( config: &'static HttpConfig, cancel: CancellationToken, - transaction: &Transaction<'_>, + transaction: &mut Transaction<'_>, queries: BatchQueryData, parsed_headers: HttpHeaders, ) -> Result { @@ -1017,7 +1060,7 @@ async fn query_batch( async fn query_to_json( config: &'static HttpConfig, - client: &T, + client: &mut T, 
data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, @@ -1025,7 +1068,12 @@ async fn query_to_json( let query_start = Instant::now(); let query_params = data.params; - let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); + let mut row_stream = std::pin::pin!( + client + .query_raw_txt(&data.query, query_params) + .await + .map_err(SqlOverHttpError::Postgres)? + ); let query_acknowledged = Instant::now(); // Manually drain the stream into a vector to leave row_stream hanging @@ -1033,7 +1081,7 @@ async fn query_to_json( // big. let mut rows: Vec = Vec::new(); while let Some(row) = row_stream.next().await { - let row = row?; + let row = row.map_err(SqlOverHttpError::Postgres)?; *current_size += row.body_len(); rows.push(row); // we don't have a streaming response support yet so this is to prevent OOM @@ -1084,7 +1132,14 @@ async fn query_to_json( "dataTypeModifier": c.type_modifier(), "format": "text", })); - columns.push(client.get_type(c.type_oid()).await?); + + match client.get_type(c.type_oid()).await { + Ok(t) => columns.push(t), + Err(err) => { + tracing::warn!(?err, "unable to query type information"); + return Err(SqlOverHttpError::InternalPostgres(err)); + } + } } let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); @@ -1118,10 +1173,10 @@ enum Discard<'a> { } impl Client { - fn metrics(&self, direction: TrafficDirection, ctx: &RequestContext) -> Arc { + fn metrics(&self, ctx: &RequestContext) -> Arc { match self { - Client::Remote(client) => client.metrics(direction, ctx), - Client::Local(local_client) => local_client.metrics(direction, ctx), + Client::Remote(client) => client.metrics(ctx), + Client::Local(local_client) => local_client.metrics(ctx), } } diff --git a/proxy/src/tls/client_config.rs b/proxy/src/tls/client_config.rs index a2d695aae1..ce873e678e 100644 --- a/proxy/src/tls/client_config.rs +++ b/proxy/src/tls/client_config.rs @@ -1,17 +1,49 @@ +use std::env; +use 
std::io::Cursor; +use std::path::PathBuf; use std::sync::Arc; -use anyhow::bail; +use anyhow::{Context, bail}; use rustls::crypto::ring; -pub(crate) fn load_certs() -> anyhow::Result> { +/// We use an internal certificate authority when establishing a TLS connection with compute. +fn load_internal_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> { + let Some(ca_file) = env::var_os("NEON_INTERNAL_CA_FILE") else { + return Ok(()); + }; + let ca_file = PathBuf::from(ca_file); + + let ca = std::fs::read(&ca_file) + .with_context(|| format!("could not read CA from {}", ca_file.display()))?; + + for cert in rustls_pemfile::certs(&mut Cursor::new(&*ca)) { + store + .add(cert.context("could not parse internal CA certificate")?) + .context("could not parse internal CA certificate")?; + } + + Ok(()) +} + +/// For console redirect proxy, we need to establish a connection to compute via pg-sni-router. +/// pg-sni-router needs TLS and uses a Let's Encrypt signed certificate, so we +/// load certificates from our native store. +fn load_native_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> { let der_certs = rustls_native_certs::load_native_certs(); if !der_certs.errors.is_empty() { bail!("could not parse certificates: {:?}", der_certs.errors); } - let mut store = rustls::RootCertStore::empty(); store.add_parsable_certificates(der_certs.certs); + + Ok(()) +} + +fn load_compute_certs() -> anyhow::Result> { + let mut store = rustls::RootCertStore::empty(); + load_native_certs(&mut store)?; + load_internal_certs(&mut store)?; Ok(Arc::new(store)) } @@ -22,7 +54,7 @@ pub fn compute_client_config_with_root_certs() -> anyhow::Result, + // unfortunate split since we cannot change the ALPN on demand. + // + pub http_config: Arc, + pub pg_config: Arc, pub common_names: HashSet, pub cert_resolver: Arc, } -impl TlsConfig { - pub fn to_server_config(&self) -> Arc { - self.config.clone() - } -} - /// Configure TLS for the main endpoint. 
pub fn configure_tls( key_path: &str, @@ -71,8 +68,15 @@ pub fn configure_tls( config.key_log = Arc::new(rustls::KeyLogFile::new()); } + let mut http_config = config.clone(); + let mut pg_config = config; + + http_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; + pg_config.alpn_protocols = vec![b"postgresql".to_vec()]; + Ok(TlsConfig { - config: Arc::new(config), + http_config: Arc::new(http_config), + pg_config: Arc::new(pg_config), common_names, cert_resolver, }) diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 004d268fa1..2b27dc5c76 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -44,11 +44,17 @@ const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); pub(crate) struct Ids { pub(crate) endpoint_id: EndpointIdInt, pub(crate) branch_id: BranchIdInt, - pub(crate) direction: TrafficDirection, #[serde(with = "none_as_empty_string")] pub(crate) private_link_id: Option, } +#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] +struct Extra { + #[serde(flatten)] + ids: Ids, + direction: TrafficDirection, +} + mod none_as_empty_string { use serde::Deserialize; use smol_str::SmolStr; @@ -76,18 +82,23 @@ pub(crate) enum TrafficDirection { pub(crate) trait MetricCounterRecorder { /// Record that some bytes were sent from the proxy to the client fn record_egress(&self, bytes: u64); + + /// Record that some bytes were sent from the client to the proxy + fn record_ingress(&self, bytes: u64); + /// Record that some connections were opened fn record_connection(&self, count: usize); } trait MetricCounterReporter { - fn get_metrics(&mut self) -> (u64, usize); - fn move_metrics(&self) -> (u64, usize); + fn get_metrics(&mut self) -> MetricsData; + fn move_metrics(&self) -> MetricsData; } #[derive(Debug)] pub(crate) struct MetricCounter { transmitted: AtomicU64, + received: AtomicU64, opened_connections: AtomicUsize, } @@ -97,6 +108,11 @@ impl MetricCounterRecorder for 
MetricCounter { self.transmitted.fetch_add(bytes, Ordering::Relaxed); } + /// Record that some bytes were sent from the proxy to the client + fn record_ingress(&self, bytes: u64) { + self.received.fetch_add(bytes, Ordering::Relaxed); + } + /// Record that some connections were opened fn record_connection(&self, count: usize) { self.opened_connections.fetch_add(count, Ordering::Relaxed); @@ -104,29 +120,43 @@ impl MetricCounterRecorder for MetricCounter { } impl MetricCounterReporter for MetricCounter { - fn get_metrics(&mut self) -> (u64, usize) { - ( - *self.transmitted.get_mut(), - *self.opened_connections.get_mut(), - ) + fn get_metrics(&mut self) -> MetricsData { + MetricsData { + received: *self.received.get_mut(), + transmitted: *self.transmitted.get_mut(), + connections: *self.opened_connections.get_mut(), + } } - fn move_metrics(&self) -> (u64, usize) { - ( - self.transmitted.swap(0, Ordering::Relaxed), - self.opened_connections.swap(0, Ordering::Relaxed), - ) + + fn move_metrics(&self) -> MetricsData { + MetricsData { + received: self.received.swap(0, Ordering::Relaxed), + transmitted: self.transmitted.swap(0, Ordering::Relaxed), + connections: self.opened_connections.swap(0, Ordering::Relaxed), + } } } +struct MetricsData { + transmitted: u64, + received: u64, + connections: usize, +} + +struct BytesSent { + transmitted: u64, + received: u64, +} + trait Clearable { /// extract the value that should be reported - fn should_report(self: &Arc) -> Option; + fn should_report(self: &Arc) -> Option; /// Determine whether the counter should be cleared from the global map. fn should_clear(self: &mut Arc) -> bool; } impl Clearable for C { - fn should_report(self: &Arc) -> Option { + fn should_report(self: &Arc) -> Option { // heuristic to see if the branch is still open // if a clone happens while we are observing, the heuristic will be incorrect. 
// @@ -139,14 +169,21 @@ impl Clearable for C { // (to avoid sending the same metrics twice) // see the relevant discussion on why to do so even if the status is not success: // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 - let (value, opened) = self.move_metrics(); + let MetricsData { + transmitted, + received, + connections, + } = self.move_metrics(); // Our only requirement is that we report in every interval if there was an open connection // if there were no opened connections since, then we don't need to report - if value == 0 && !is_open && opened == 0 { + if transmitted == 0 && received == 0 && !is_open && connections == 0 { None } else { - Some(value) + Some(BytesSent { + transmitted, + received, + }) } } fn should_clear(self: &mut Arc) -> bool { @@ -154,9 +191,13 @@ impl Clearable for C { let Some(counter) = Arc::get_mut(self) else { return false; }; - let (opened, value) = counter.get_metrics(); + let MetricsData { + transmitted, + received, + connections, + } = counter.get_metrics(); // clear if there's no data to report - value == 0 && opened == 0 + transmitted == 0 && received == 0 && connections == 0 } } @@ -178,6 +219,7 @@ impl Metrics { .entry(ids) .or_insert_with(|| { Arc::new(MetricCounter { + received: AtomicU64::new(0), transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), }) @@ -242,10 +284,10 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result( endpoints: &ClashMap, FastHasher>, -) -> Vec<(Ids, u64)> { +) -> Vec<(Ids, BytesSent)> { let mut metrics_to_clear = Vec::new(); - let metrics_to_send: Vec<(Ids, u64)> = endpoints + let metrics_to_send: Vec<(Ids, BytesSent)> = endpoints .iter() .filter_map(|counter| { let key = counter.key().clone(); @@ -271,26 +313,46 @@ fn collect_and_clear_metrics( } fn create_event_chunks<'a>( - metrics_to_send: &'a [(Ids, u64)], + metrics_to_send: &'a [(Ids, BytesSent)], hostname: &'a str, prev: DateTime, now: DateTime, chunk_size: usize, -) -> 
impl Iterator>> + 'a { +) -> impl Iterator>> + 'a { metrics_to_send .chunks(chunk_size) .map(move |chunk| EventChunk { events: chunk .iter() - .map(|(ids, value)| Event { - kind: EventType::Incremental { - start_time: prev, - stop_time: now, - }, - metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname), - value: *value, - extra: ids.clone(), + .flat_map(|(ids, bytes)| { + [ + Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: bytes.transmitted, + extra: Extra { + ids: ids.clone(), + direction: TrafficDirection::Egress, + }, + }, + Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: bytes.received, + extra: Extra { + ids: ids.clone(), + direction: TrafficDirection::Ingress, + }, + }, + ] }) .collect(), }) @@ -350,7 +412,7 @@ fn create_remote_path_prefix(now: DateTime) -> String { async fn upload_main_events_chunked( client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, - chunk: &EventChunk<'_, Event>, + chunk: &EventChunk<'_, Event>, subchunk_size: usize, ) { // Split into smaller chunks to avoid exceeding the max request size @@ -384,7 +446,7 @@ async fn upload_main_events_chunked( async fn upload_backup_events( storage: Option<&GenericRemoteStorage>, - chunk: &EventChunk<'_, Event>, + chunk: &EventChunk<'_, Event>, path_prefix: &str, cancel: &CancellationToken, ) -> anyhow::Result<()> { @@ -461,7 +523,7 @@ mod tests { #[tokio::test] async fn metrics() { - type Report = EventChunk<'static, Event>; + type Report = EventChunk<'static, Event>; let reports: Arc>> = Arc::default(); let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); @@ -533,7 +595,6 @@ mod tests { let counter = metrics.register(Ids { endpoint_id: (&EndpointId::from("e1")).into(), branch_id: 
(&BranchId::from("b1")).into(), - direction: TrafficDirection::Egress, private_link_id: None, }); @@ -551,13 +612,19 @@ mod tests { .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); - assert_eq!(r[0].events.len(), 1); + assert_eq!(r[0].events.len(), 2); assert_eq!(r[0].events[0].value, 0); + assert_eq!(r[0].events[0].extra.direction, TrafficDirection::Egress); + assert_eq!(r[0].events[1].value, 0); + assert_eq!(r[0].events[1].extra.direction, TrafficDirection::Ingress); pushed_chunks.extend(r); // record egress counter.record_egress(1); + // record ingress + counter.record_ingress(2); + // egress should be observered collect_metrics_iteration( &metrics.endpoints, @@ -572,8 +639,11 @@ mod tests { .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); - assert_eq!(r[0].events.len(), 1); + assert_eq!(r[0].events.len(), 2); assert_eq!(r[0].events[0].value, 1); + assert_eq!(r[0].events[0].extra.direction, TrafficDirection::Egress); + assert_eq!(r[0].events[1].value, 2); + assert_eq!(r[0].events[1].extra.direction, TrafficDirection::Ingress); pushed_chunks.extend(r); // release counter diff --git a/pyproject.toml b/pyproject.toml index c6e5073bcd..e009b0773e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ requests = "^2.32.3" pytest-xdist = "^3.3.1" asyncpg = "^0.30.0" aiopg = "^1.4.0" -Jinja2 = "^3.1.5" +Jinja2 = "^3.1.6" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.20241019" boto3 = "^1.34.11" @@ -48,8 +48,8 @@ types-jwcrypto = "^1.5.0.20240925" pyyaml = "^6.0.2" types-pyyaml = "^6.0.12.20240917" testcontainers = "^4.9.0" -# Jsonnet doesn't support Python 3.13 yet -jsonnet = { version = "^0.20.0", markers = "python_version < '3.13'" } +# Install a release candidate of `jsonnet`, as it supports Python 3.13 +jsonnet = "^0.21.0-rc2" [tool.poetry.group.dev.dependencies] mypy = "==1.13.0" diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index c86ac576ad..965aa7504b 
100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "safekeeper" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] @@ -35,8 +35,9 @@ postgres-protocol.workspace = true pprof.workspace = true rand.workspace = true regex.workspace = true -scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } +rustls.workspace = true +scopeguard.workspace = true serde.workspace = true serde_json.workspace = true smallvec.workspace = true @@ -45,10 +46,11 @@ strum_macros.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["fs"] } -tokio-util = { workspace = true } tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-rustls.workspace = true tokio-tar.workspace = true +tokio-util = { workspace = true } tracing.workspace = true url.workspace = true metrics.workspace = true diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 19c6662e74..122630d953 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -4,7 +4,7 @@ use std::io::Write as _; use bytes::BytesMut; use camino_tempfile::tempfile; -use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; +use criterion::{BatchSize, Bencher, Criterion, criterion_group, criterion_main}; use itertools::Itertools as _; use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; use pprof::criterion::{Output, PProfProfiler}; @@ -13,6 +13,7 @@ use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, }; use safekeeper::test_utils::Env; +use safekeeper_api::membership::SafekeeperGeneration as Generation; use tokio::io::AsyncWriteExt as _; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -26,7 +27,7 @@ const GB: usize = 1024 * MB; static GLOBAL: 
tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; // Register benchmarks with Criterion. @@ -88,13 +89,12 @@ fn bench_process_msg(c: &mut Criterion) { let (lsn, record) = walgen.next().expect("endless WAL"); ProposerAcceptorMessage::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }) @@ -160,13 +160,12 @@ fn bench_wal_acceptor(c: &mut Criterion) { .take(n) .map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }) @@ -262,13 +261,12 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { runtime.block_on(async { let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }); diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 5c305769dd..7ae39ef95e 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -3,17 +3,16 @@ //! Partially copied from pageserver client; some parts might be better to be //! united. 
+use std::error::Error as _; + use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; use safekeeper_api::models::{ - PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, TimelineStatus, }; -use std::error::Error as _; -use utils::{ - id::{NodeId, TenantId, TimelineId}, - logging::SecretString, -}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::logging::SecretString; #[derive(Debug, Clone)] pub struct Client { @@ -38,6 +37,10 @@ pub enum Error { #[error("Cancelled")] Cancelled, + + /// Failed to create client. + #[error("create client: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] + CreateClient(reqwest::Error), } pub type Result = std::result::Result; @@ -65,11 +68,7 @@ impl ResponseErrorMessageExt for reqwest::Response { } impl Client { - pub fn new(mgmt_api_endpoint: String, jwt: Option) -> Self { - Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt) - } - - pub fn from_client( + pub fn new( client: reqwest::Client, mgmt_api_endpoint: String, jwt: Option, @@ -97,11 +96,25 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn exclude_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/exclude", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.put(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_timeline( &self, tenant_id: TenantId, timeline_id: TimelineId, - ) -> Result { + ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}", self.mgmt_api_endpoint, tenant_id, timeline_id @@ -110,6 +123,20 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn bump_timeline_term( + &self, + tenant_id: TenantId, + 
timeline_id: TimelineId, + req: &models::TimelineTermBumpRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/term_bump", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn timeline_status( &self, tenant_id: TenantId, @@ -150,6 +177,14 @@ impl Client { self.request(Method::POST, uri, body).await } + async fn put( + &self, + uri: U, + body: B, + ) -> Result { + self.request(Method::PUT, uri, body).await + } + async fn get(&self, uri: U) -> Result { self.request(Method::GET, uri, ()).await } @@ -173,12 +208,10 @@ impl Client { uri: U, body: B, ) -> Result { - let req = self.client.request(method, uri); - let req = if let Some(value) = &self.authorization_header { - req.header(reqwest::header::AUTHORIZATION, value.get_contents()) - } else { - req - }; + let mut req = self.client.request(method, uri); + if let Some(value) = &self.authorization_header { + req = req.header(reqwest::header::AUTHORIZATION, value.get_contents()) + } req.json(&body).send().await.map_err(Error::ReceiveBody) } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 6cc53e0d23..9ca79de179 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -1,52 +1,43 @@ // // Main entry point for the safekeeper executable // -use anyhow::{bail, Context, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use clap::{ArgAction, Parser}; -use futures::future::BoxFuture; -use futures::stream::FuturesUnordered; -use futures::{FutureExt, StreamExt}; -use remote_storage::RemoteStorageConfig; -use sd_notify::NotifyState; -use tokio::runtime::Handle; -use tokio::signal::unix::{signal, SignalKind}; -use tokio::task::JoinError; -use utils::logging::SecretString; - -use std::env::{var, VarError}; +use std::env::{VarError, var}; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; use 
std::sync::Arc; use std::time::{Duration, Instant}; -use storage_broker::Uri; - -use tracing::*; -use utils::pid_file; +use anyhow::{Context, Result, bail}; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::{ArgAction, Parser}; +use futures::future::BoxFuture; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; use metrics::set_build_info_metric; +use remote_storage::RemoteStorageConfig; +use reqwest::Certificate; use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, - DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, + DEFAULT_SSL_KEY_FILE, }; -use safekeeper::http; -use safekeeper::wal_service; -use safekeeper::GlobalTimelines; -use safekeeper::SafeKeeperConf; -use safekeeper::{broker, WAL_SERVICE_RUNTIME}; -use safekeeper::{control_file, BROKER_RUNTIME}; -use safekeeper::{wal_backup, HTTP_RUNTIME}; -use storage_broker::DEFAULT_ENDPOINT; +use safekeeper::{ + BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, WAL_SERVICE_RUNTIME, broker, + control_file, http, wal_backup, wal_service, +}; +use sd_notify::NotifyState; +use storage_broker::{DEFAULT_ENDPOINT, Uri}; +use tokio::runtime::Handle; +use tokio::signal::unix::{SignalKind, signal}; +use tokio::task::JoinError; +use tracing::*; use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; -use utils::{ - id::NodeId, - logging::{self, LogFormat}, - project_build_tag, project_git_version, - sentry_init::init_sentry, - tcp_listener, -}; +use utils::id::NodeId; +use utils::logging::{self, LogFormat, SecretString}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version, tcp_listener}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; @@ -55,7 +46,7 @@ 
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; @@ -105,6 +96,9 @@ struct Args { /// Listen http endpoint for management and metrics in the form host:port. #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)] listen_http: String, + /// Listen https endpoint for management and metrics in the form host:port. + #[arg(long, default_value = None)] + listen_https: Option, /// Advertised endpoint for receiving/sending WAL in the form host:port. If not /// specified, listen_pg is used to advertise instead. #[arg(long, default_value = None)] @@ -214,6 +208,15 @@ struct Args { /// and the current position of the reader is smaller than this value. #[arg(long)] max_delta_for_fanout: Option, + /// Path to a file with certificate's private key for https API. + #[arg(long, default_value = DEFAULT_SSL_KEY_FILE)] + ssl_key_file: Utf8PathBuf, + /// Path to a file with a X509 certificate for https API. + #[arg(long, default_value = DEFAULT_SSL_CERT_FILE)] + ssl_cert_file: Utf8PathBuf, + /// Trusted root CA certificate to use in https APIs. + #[arg(long)] + ssl_ca_file: Option, } // Like PathBufValueParser, but allows empty string. @@ -347,12 +350,22 @@ async fn main() -> anyhow::Result<()> { } }; + let ssl_ca_cert = match args.ssl_ca_file.as_ref() { + Some(ssl_ca_file) => { + tracing::info!("Using ssl root CA file: {ssl_ca_file:?}"); + let buf = tokio::fs::read(ssl_ca_file).await?; + Some(Certificate::from_pem(&buf)?) 
+ } + None => None, + }; + let conf = Arc::new(SafeKeeperConf { workdir, my_id: id, listen_pg_addr: args.listen_pg, listen_pg_addr_tenant_only: args.listen_pg_tenant_only, listen_http_addr: args.listen_http, + listen_https_addr: args.listen_https, advertise_pg_addr: args.advertise_pg, availability_zone: args.availability_zone, no_sync: args.no_sync, @@ -379,6 +392,9 @@ async fn main() -> anyhow::Result<()> { eviction_min_resident: args.eviction_min_resident, wal_reader_fanout: args.wal_reader_fanout, max_delta_for_fanout: args.max_delta_for_fanout, + ssl_key_file: args.ssl_key_file, + ssl_cert_file: args.ssl_cert_file, + ssl_ca_cert, }); // initialize sentry if SENTRY_DSN is provided @@ -439,6 +455,17 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { e })?; + let https_listener = match conf.listen_https_addr.as_ref() { + Some(listen_https_addr) => { + info!("starting safekeeper HTTPS service on {}", listen_https_addr); + Some(tcp_listener::bind(listen_https_addr).map_err(|e| { + error!("failed to bind to address {}: {}", listen_https_addr, e); + e + })?) + } + None => None, + }; + let global_timelines = Arc::new(GlobalTimelines::new(conf.clone())); // Register metrics collector for active timelines. 
It's important to do this @@ -512,7 +539,7 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { let http_handle = current_thread_rt .as_ref() .unwrap_or_else(|| HTTP_RUNTIME.handle()) - .spawn(http::task_main( + .spawn(http::task_main_http( conf.clone(), http_listener, global_timelines.clone(), @@ -520,6 +547,19 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { .map(|res| ("HTTP service main".to_owned(), res)); tasks_handles.push(Box::pin(http_handle)); + if let Some(https_listener) = https_listener { + let https_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| HTTP_RUNTIME.handle()) + .spawn(http::task_main_https( + conf.clone(), + https_listener, + global_timelines.clone(), + )) + .map(|res| ("HTTPS service main".to_owned(), res)); + tasks_handles.push(Box::pin(https_handle)); + } + let broker_task_handle = current_thread_rt .as_ref() .unwrap_or_else(|| BROKER_RUNTIME.handle()) diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 4b091e2c29..de6e275124 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,39 +1,25 @@ //! Communication with the broker, providing safekeeper peers and pageserver coordination. 
-use anyhow::anyhow; -use anyhow::bail; -use anyhow::Context; - -use anyhow::Error; -use anyhow::Result; - -use storage_broker::parse_proto_ttid; - -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; -use storage_broker::proto::FilterTenantTimelineId; -use storage_broker::proto::MessageType; -use storage_broker::proto::SafekeeperDiscoveryResponse; -use storage_broker::proto::SubscribeByFilterRequest; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; -use storage_broker::proto::TypeSubscription; -use storage_broker::proto::TypedMessage; -use storage_broker::Request; - -use std::sync::atomic::AtomicU64; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; -use std::time::UNIX_EPOCH; +use std::sync::atomic::AtomicU64; +use std::time::{Duration, Instant, UNIX_EPOCH}; + +use anyhow::{Context, Error, Result, anyhow, bail}; +use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryResponse, SubscribeByFilterRequest, + SubscribeSafekeeperInfoRequest, TypeSubscription, TypedMessage, +}; +use storage_broker::{Request, parse_proto_ttid}; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; -use crate::metrics::BROKER_ITERATION_TIMELINES; -use crate::metrics::BROKER_PULLED_UPDATES; -use crate::metrics::BROKER_PUSHED_UPDATES; -use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use crate::metrics::{ + BROKER_ITERATION_TIMELINES, BROKER_PULLED_UPDATES, BROKER_PUSH_ALL_UPDATES_SECONDS, + BROKER_PUSHED_UPDATES, +}; +use crate::{GlobalTimelines, SafeKeeperConf}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 35aebfd8ad..1bf3e4cac1 100644 --- a/safekeeper/src/control_file.rs +++ 
b/safekeeper/src/control_file.rs @@ -1,24 +1,23 @@ //! Control file serialization, deserialization and persistence. -use anyhow::{bail, ensure, Context, Result}; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use camino::{Utf8Path, Utf8PathBuf}; -use safekeeper_api::membership::INVALID_GENERATION; -use tokio::fs::File; -use tokio::io::AsyncWriteExt; -use utils::crashsafe::durable_rename; - use std::future::Future; use std::io::Read; use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::downgrade_v10_to_v9; -use crate::control_file_upgrade::upgrade_control_file; +use anyhow::{Context, Result, bail, ensure}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use camino::{Utf8Path, Utf8PathBuf}; +use safekeeper_api::membership::INVALID_GENERATION; +use tokio::fs::File; +use tokio::io::AsyncWriteExt; +use utils::bin_ser::LeSer; +use utils::crashsafe::durable_rename; + +use crate::control_file_upgrade::{downgrade_v10_to_v9, upgrade_control_file}; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::{EvictionState, TimelinePersistentState}; -use utils::bin_ser::LeSer; pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 10; @@ -234,11 +233,12 @@ impl Storage for FileStorage { #[cfg(test)] mod test { - use super::*; use safekeeper_api::membership::{Configuration, MemberSet, SafekeeperGeneration}; use tokio::fs; use utils::lsn::Lsn; + use super::*; + const NO_SYNC: bool = true; #[tokio::test] diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 904e79f976..1ad9e62f9b 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,24 +1,19 @@ //! 
Code to deal with safekeeper control file upgrades use std::vec; -use crate::{ - safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}, - state::{EvictionState, TimelinePersistentState}, - wal_backup_partial, -}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use pq_proto::SystemId; -use safekeeper_api::{ - membership::{Configuration, INVALID_GENERATION}, - ServerInfo, Term, -}; +use safekeeper_api::membership::{Configuration, INVALID_GENERATION}; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::*; -use utils::{ - bin_ser::LeSer, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::bin_ser::LeSer; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; + +use crate::safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}; +use crate::state::{EvictionState, TimelinePersistentState}; +use crate::wal_backup_partial; /// Persistent consensus state of the acceptor. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -552,11 +547,11 @@ pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersisten mod tests { use std::str::FromStr; - use utils::{id::NodeId, Hex}; - - use crate::control_file_upgrade::PersistedPeerInfo; + use utils::Hex; + use utils::id::NodeId; use super::*; + use crate::control_file_upgrade::PersistedPeerInfo; #[test] fn roundtrip_v1() { diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 10a761e1f5..11daff22cb 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -1,24 +1,22 @@ -use anyhow::{bail, Result}; +use std::sync::Arc; + +use anyhow::{Result, bail}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; use safekeeper_api::membership::Configuration; -use std::sync::Arc; -use tokio::{ - fs::OpenOptions, - io::{AsyncSeekExt, AsyncWriteExt}, -}; +use tokio::fs::OpenOptions; +use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tracing::{info, warn}; 
-use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; -use crate::{ - control_file::FileStorage, - state::TimelinePersistentState, - timeline::{TimelineError, WalResidentTimeline}, - timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, - wal_backup::copy_s3_segments, - wal_storage::{wal_file_paths, WalReader}, - GlobalTimelines, -}; +use crate::GlobalTimelines; +use crate::control_file::FileStorage; +use crate::state::TimelinePersistentState; +use crate::timeline::{TimelineError, WalResidentTimeline}; +use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; +use crate::wal_backup::copy_s3_segments; +use crate::wal_storage::{WalReader, wal_file_paths}; // we don't want to have more than 10 segments on disk after copy, because they take space const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64; diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 19362a0992..68a38e1498 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -2,37 +2,25 @@ use std::fs; use std::fs::DirEntry; -use std::io::BufReader; -use std::io::Read; +use std::io::{BufReader, Read}; use std::path::PathBuf; use std::sync::Arc; -use anyhow::bail; -use anyhow::Result; -use camino::Utf8Path; -use camino::Utf8PathBuf; +use anyhow::{Result, bail}; +use camino::{Utf8Path, Utf8PathBuf}; use chrono::{DateTime, Utc}; -use postgres_ffi::XLogSegNo; -use postgres_ffi::MAX_SEND_SIZE; -use safekeeper_api::models::WalSenderState; -use serde::Deserialize; -use serde::Serialize; - use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName}; +use postgres_ffi::{MAX_SEND_SIZE, XLogSegNo}; +use safekeeper_api::models::WalSenderState; +use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; -use utils::id::NodeId; -use utils::id::TenantTimelineId; -use utils::id::{TenantId, TimelineId}; +use utils::id::{NodeId, TenantId, TenantTimelineId, 
TimelineId}; use utils::lsn::Lsn; use crate::safekeeper::TermHistory; -use crate::state::TimelineMemState; -use crate::state::TimelinePersistentState; -use crate::timeline::get_timeline_dir; -use crate::timeline::WalResidentTimeline; -use crate::timeline_manager; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use crate::state::{TimelineMemState, TimelinePersistentState}; +use crate::timeline::{WalResidentTimeline, get_timeline_dir}; +use crate::{GlobalTimelines, SafeKeeperConf, timeline_manager}; /// Various filters that influence the resulting JSON output. #[derive(Debug, Serialize, Deserialize, Clone)] diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index e77eeb4130..5ca3d1b7c2 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -1,35 +1,31 @@ //! Part of Safekeeper pretending to be Postgres, i.e. handling Postgres //! protocol commands. +use std::future::Future; +use std::str::{self, FromStr}; +use std::sync::Arc; + use anyhow::Context; use pageserver_api::models::ShardParameters; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; -use safekeeper_api::models::ConnectionId; +use postgres_backend::{PostgresBackend, QueryError}; +use postgres_ffi::PG_TLI; +use pq_proto::{BeMessage, FeStartupPacket, INT4_OID, RowDescriptor, TEXT_OID}; +use regex::Regex; use safekeeper_api::Term; -use std::future::Future; -use std::str::{self, FromStr}; -use std::sync::Arc; +use safekeeper_api::models::ConnectionId; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{debug, info, info_span, Instrument}; +use tracing::{Instrument, debug, info, info_span}; +use utils::auth::{Claims, JwtAuth, Scope}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use utils::postgres_client::PostgresClientProtocol; use utils::shard::{ShardCount, ShardNumber}; use crate::auth::check_permission; -use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; - -use crate::metrics::{TrafficMetrics, 
PG_QUERIES_GAUGE}; +use crate::metrics::{PG_QUERIES_GAUGE, TrafficMetrics}; use crate::timeline::TimelineError; use crate::{GlobalTimelines, SafeKeeperConf}; -use postgres_backend::PostgresBackend; -use postgres_backend::QueryError; -use postgres_ffi::PG_TLI; -use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; -use regex::Regex; -use utils::auth::{Claims, JwtAuth, Scope}; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; /// Safekeeper handler of postgres commands pub struct SafekeeperPostgresHandler { @@ -65,9 +61,6 @@ enum SafekeeperPostgresCommand { }, IdentifySystem, TimelineStatus, - JSONCtrl { - cmd: AppendLogicalMessage, - }, } fn parse_cmd(cmd: &str) -> anyhow::Result { @@ -137,11 +130,6 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { Ok(SafekeeperPostgresCommand::IdentifySystem) } else if cmd.starts_with("TIMELINE_STATUS") { Ok(SafekeeperPostgresCommand::TimelineStatus) - } else if cmd.starts_with("JSON_CTRL") { - let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?; - Ok(SafekeeperPostgresCommand::JSONCtrl { - cmd: serde_json::from_str(cmd)?, - }) } else { anyhow::bail!("unsupported command {cmd}"); } @@ -153,7 +141,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS", SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", - SafekeeperPostgresCommand::JSONCtrl { .. 
} => "JSON_CTRL", } } @@ -362,9 +349,6 @@ impl postgres_backend::Handler } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd).await - } } }) } diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 6e160b7a5e..4908863a4b 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,12 +1,13 @@ pub mod routes; -pub use routes::make_router; - -pub use safekeeper_api::models; use std::sync::Arc; +pub use routes::make_router; +pub use safekeeper_api::models; +use tokio_util::sync::CancellationToken; + use crate::{GlobalTimelines, SafeKeeperConf}; -pub async fn task_main( +pub async fn task_main_http( conf: Arc, http_listener: std::net::TcpListener, global_timelines: Arc, @@ -14,8 +15,37 @@ pub async fn task_main( let router = make_router(conf, global_timelines) .build() .map_err(|err| anyhow::anyhow!(err))?; - let service = http_utils::RouterService::new(router).unwrap(); - let server = hyper::Server::from_tcp(http_listener)?; - server.serve(service).await?; + + let service = Arc::new( + http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow::anyhow!(err))?, + ); + let server = http_utils::server::Server::new(service, http_listener, None)?; + server.serve(CancellationToken::new()).await?; + Ok(()) // unreachable +} + +pub async fn task_main_https( + conf: Arc, + https_listener: std::net::TcpListener, + global_timelines: Arc, +) -> anyhow::Result<()> { + let certs = http_utils::tls_certs::load_cert_chain(&conf.ssl_cert_file)?; + let key = http_utils::tls_certs::load_private_key(&conf.ssl_key_file)?; + + let server_config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_single_cert(certs, key)?; + + let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + + let router = 
make_router(conf, global_timelines) + .build() + .map_err(|err| anyhow::anyhow!(err))?; + + let service = Arc::new( + http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow::anyhow!(err))?, + ); + let server = http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?; + server.serve(CancellationToken::new()).await?; Ok(()) // unreachable } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index cd2ac5f44c..3299d77545 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,51 +1,42 @@ -use http_utils::failpoints::failpoints_handler; -use hyper::{Body, Request, Response, StatusCode}; -use safekeeper_api::models; -use safekeeper_api::models::AcceptorStateStatus; -use safekeeper_api::models::PullTimelineRequest; -use safekeeper_api::models::SafekeeperStatus; -use safekeeper_api::models::TermSwitchApiEntry; -use safekeeper_api::models::TimelineStatus; -use safekeeper_api::ServerInfo; use std::collections::HashMap; use std::fmt; use std::io::Write as _; use std::str::FromStr; use std::sync::Arc; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; + +use http_utils::endpoint::{ + self, ChannelWriter, auth_middleware, check_permission_with, profile_cpu_handler, + profile_heap_handler, prometheus_metrics_handler, request_span, +}; +use http_utils::error::ApiError; +use http_utils::failpoints::failpoints_handler; +use http_utils::json::{json_request, json_response}; +use http_utils::request::{ensure_no_body, parse_query_param, parse_request_param}; +use http_utils::{RequestExt, RouterBuilder}; +use hyper::{Body, Request, Response, StatusCode}; +use postgres_ffi::WAL_SEGMENT_SIZE; +use safekeeper_api::models::{ + AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TermSwitchApiEntry, + TimelineCopyRequest, TimelineCreateRequest, TimelineDeleteResult, TimelineStatus, + TimelineTermBumpRequest, +}; 
+use safekeeper_api::{ServerInfo, membership, models}; +use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; use tokio::sync::mpsc; use tokio::task; use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; - -use http_utils::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, -}; -use http_utils::{ - endpoint::{self, auth_middleware, check_permission_with, ChannelWriter}, - error::ApiError, - json::{json_request, json_response}, - request::{ensure_no_body, parse_query_param, parse_request_param}, - RequestExt, RouterBuilder, -}; - -use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; -use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest}; -use utils::{ - auth::SwappableJwtAuth, - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use tracing::{Instrument, info_span}; +use utils::auth::SwappableJwtAuth; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use crate::debug_dump::TimelineDigestRequest; use crate::safekeeper::TermLsn; -use crate::timelines_global_map::TimelineDeleteForceResult; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; -use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; +use crate::timelines_global_map::DeleteOrExclude; +use crate::{ + GlobalTimelines, SafeKeeperConf, copy_timeline, debug_dump, patch_control_file, pull_timeline, +}; /// Healthcheck handler. 
async fn status_handler(request: Request) -> Result, ApiError> { @@ -83,10 +74,13 @@ async fn tenant_delete_handler(mut request: Request) -> Result) -> Result>(), + .collect::>(), ) } @@ -218,12 +212,15 @@ async fn timeline_delete_handler(mut request: Request) -> Result) -> Result) -> Result for ApiError { + fn from(de: DeleteOrExcludeError) -> ApiError { + match de { + DeleteOrExcludeError::Conflict { + requested: _, + current: _, + } => ApiError::Conflict(de.to_string()), + DeleteOrExcludeError::Other(e) => ApiError::InternalServerError(e), + } + } +} + +/// Remove timeline locally after this node has been excluded from the +/// membership configuration. The body is the same as in the membership endpoint +/// -- conf where node is excluded -- and in principle single ep could be used +/// for both actions, but since this is a data deletion op let's keep them +/// separate. +async fn timeline_exclude_handler(mut request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let global_timelines = get_global_timelines(&request); + let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let my_id = get_conf(&request).my_id; + // If request doesn't exclude us, membership switch endpoint should be used + // instead. + if data.mconf.contains(my_id) { + return Err(ApiError::Forbidden(format!( + "refused to switch into {}, node {} is member of it", + data.mconf, my_id + ))); + } + let action = DeleteOrExclude::Exclude(data.mconf); + + let resp = global_timelines + .delete_or_exclude(&ttid, action) + .await + .map_err(ApiError::from)?; + json_response(StatusCode::OK, resp) +} + /// Consider switching timeline membership configuration to the provided one. 
async fn timeline_membership_handler( mut request: Request, @@ -291,12 +351,29 @@ async fn timeline_membership_handler( let tli = global_timelines.get(ttid).map_err(ApiError::from)?; let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let my_id = get_conf(&request).my_id; + // If request excludes us, exclude endpoint should be used instead. + if !data.mconf.contains(my_id) { + return Err(ApiError::Forbidden(format!( + "refused to switch into {}, node {} is not a member of it", + data.mconf, my_id + ))); + } + let req_gen = data.mconf.generation; let response = tli .membership_switch(data.mconf) .await .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::OK, response) + // Return 409 if request was ignored. + if req_gen == response.current_conf.generation { + json_response(StatusCode::OK, response) + } else { + Err(ApiError::Conflict(format!( + "request to switch into {} ignored, current generation {}", + req_gen, response.current_conf.generation + ))) + } } async fn timeline_copy_handler(mut request: Request) -> Result, ApiError> { @@ -647,11 +724,14 @@ pub fn make_router( .post("/v1/pull_timeline", |r| { request_span(r, timeline_pull_handler) }) + .put("/v1/tenant/:tenant_id/timeline/:timeline_id/exclude", |r| { + request_span(r, timeline_exclude_handler) + }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) - .post( + .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/membership", |r| request_span(r, timeline_membership_handler), ) diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs deleted file mode 100644 index 19e17c4a75..0000000000 --- a/safekeeper/src/json_ctrl.rs +++ /dev/null @@ -1,196 +0,0 @@ -//! -//! This module implements JSON_CTRL protocol, which allows exchange -//! JSON messages over psql for testing purposes. -//! -//! Currently supports AppendLogicalMessage, which is used for WAL -//! 
modifications in tests. -//! - -use anyhow::Context; -use postgres_backend::QueryError; -use safekeeper_api::membership::Configuration; -use safekeeper_api::{ServerInfo, Term}; -use serde::{Deserialize, Serialize}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::*; - -use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; -use crate::safekeeper::{ - AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, -}; -use crate::safekeeper::{TermHistory, TermLsn}; -use crate::state::TimelinePersistentState; -use crate::timeline::WalResidentTimeline; -use postgres_backend::PostgresBackend; -use postgres_ffi::encode_logical_message; -use postgres_ffi::WAL_SEGMENT_SIZE; -use pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; -use utils::lsn::Lsn; - -#[derive(Serialize, Deserialize, Debug)] -pub struct AppendLogicalMessage { - // prefix and message to build LogicalMessage - pub lm_prefix: String, - pub lm_message: String, - - // if true, commit_lsn will match flush_lsn after append - pub set_commit_lsn: bool, - - // if true, ProposerElected will be sent before append - pub send_proposer_elected: bool, - - // fields from AppendRequestHeader - pub term: Term, - #[serde(with = "utils::lsn::serde_as_u64")] - pub epoch_start_lsn: Lsn, - #[serde(with = "utils::lsn::serde_as_u64")] - pub begin_lsn: Lsn, - #[serde(with = "utils::lsn::serde_as_u64")] - pub truncate_lsn: Lsn, - pub pg_version: u32, -} - -#[derive(Debug, Serialize)] -struct AppendResult { - // safekeeper state after append - state: TimelinePersistentState, - // info about new record in the WAL - inserted_wal: InsertedWAL, -} - -/// Handles command to craft logical message WAL record with given -/// content, and then append it with specified term and lsn. This -/// function is used to test safekeepers in different scenarios. 
-pub async fn handle_json_ctrl( - spg: &SafekeeperPostgresHandler, - pgb: &mut PostgresBackend, - append_request: &AppendLogicalMessage, -) -> Result<(), QueryError> { - info!("JSON_CTRL request: {append_request:?}"); - - // need to init safekeeper state before AppendRequest - let tli = prepare_safekeeper(spg, append_request.pg_version).await?; - - // if send_proposer_elected is true, we need to update local history - if append_request.send_proposer_elected { - send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn).await?; - } - - let inserted_wal = append_logical_message(&tli, append_request).await?; - let response = AppendResult { - state: tli.get_state().await.1, - inserted_wal, - }; - let response_data = serde_json::to_vec(&response) - .with_context(|| format!("Response {response:?} is not a json array"))?; - - pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor { - name: b"json", - typoid: TEXT_OID, - typlen: -1, - ..Default::default() - }]))? - .write_message_noflush(&BeMessage::DataRow(&[Some(&response_data)]))? - .write_message_noflush(&BeMessage::CommandComplete(b"JSON_CTRL"))?; - Ok(()) -} - -/// Prepare safekeeper to process append requests without crashes, -/// by sending ProposerGreeting with default server.wal_seg_size. 
-async fn prepare_safekeeper( - spg: &SafekeeperPostgresHandler, - pg_version: u32, -) -> anyhow::Result { - let tli = spg - .global_timelines - .create( - spg.ttid, - Configuration::empty(), - ServerInfo { - pg_version, - wal_seg_size: WAL_SEGMENT_SIZE as u32, - system_id: 0, - }, - Lsn::INVALID, - Lsn::INVALID, - ) - .await?; - - tli.wal_residence_guard().await -} - -async fn send_proposer_elected( - tli: &WalResidentTimeline, - term: Term, - lsn: Lsn, -) -> anyhow::Result<()> { - // add new term to existing history - let history = tli.get_state().await.1.acceptor_state.term_history; - let history = history.up_to(lsn.checked_sub(1u64).unwrap()); - let mut history_entries = history.0; - history_entries.push(TermLsn { term, lsn }); - let history = TermHistory(history_entries); - - let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected { - term, - start_streaming_at: lsn, - term_history: history, - timeline_start_lsn: lsn, - }); - - tli.process_msg(&proposer_elected_request).await?; - Ok(()) -} - -#[derive(Debug, Serialize)] -pub struct InsertedWAL { - begin_lsn: Lsn, - pub end_lsn: Lsn, - append_response: AppendResponse, -} - -/// Extend local WAL with new LogicalMessage record. To do that, -/// create AppendRequest with new WAL and pass it to safekeeper. 
-pub async fn append_logical_message( - tli: &WalResidentTimeline, - msg: &AppendLogicalMessage, -) -> anyhow::Result { - let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = tli.get_state().await.1; - - let begin_lsn = msg.begin_lsn; - let end_lsn = begin_lsn + wal_data.len() as u64; - - let commit_lsn = if msg.set_commit_lsn { - end_lsn - } else { - sk_state.commit_lsn - }; - - let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { - h: AppendRequestHeader { - term: msg.term, - term_start_lsn: begin_lsn, - begin_lsn, - end_lsn, - commit_lsn, - truncate_lsn: msg.truncate_lsn, - proposer_uuid: [0u8; 16], - }, - wal_data, - }); - - let response = tli.process_msg(&append_request).await?; - - let append_response = match response { - Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, - _ => anyhow::bail!("not AppendResponse"), - }; - - Ok(InsertedWAL { - begin_lsn, - end_lsn, - append_response, - }) -} diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index e0090c638a..7c81f77e55 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -2,15 +2,17 @@ extern crate hyper0 as hyper; +use std::time::Duration; + use camino::Utf8PathBuf; use once_cell::sync::Lazy; use remote_storage::RemoteStorageConfig; -use tokio::runtime::Runtime; - -use std::time::Duration; +use reqwest::Certificate; use storage_broker::Uri; - -use utils::{auth::SwappableJwtAuth, id::NodeId, logging::SecretString}; +use tokio::runtime::Runtime; +use utils::auth::SwappableJwtAuth; +use utils::id::NodeId; +use utils::logging::SecretString; mod auth; pub mod broker; @@ -20,7 +22,6 @@ pub mod copy_timeline; pub mod debug_dump; pub mod handler; pub mod http; -pub mod json_ctrl; pub mod metrics; pub mod patch_control_file; pub mod pull_timeline; @@ -48,6 +49,7 @@ pub mod test_utils; mod timelines_global_map; use std::sync::Arc; + pub use timelines_global_map::GlobalTimelines; use utils::auth::JwtAuth; @@ -68,6 +70,9 @@ pub mod 
defaults { // before uploading a partial segment, so that in normal operation the eviction can happen // as soon as we have done the partial segment upload. pub const DEFAULT_EVICTION_MIN_RESIDENT: &str = DEFAULT_PARTIAL_BACKUP_TIMEOUT; + + pub const DEFAULT_SSL_KEY_FILE: &str = "server.key"; + pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt"; } #[derive(Debug, Clone)] @@ -83,6 +88,7 @@ pub struct SafeKeeperConf { pub listen_pg_addr: String, pub listen_pg_addr_tenant_only: Option, pub listen_http_addr: String, + pub listen_https_addr: Option, pub advertise_pg_addr: Option, pub availability_zone: Option, pub no_sync: bool, @@ -110,6 +116,9 @@ pub struct SafeKeeperConf { pub eviction_min_resident: Duration, pub wal_reader_fanout: bool, pub max_delta_for_fanout: Option, + pub ssl_key_file: Utf8PathBuf, + pub ssl_cert_file: Utf8PathBuf, + pub ssl_ca_cert: Option, } impl SafeKeeperConf { @@ -126,6 +135,7 @@ impl SafeKeeperConf { listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_pg_addr_tenant_only: None, listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), + listen_https_addr: None, advertise_pg_addr: None, availability_zone: None, remote_storage: None, @@ -154,6 +164,9 @@ impl SafeKeeperConf { eviction_min_resident: Duration::ZERO, wal_reader_fanout: false, max_delta_for_fanout: None, + ssl_key_file: Utf8PathBuf::from(defaults::DEFAULT_SSL_KEY_FILE), + ssl_cert_file: Utf8PathBuf::from(defaults::DEFAULT_SSL_CERT_FILE), + ssl_ca_cert: None, } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 3ea9e3d674..cb21a5f6d2 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -1,30 +1,28 @@ //! Global safekeeper mertics and per-timeline safekeeper metrics. 
-use std::{ - sync::{Arc, RwLock}, - time::{Instant, SystemTime}, -}; +use std::sync::{Arc, RwLock}; +use std::time::{Instant, SystemTime}; use anyhow::Result; use futures::Future; +use metrics::core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}; +use metrics::proto::MetricFamily; use metrics::{ - core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, - pow2_buckets, - proto::MetricFamily, + DISK_FSYNC_SECONDS_BUCKETS, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, + IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, pow2_buckets, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, - IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, + register_int_gauge_vec, }; use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; -use utils::{id::TenantTimelineId, lsn::Lsn, pageserver_feedback::PageserverFeedback}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -use crate::{ - receive_wal::MSG_QUEUE_SIZE, - state::{TimelineMemState, TimelinePersistentState}, - GlobalTimelines, -}; +use crate::GlobalTimelines; +use crate::receive_wal::MSG_QUEUE_SIZE; +use crate::state::{TimelineMemState, TimelinePersistentState}; // Global metrics across all timelines. 
pub static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { diff --git a/safekeeper/src/patch_control_file.rs b/safekeeper/src/patch_control_file.rs index 2136d1b5f7..efdbd9b3d7 100644 --- a/safekeeper/src/patch_control_file.rs +++ b/safekeeper/src/patch_control_file.rs @@ -4,7 +4,8 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; use tracing::info; -use crate::{state::TimelinePersistentState, timeline::Timeline}; +use crate::state::TimelinePersistentState; +use crate::timeline::Timeline; #[derive(Deserialize, Debug, Clone)] pub struct Request { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 4827b73074..dab8142dfb 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,46 +1,39 @@ -use anyhow::{anyhow, bail, Context, Result}; +use std::cmp::min; +use std::io::{self, ErrorKind}; +use std::sync::Arc; + +use anyhow::{Context, Result, anyhow, bail}; use bytes::Bytes; use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; -use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; -use safekeeper_api::{ - models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}, - Term, -}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; +use reqwest::Certificate; +use safekeeper_api::Term; +use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}; use safekeeper_client::mgmt_api; use safekeeper_client::mgmt_api::Client; use serde::Deserialize; -use std::{ - cmp::min, - io::{self, ErrorKind}, - sync::Arc, -}; -use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; +use tokio::fs::OpenOptions; +use tokio::io::AsyncWrite; +use tokio::sync::mpsc; +use tokio::task; use tokio_tar::{Archive, Builder, Header}; -use tokio_util::{ - io::{CopyToBytes, SinkWriter}, - sync::PollSender, -}; +use tokio_util::io::{CopyToBytes, SinkWriter}; +use tokio_util::sync::PollSender; use tracing::{error, info, instrument}; +use 
utils::crashsafe::fsync_async_opt; +use utils::id::{NodeId, TenantTimelineId}; +use utils::logging::SecretString; +use utils::lsn::Lsn; +use utils::pausable_failpoint; -use crate::{ - control_file::CONTROL_FILE_NAME, - debug_dump, - state::{EvictionState, TimelinePersistentState}, - timeline::{Timeline, WalResidentTimeline}, - timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, - wal_backup, - wal_storage::open_wal_file, - GlobalTimelines, -}; -use utils::{ - crashsafe::fsync_async_opt, - id::{NodeId, TenantTimelineId}, - logging::SecretString, - lsn::Lsn, - pausable_failpoint, -}; +use crate::control_file::CONTROL_FILE_NAME; +use crate::state::{EvictionState, TimelinePersistentState}; +use crate::timeline::{Timeline, WalResidentTimeline}; +use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; +use crate::wal_storage::open_wal_file; +use crate::{GlobalTimelines, debug_dump, wal_backup}; /// Stream tar archive of timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] @@ -374,8 +367,13 @@ impl WalResidentTimeline { // change, but as long as older history is strictly part of new that's // fine), but there is no need to do it. 
if bctx.term != term || bctx.last_log_term != last_log_term { - bail!("term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", - bctx.term, bctx.last_log_term, term, last_log_term); + bail!( + "term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", + bctx.term, + bctx.last_log_term, + term, + last_log_term + ); } Ok(()) } @@ -395,6 +393,7 @@ pub struct DebugDumpResponse { pub async fn handle_request( request: PullTimelineRequest, sk_auth_token: Option, + ssl_ca_cert: Option, global_timelines: Arc, ) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( @@ -405,12 +404,18 @@ pub async fn handle_request( bail!("Timeline {} already exists", request.timeline_id); } + let mut http_client = reqwest::Client::builder(); + if let Some(ssl_ca_cert) = ssl_ca_cert { + http_client = http_client.add_root_certificate(ssl_ca_cert); + } + let http_client = http_client.build()?; + let http_hosts = request.http_hosts.clone(); // Figure out statuses of potential donors. 
let responses: Vec> = futures::future::join_all(http_hosts.iter().map(|url| async { - let cclient = Client::new(url.clone(), sk_auth_token.clone()); + let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone()); let info = cclient .timeline_status(request.tenant_id, request.timeline_id) .await?; @@ -440,13 +445,21 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); - pull_timeline(status, safekeeper_host, sk_auth_token, global_timelines).await + pull_timeline( + status, + safekeeper_host, + sk_auth_token, + http_client, + global_timelines, + ) + .await } async fn pull_timeline( status: TimelineStatus, host: String, sk_auth_token: Option, + http_client: reqwest::Client, global_timelines: Arc, ) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); @@ -463,8 +476,7 @@ async fn pull_timeline( let conf = &global_timelines.get_global_config(); let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - - let client = Client::new(host.clone(), sk_auth_token.clone()); + let client = Client::new(http_client, host.clone(), sk_auth_token.clone()); // Request stream with basebackup archive. let bb_resp = client .snapshot(status.tenant_id, status.timeline_id, conf.my_id) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index cb42f6f414..7967acde3f 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -2,35 +2,21 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. 
-use crate::handler::SafekeeperPostgresHandler; -use crate::metrics::{ - WAL_RECEIVERS, WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL, - WAL_RECEIVER_QUEUE_SIZE_TOTAL, -}; -use crate::safekeeper::AcceptorProposerMessage; -use crate::safekeeper::ProposerAcceptorMessage; -use crate::timeline::WalResidentTimeline; -use crate::GlobalTimelines; -use anyhow::{anyhow, Context}; -use bytes::BytesMut; -use parking_lot::MappedMutexGuard; -use parking_lot::Mutex; -use parking_lot::MutexGuard; -use postgres_backend::CopyStreamHandlerEnd; -use postgres_backend::PostgresBackend; -use postgres_backend::PostgresBackendReader; -use postgres_backend::QueryError; -use pq_proto::BeMessage; -use safekeeper_api::membership::Configuration; -use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; -use safekeeper_api::ServerInfo; use std::future; use std::net::SocketAddr; use std::sync::Arc; -use tokio::io::AsyncRead; -use tokio::io::AsyncWrite; + +use anyhow::{Context, anyhow}; +use bytes::BytesMut; +use parking_lot::{MappedMutexGuard, Mutex, MutexGuard}; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; +use pq_proto::BeMessage; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendTimeoutError; -use tokio::sync::mpsc::{channel, Receiver, Sender}; +use tokio::sync::mpsc::{Receiver, Sender, channel}; use tokio::task; use tokio::task::JoinHandle; use tokio::time::{Duration, Instant, MissedTickBehavior}; @@ -39,6 +25,15 @@ use utils::id::TenantTimelineId; use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; +use crate::GlobalTimelines; +use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::{ + WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL, WAL_RECEIVER_QUEUE_SIZE_TOTAL, + 
WAL_RECEIVERS, +}; +use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; +use crate::timeline::WalResidentTimeline; + const DEFAULT_FEEDBACK_CAPACITY: usize = 8; /// Registry of WalReceivers (compute connections). Timeline holds it (wrapped @@ -281,7 +276,7 @@ impl SafekeeperPostgresHandler { tokio::select! { // todo: add read|write .context to these errors r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r, - r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, + r = network_write(pgb, reply_rx, pageserver_feedback_rx, proto_version) => r, _ = timeline_cancel.cancelled() => { return Err(CopyStreamHandlerEnd::Cancelled); } @@ -342,8 +337,8 @@ impl NetworkReader<'_, IO> { let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( - "start handshake with walproposer {} sysid {} timeline {}", - self.peer_addr, greeting.system_id, greeting.tli, + "start handshake with walproposer {} sysid {}", + self.peer_addr, greeting.system_id, ); let server_info = ServerInfo { pg_version: greeting.pg_version, @@ -371,7 +366,7 @@ impl NetworkReader<'_, IO> { _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( "unexpected message {next_msg:?} instead of greeting" - ))) + ))); } }; Ok((tli, next_msg)) @@ -459,6 +454,7 @@ async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver, + proto_version: u32, ) -> Result<(), CopyStreamHandlerEnd> { let mut buf = BytesMut::with_capacity(128); @@ -496,7 +492,7 @@ async fn network_write( }; buf.clear(); - msg.serialize(&mut buf)?; + msg.serialize(&mut buf, proto_version)?; pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; } } diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 35394eb6ed..c2760792b8 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -1,39 +1,36 @@ //! 
This module implements pulling WAL from peer safekeepers if compute can't //! provide it, i.e. safekeeper lags too much. +use std::fmt; +use std::pin::pin; use std::time::SystemTime; -use std::{fmt, pin::pin}; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use futures::StreamExt; use postgres_protocol::message::backend::ReplicationMessage; -use safekeeper_api::models::{PeerInfo, TimelineStatus}; use safekeeper_api::Term; -use tokio::sync::mpsc::{channel, Receiver, Sender}; -use tokio::time::timeout; -use tokio::{ - select, - time::sleep, - time::{self, Duration}, -}; +use safekeeper_api::membership::INVALID_GENERATION; +use safekeeper_api::models::{PeerInfo, TimelineStatus}; +use tokio::select; +use tokio::sync::mpsc::{Receiver, Sender, channel}; +use tokio::time::{self, Duration, sleep, timeout}; use tokio_postgres::replication::ReplicationStream; use tokio_postgres::types::PgLsn; use tracing::*; -use utils::postgres_client::{ConnectionConfigArgs, PostgresClientProtocol}; -use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}; - -use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; -use crate::safekeeper::{AppendRequest, AppendRequestHeader}; -use crate::timeline::WalResidentTimeline; -use crate::{ - receive_wal::MSG_QUEUE_SIZE, - safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, TermHistory, TermLsn, - VoteRequest, - }, - SafeKeeperConf, +use utils::id::NodeId; +use utils::lsn::Lsn; +use utils::postgres_client::{ + ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, }; +use crate::SafeKeeperConf; +use crate::receive_wal::{MSG_QUEUE_SIZE, REPLY_QUEUE_SIZE, WalAcceptor}; +use crate::safekeeper::{ + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, + ProposerElected, TermHistory, TermLsn, VoteRequest, +}; +use crate::timeline::WalResidentTimeline; + /// Entrypoint for per timeline task which always runs, checking whether /// 
recovery for this safekeeper is needed and starting it if so. #[instrument(name = "recovery", skip_all, fields(ttid = %tli.ttid))] @@ -267,7 +264,10 @@ async fn recover( ); // Now understand our term history. - let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term }); + let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: INVALID_GENERATION, + term: donor.term, + }); let vote_response = match tli .process_msg(&vote_request) .await @@ -302,10 +302,10 @@ async fn recover( // truncate WAL locally let pe = ProposerAcceptorMessage::Elected(ProposerElected { + generation: INVALID_GENERATION, term: donor.term, start_streaming_at: last_common_point.lsn, term_history: donor_th, - timeline_start_lsn: Lsn::INVALID, }); // Successful ProposerElected handling always returns None. If term changed, // we'll find out that during the streaming. Note: it is expected to get @@ -351,7 +351,9 @@ async fn recovery_stream( { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { - bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open"); + bail!( + "timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open" + ); } }; trace!("connected to {:?}", donor); @@ -437,13 +439,12 @@ async fn network_io( match msg { ReplicationMessage::XLogData(xlog_data) => { let ar_hdr = AppendRequestHeader { + generation: INVALID_GENERATION, term: donor.term, - term_start_lsn: Lsn::INVALID, // unused begin_lsn: Lsn(xlog_data.wal_start()), end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64, commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it truncate_lsn: Lsn::INVALID, // do not attempt to advance - proposer_uuid: [0; 16], }; let ar = AppendRequest { h: ar_hdr, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index f816f8459a..886cac869d 100644 --- a/safekeeper/src/safekeeper.rs +++ 
b/safekeeper/src/safekeeper.rs @@ -1,35 +1,34 @@ //! Acceptor part of proposer-acceptor consensus algorithm. -use anyhow::{bail, Context, Result}; -use byteorder::{LittleEndian, ReadBytesExt}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; - -use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; -use safekeeper_api::models::HotStandbyFeedback; -use safekeeper_api::Term; -use serde::{Deserialize, Serialize}; -use std::cmp::max; -use std::cmp::min; +use std::cmp::{max, min}; use std::fmt; use std::io::Read; -use storage_broker::proto::SafekeeperTimelineInfo; +use std::str::FromStr; -use tracing::*; - -use crate::control_file; -use crate::metrics::MISC_OPERATION_SECONDS; - -use crate::state::TimelineState; -use crate::wal_storage; +use anyhow::{Context, Result, bail}; +use byteorder::{LittleEndian, ReadBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use postgres_ffi::{MAX_SEND_SIZE, TimeLineID}; use pq_proto::SystemId; -use utils::pageserver_feedback::PageserverFeedback; -use utils::{ - bin_ser::LeSer, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, +use safekeeper_api::membership::{ + INVALID_GENERATION, MemberSet, SafekeeperGeneration as Generation, SafekeeperId, }; +use safekeeper_api::models::HotStandbyFeedback; +use safekeeper_api::{Term, membership}; +use serde::{Deserialize, Serialize}; +use storage_broker::proto::SafekeeperTimelineInfo; +use tracing::*; +use utils::bin_ser::LeSer; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -pub const SK_PROTOCOL_VERSION: u32 = 2; +use crate::metrics::MISC_OPERATION_SECONDS; +use crate::state::TimelineState; +use crate::{control_file, wal_storage}; + +pub const SK_PROTO_VERSION_2: u32 = 2; +pub const SK_PROTO_VERSION_3: u32 = 3; pub const UNKNOWN_SERVER_VERSION: u32 = 0; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] @@ -56,8 +55,28 @@ impl TermHistory { TermHistory(Vec::new()) } - // Parse TermHistory as 
n_entries followed by TermLsn pairs + // Parse TermHistory as n_entries followed by TermLsn pairs in network order. pub fn from_bytes(bytes: &mut Bytes) -> Result { + let n_entries = bytes + .get_u32_f() + .with_context(|| "TermHistory misses len")?; + let mut res = Vec::with_capacity(n_entries as usize); + for i in 0..n_entries { + let term = bytes + .get_u64_f() + .with_context(|| format!("TermHistory pos {} misses term", i))?; + let lsn = bytes + .get_u64_f() + .with_context(|| format!("TermHistory pos {} misses lsn", i))? + .into(); + res.push(TermLsn { term, lsn }) + } + Ok(TermHistory(res)) + } + + // Parse TermHistory as n_entries followed by TermLsn pairs in LE order. + // TODO remove once v2 protocol is fully dropped. + pub fn from_bytes_le(bytes: &mut Bytes) -> Result { if bytes.remaining() < 4 { bail!("TermHistory misses len"); } @@ -197,6 +216,18 @@ impl AcceptorState { /// Initial Proposer -> Acceptor message #[derive(Debug, Deserialize)] pub struct ProposerGreeting { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub mconf: membership::Configuration, + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + pub wal_seg_size: u32, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Deserialize)] +pub struct ProposerGreetingV2 { /// proposer-acceptor protocol version pub protocol_version: u32, /// Postgres server version @@ -213,39 +244,47 @@ pub struct ProposerGreeting { /// (acceptor voted for). #[derive(Debug, Serialize)] pub struct AcceptorGreeting { - term: u64, node_id: NodeId, + mconf: membership::Configuration, + term: u64, } /// Vote request sent from proposer to safekeepers -#[derive(Debug, Deserialize)] +#[derive(Debug)] pub struct VoteRequest { + pub generation: Generation, + pub term: Term, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. 
+#[derive(Debug, Deserialize)] +pub struct VoteRequestV2 { pub term: Term, } /// Vote itself, sent from safekeeper to proposer #[derive(Debug, Serialize)] pub struct VoteResponse { + generation: Generation, // membership conf generation pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date. - vote_given: u64, // fixme u64 due to padding + vote_given: bool, // Safekeeper flush_lsn (end of WAL) + history of term switches allow // proposer to choose the most advanced one. pub flush_lsn: Lsn, truncate_lsn: Lsn, pub term_history: TermHistory, - timeline_start_lsn: Lsn, } /* * Proposer -> Acceptor message announcing proposer is elected and communicating * term history to it. */ -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ProposerElected { + pub generation: Generation, // membership conf generation pub term: Term, pub start_streaming_at: Lsn, pub term_history: TermHistory, - pub timeline_start_lsn: Lsn, } /// Request with WAL message sent from proposer to safekeeper. Along the way it @@ -257,6 +296,22 @@ pub struct AppendRequest { } #[derive(Debug, Clone, Deserialize)] pub struct AppendRequestHeader { + pub generation: Generation, // membership conf generation + // safekeeper's current term; if it is higher than proposer's, the compute is out of date. + pub term: Term, + /// start position of message in WAL + pub begin_lsn: Lsn, + /// end position of message in WAL + pub end_lsn: Lsn, + /// LSN committed by quorum of safekeepers + pub commit_lsn: Lsn, + /// minimal LSN which may be needed by proposer to perform recovery of some safekeeper + pub truncate_lsn: Lsn, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Clone, Deserialize)] +pub struct AppendRequestHeaderV2 { // safekeeper's current term; if it is higher than proposer's, the compute is out of date. 
pub term: Term, // TODO: remove this field from the protocol, it in unused -- LSN of term @@ -277,6 +332,9 @@ pub struct AppendRequestHeader { /// Report safekeeper state to proposer #[derive(Debug, Serialize, Clone)] pub struct AppendResponse { + // Membership conf generation. Not strictly required because on mismatch + // connection is reset, but let's sanity check it. + generation: Generation, // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. pub term: Term, @@ -293,8 +351,9 @@ pub struct AppendResponse { } impl AppendResponse { - fn term_only(term: Term) -> AppendResponse { + fn term_only(generation: Generation, term: Term) -> AppendResponse { AppendResponse { + generation, term, flush_lsn: Lsn(0), commit_lsn: Lsn(0), @@ -315,72 +374,322 @@ pub enum ProposerAcceptorMessage { FlushWAL, } -impl ProposerAcceptorMessage { - /// Parse proposer message. - pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result { - if proto_version != SK_PROTOCOL_VERSION { - bail!( - "incompatible protocol version {}, expected {}", - proto_version, - SK_PROTOCOL_VERSION - ); +/// Augment Bytes with fallible get_uN where N is number of bytes methods. +/// All reads are in network (big endian) order. +trait BytesF { + fn get_u8_f(&mut self) -> Result; + fn get_u16_f(&mut self) -> Result; + fn get_u32_f(&mut self) -> Result; + fn get_u64_f(&mut self) -> Result; +} + +impl BytesF for Bytes { + fn get_u8_f(&mut self) -> Result { + if self.is_empty() { + bail!("no bytes left, expected 1"); } - // xxx using Reader is inefficient but easy to work with bincode - let mut stream = msg_bytes.reader(); - // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is - let tag = stream.read_u64::()? 
as u8 as char; - match tag { - 'g' => { - let msg = ProposerGreeting::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::Greeting(msg)) - } - 'v' => { - let msg = VoteRequest::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::VoteRequest(msg)) - } - 'e' => { - let mut msg_bytes = stream.into_inner(); - if msg_bytes.remaining() < 16 { - bail!("ProposerElected message is not complete"); - } - let term = msg_bytes.get_u64_le(); - let start_streaming_at = msg_bytes.get_u64_le().into(); - let term_history = TermHistory::from_bytes(&mut msg_bytes)?; - if msg_bytes.remaining() < 8 { - bail!("ProposerElected message is not complete"); - } - let timeline_start_lsn = msg_bytes.get_u64_le().into(); - let msg = ProposerElected { - term, - start_streaming_at, - timeline_start_lsn, - term_history, + Ok(self.get_u8()) + } + fn get_u16_f(&mut self) -> Result { + if self.remaining() < 2 { + bail!("no bytes left, expected 2"); + } + Ok(self.get_u16()) + } + fn get_u32_f(&mut self) -> Result { + if self.remaining() < 4 { + bail!("only {} bytes left, expected 4", self.remaining()); + } + Ok(self.get_u32()) + } + fn get_u64_f(&mut self) -> Result { + if self.remaining() < 8 { + bail!("only {} bytes left, expected 8", self.remaining()); + } + Ok(self.get_u64()) + } +} + +impl ProposerAcceptorMessage { + /// Read cstring from Bytes. + fn get_cstr(buf: &mut Bytes) -> Result { + let pos = buf + .iter() + .position(|x| *x == 0) + .ok_or_else(|| anyhow::anyhow!("missing cstring terminator"))?; + let result = buf.split_to(pos); + buf.advance(1); // drop the null terminator + match std::str::from_utf8(&result) { + Ok(s) => Ok(s.to_string()), + Err(e) => bail!("invalid utf8 in cstring: {}", e), + } + } + + /// Read membership::Configuration from Bytes. 
+ fn get_mconf(buf: &mut Bytes) -> Result { + let generation = Generation::new(buf.get_u32_f().with_context(|| "reading generation")?); + let members_len = buf.get_u32_f().with_context(|| "reading members_len")?; + // Main member set must have at least someone in valid configuration. + // Empty conf is allowed until we fully migrate. + if generation != INVALID_GENERATION && members_len == 0 { + bail!("empty members_len"); + } + let mut members = MemberSet::empty(); + for i in 0..members_len { + let id = buf + .get_u64_f() + .with_context(|| format!("reading member {} node_id", i))?; + let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?; + let pg_port = buf + .get_u16_f() + .with_context(|| format!("reading member {} port", i))?; + let sk = SafekeeperId { + id: NodeId(id), + host, + pg_port, + }; + members.add(sk)?; + } + let new_members_len = buf.get_u32_f().with_context(|| "reading new_members_len")?; + // Non joint conf. + if new_members_len == 0 { + Ok(membership::Configuration { + generation, + members, + new_members: None, + }) + } else { + let mut new_members = MemberSet::empty(); + for i in 0..new_members_len { + let id = buf + .get_u64_f() + .with_context(|| format!("reading new member {} node_id", i))?; + let host = Self::get_cstr(buf) + .with_context(|| format!("reading new member {} host", i))?; + let pg_port = buf + .get_u16_f() + .with_context(|| format!("reading new member {} port", i))?; + let sk = SafekeeperId { + id: NodeId(id), + host, + pg_port, }; - Ok(ProposerAcceptorMessage::Elected(msg)) + new_members.add(sk)?; } - 'a' => { - // read header followed by wal data - let hdr = AppendRequestHeader::des_from(&mut stream)?; - let rec_size = hdr - .end_lsn - .checked_sub(hdr.begin_lsn) - .context("begin_lsn > end_lsn in AppendRequest")? 
- .0 as usize; - if rec_size > MAX_SEND_SIZE { - bail!( - "AppendRequest is longer than MAX_SEND_SIZE ({})", - MAX_SEND_SIZE - ); + Ok(membership::Configuration { + generation, + members, + new_members: Some(new_members), + }) + } + } + + /// Parse proposer message. + pub fn parse(mut msg_bytes: Bytes, proto_version: u32) -> Result { + if proto_version == SK_PROTO_VERSION_3 { + if msg_bytes.is_empty() { + bail!("ProposerAcceptorMessage is not complete: missing tag"); + } + let tag = msg_bytes.get_u8_f().with_context(|| { + "ProposerAcceptorMessage is not complete: missing tag".to_string() + })? as char; + match tag { + 'g' => { + let tenant_id_str = + Self::get_cstr(&mut msg_bytes).with_context(|| "reading tenant_id")?; + let tenant_id = TenantId::from_str(&tenant_id_str)?; + let timeline_id_str = + Self::get_cstr(&mut msg_bytes).with_context(|| "reading timeline_id")?; + let timeline_id = TimelineId::from_str(&timeline_id_str)?; + let mconf = Self::get_mconf(&mut msg_bytes)?; + let pg_version = msg_bytes + .get_u32_f() + .with_context(|| "reading pg_version")?; + let system_id = msg_bytes.get_u64_f().with_context(|| "reading system_id")?; + let wal_seg_size = msg_bytes + .get_u32_f() + .with_context(|| "reading wal_seg_size")?; + let g = ProposerGreeting { + tenant_id, + timeline_id, + mconf, + pg_version, + system_id, + wal_seg_size, + }; + Ok(ProposerAcceptorMessage::Greeting(g)) } + 'v' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let v = VoteRequest { generation, term }; + Ok(ProposerAcceptorMessage::VoteRequest(v)) + } + 'e' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let start_streaming_at: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading start_streaming_at")? 
+ .into(); + let term_history = TermHistory::from_bytes(&mut msg_bytes)?; + let msg = ProposerElected { + generation, + term, + start_streaming_at, + term_history, + }; + Ok(ProposerAcceptorMessage::Elected(msg)) + } + 'a' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let begin_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading begin_lsn")? + .into(); + let end_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading end_lsn")? + .into(); + let commit_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading commit_lsn")? + .into(); + let truncate_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading truncate_lsn")? + .into(); + let hdr = AppendRequestHeader { + generation, + term, + begin_lsn, + end_lsn, + commit_lsn, + truncate_lsn, + }; + let rec_size = hdr + .end_lsn + .checked_sub(hdr.begin_lsn) + .context("begin_lsn > end_lsn in AppendRequest")? 
+ .0 as usize; + if rec_size > MAX_SEND_SIZE { + bail!( + "AppendRequest is longer than MAX_SEND_SIZE ({})", + MAX_SEND_SIZE + ); + } + if msg_bytes.remaining() < rec_size { + bail!( + "reading WAL: only {} bytes left, wanted {}", + msg_bytes.remaining(), + rec_size + ); + } + let wal_data = msg_bytes.copy_to_bytes(rec_size); + let msg = AppendRequest { h: hdr, wal_data }; - let mut wal_data_vec: Vec = vec![0; rec_size]; - stream.read_exact(&mut wal_data_vec)?; - let wal_data = Bytes::from(wal_data_vec); - let msg = AppendRequest { h: hdr, wal_data }; - - Ok(ProposerAcceptorMessage::AppendRequest(msg)) + Ok(ProposerAcceptorMessage::AppendRequest(msg)) + } + _ => bail!("unknown proposer-acceptor message tag: {}", tag), } - _ => bail!("unknown proposer-acceptor message tag: {}", tag), + } else if proto_version == SK_PROTO_VERSION_2 { + // xxx using Reader is inefficient but easy to work with bincode + let mut stream = msg_bytes.reader(); + // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is + let tag = stream.read_u64::()? 
as u8 as char; + match tag { + 'g' => { + let msgv2 = ProposerGreetingV2::des_from(&mut stream)?; + let g = ProposerGreeting { + tenant_id: msgv2.tenant_id, + timeline_id: msgv2.timeline_id, + mconf: membership::Configuration { + generation: INVALID_GENERATION, + members: MemberSet::empty(), + new_members: None, + }, + pg_version: msgv2.pg_version, + system_id: msgv2.system_id, + wal_seg_size: msgv2.wal_seg_size, + }; + Ok(ProposerAcceptorMessage::Greeting(g)) + } + 'v' => { + let msg = VoteRequestV2::des_from(&mut stream)?; + let v = VoteRequest { + generation: INVALID_GENERATION, + term: msg.term, + }; + Ok(ProposerAcceptorMessage::VoteRequest(v)) + } + 'e' => { + let mut msg_bytes = stream.into_inner(); + if msg_bytes.remaining() < 16 { + bail!("ProposerElected message is not complete"); + } + let term = msg_bytes.get_u64_le(); + let start_streaming_at = msg_bytes.get_u64_le().into(); + let term_history = TermHistory::from_bytes_le(&mut msg_bytes)?; + if msg_bytes.remaining() < 8 { + bail!("ProposerElected message is not complete"); + } + let _timeline_start_lsn = msg_bytes.get_u64_le(); + let msg = ProposerElected { + generation: INVALID_GENERATION, + term, + start_streaming_at, + term_history, + }; + Ok(ProposerAcceptorMessage::Elected(msg)) + } + 'a' => { + // read header followed by wal data + let hdrv2 = AppendRequestHeaderV2::des_from(&mut stream)?; + let hdr = AppendRequestHeader { + generation: INVALID_GENERATION, + term: hdrv2.term, + begin_lsn: hdrv2.begin_lsn, + end_lsn: hdrv2.end_lsn, + commit_lsn: hdrv2.commit_lsn, + truncate_lsn: hdrv2.truncate_lsn, + }; + let rec_size = hdr + .end_lsn + .checked_sub(hdr.begin_lsn) + .context("begin_lsn > end_lsn in AppendRequest")? 
+ .0 as usize; + if rec_size > MAX_SEND_SIZE { + bail!( + "AppendRequest is longer than MAX_SEND_SIZE ({})", + MAX_SEND_SIZE + ); + } + + let mut wal_data_vec: Vec = vec![0; rec_size]; + stream.read_exact(&mut wal_data_vec)?; + let wal_data = Bytes::from(wal_data_vec); + + let msg = AppendRequest { h: hdr, wal_data }; + + Ok(ProposerAcceptorMessage::AppendRequest(msg)) + } + _ => bail!("unknown proposer-acceptor message tag: {}", tag), + } + } else { + bail!("unsupported protocol version {}", proto_version); } } @@ -394,36 +703,21 @@ impl ProposerAcceptorMessage { // We explicitly list all fields, to draw attention here when new fields are added. let mut size = BASE_SIZE; size += match self { - Self::Greeting(ProposerGreeting { - protocol_version: _, - pg_version: _, - proposer_id: _, - system_id: _, - timeline_id: _, - tenant_id: _, - tli: _, - wal_seg_size: _, - }) => 0, + Self::Greeting(_) => 0, - Self::VoteRequest(VoteRequest { term: _ }) => 0, + Self::VoteRequest(_) => 0, - Self::Elected(ProposerElected { - term: _, - start_streaming_at: _, - term_history: _, - timeline_start_lsn: _, - }) => 0, + Self::Elected(_) => 0, Self::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: _, term: _, - term_start_lsn: _, begin_lsn: _, end_lsn: _, commit_lsn: _, truncate_lsn: _, - proposer_uuid: _, }, wal_data, }) => wal_data.len(), @@ -431,13 +725,12 @@ impl ProposerAcceptorMessage { Self::NoFlushAppendRequest(AppendRequest { h: AppendRequestHeader { + generation: _, term: _, - term_start_lsn: _, begin_lsn: _, end_lsn: _, commit_lsn: _, truncate_lsn: _, - proposer_uuid: _, }, wal_data, }) => wal_data.len(), @@ -458,45 +751,118 @@ pub enum AcceptorProposerMessage { } impl AcceptorProposerMessage { - /// Serialize acceptor -> proposer message. 
- pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { - match self { - AcceptorProposerMessage::Greeting(msg) => { - buf.put_u64_le('g' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.node_id.0); - } - AcceptorProposerMessage::VoteResponse(msg) => { - buf.put_u64_le('v' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.vote_given); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.truncate_lsn.into()); - buf.put_u32_le(msg.term_history.0.len() as u32); - for e in &msg.term_history.0 { - buf.put_u64_le(e.term); - buf.put_u64_le(e.lsn.into()); - } - buf.put_u64_le(msg.timeline_start_lsn.into()); - } - AcceptorProposerMessage::AppendResponse(msg) => { - buf.put_u64_le('a' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.commit_lsn.into()); - buf.put_i64_le(msg.hs_feedback.ts); - buf.put_u64_le(msg.hs_feedback.xmin); - buf.put_u64_le(msg.hs_feedback.catalog_xmin); + fn put_cstr(buf: &mut BytesMut, s: &str) { + buf.put_slice(s.as_bytes()); + buf.put_u8(0); // null terminator + } - // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback - // if it is not present. - if let Some(ref msg) = msg.pageserver_feedback { - msg.serialize(buf); - } - } + /// Serialize membership::Configuration into buf. + fn serialize_mconf(buf: &mut BytesMut, mconf: &membership::Configuration) { + buf.put_u32(mconf.generation.into_inner()); + buf.put_u32(mconf.members.m.len() as u32); + for sk in &mconf.members.m { + buf.put_u64(sk.id.0); + Self::put_cstr(buf, &sk.host); + buf.put_u16(sk.pg_port); } + if let Some(ref new_members) = mconf.new_members { + buf.put_u32(new_members.m.len() as u32); + for sk in &new_members.m { + buf.put_u64(sk.id.0); + Self::put_cstr(buf, &sk.host); + buf.put_u16(sk.pg_port); + } + } else { + buf.put_u32(0); + } + } - Ok(()) + /// Serialize acceptor -> proposer message. 
+ pub fn serialize(&self, buf: &mut BytesMut, proto_version: u32) -> Result<()> { + if proto_version == SK_PROTO_VERSION_3 { + match self { + AcceptorProposerMessage::Greeting(msg) => { + buf.put_u8(b'g'); + buf.put_u64(msg.node_id.0); + Self::serialize_mconf(buf, &msg.mconf); + buf.put_u64(msg.term) + } + AcceptorProposerMessage::VoteResponse(msg) => { + buf.put_u8(b'v'); + buf.put_u32(msg.generation.into_inner()); + buf.put_u64(msg.term); + buf.put_u8(msg.vote_given as u8); + buf.put_u64(msg.flush_lsn.into()); + buf.put_u64(msg.truncate_lsn.into()); + buf.put_u32(msg.term_history.0.len() as u32); + for e in &msg.term_history.0 { + buf.put_u64(e.term); + buf.put_u64(e.lsn.into()); + } + } + AcceptorProposerMessage::AppendResponse(msg) => { + buf.put_u8(b'a'); + buf.put_u32(msg.generation.into_inner()); + buf.put_u64(msg.term); + buf.put_u64(msg.flush_lsn.into()); + buf.put_u64(msg.commit_lsn.into()); + buf.put_i64(msg.hs_feedback.ts); + buf.put_u64(msg.hs_feedback.xmin); + buf.put_u64(msg.hs_feedback.catalog_xmin); + + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. 
+ if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } + } + } + Ok(()) + // TODO remove 3 after converting all msgs + } else if proto_version == SK_PROTO_VERSION_2 { + match self { + AcceptorProposerMessage::Greeting(msg) => { + buf.put_u64_le('g' as u64); + // v2 didn't have mconf and fields were reordered + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.node_id.0); + } + AcceptorProposerMessage::VoteResponse(msg) => { + // v2 didn't have generation, had u64 vote_given and timeline_start_lsn + buf.put_u64_le('v' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.vote_given as u64); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.truncate_lsn.into()); + buf.put_u32_le(msg.term_history.0.len() as u32); + for e in &msg.term_history.0 { + buf.put_u64_le(e.term); + buf.put_u64_le(e.lsn.into()); + } + // removed timeline_start_lsn + buf.put_u64_le(0); + } + AcceptorProposerMessage::AppendResponse(msg) => { + // v2 didn't have generation + buf.put_u64_le('a' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.commit_lsn.into()); + buf.put_i64_le(msg.hs_feedback.ts); + buf.put_u64_le(msg.hs_feedback.xmin); + buf.put_u64_le(msg.hs_feedback.catalog_xmin); + + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. + if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } + } + } + Ok(()) + } else { + bail!("unsupported protocol version {}", proto_version); + } } } @@ -593,14 +959,6 @@ where &mut self, msg: &ProposerGreeting, ) -> Result> { - // Check protocol compatibility - if msg.protocol_version != SK_PROTOCOL_VERSION { - bail!( - "incompatible protocol version {}, expected {}", - msg.protocol_version, - SK_PROTOCOL_VERSION - ); - } /* Postgres major version mismatch is treated as fatal error * because safekeepers parse WAL headers and the format * may change between versions. 
@@ -655,15 +1013,19 @@ where self.state.finish_change(&state).await?; } - info!( - "processed greeting from walproposer {}, sending term {:?}", - msg.proposer_id.map(|b| format!("{:X}", b)).join(""), - self.state.acceptor_state.term - ); - Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { - term: self.state.acceptor_state.term, + // Switch into conf given by proposer conf if it is higher. + self.state.membership_switch(msg.mconf.clone()).await?; + + let apg = AcceptorGreeting { node_id: self.node_id, - }))) + mconf: self.state.mconf.clone(), + term: self.state.acceptor_state.term, + }; + info!( + "processed greeting {:?} from walproposer, sending {:?}", + msg, apg + ); + Ok(Some(AcceptorProposerMessage::Greeting(apg))) } /// Give vote for the given term, if we haven't done that previously. @@ -671,25 +1033,27 @@ where &mut self, msg: &VoteRequest, ) -> Result> { + if self.state.mconf.generation != msg.generation { + bail!( + "refusing {:?} due to generation mismatch: sk generation {}", + msg, + self.state.mconf.generation + ); + } // Once voted, we won't accept data from older proposers; flush // everything we've already received so that new proposer starts - // streaming at end of our WAL, without overlap. Currently we truncate - // WAL at streaming point, so this avoids truncating already committed - // WAL. - // - // TODO: it would be smoother to not truncate committed piece at - // handle_elected instead. Currently not a big deal, as proposer is the - // only source of WAL; with peer2peer recovery it would be more - // important. + // streaming at end of our WAL, without overlap. WAL is truncated at + // streaming point and commit_lsn may be advanced from peers, so this + // also avoids possible spurious attempt to truncate committed WAL. 
self.wal_store.flush_wal().await?; // initialize with refusal let mut resp = VoteResponse { + generation: self.state.mconf.generation, term: self.state.acceptor_state.term, - vote_given: false as u64, + vote_given: false, flush_lsn: self.flush_lsn(), truncate_lsn: self.state.inmem.peer_horizon_lsn, term_history: self.get_term_history(), - timeline_start_lsn: self.state.timeline_start_lsn, }; if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); @@ -698,15 +1062,16 @@ where self.state.finish_change(&state).await?; resp.term = self.state.acceptor_state.term; - resp.vote_given = true as u64; + resp.vote_given = true; } - info!("processed VoteRequest for term {}: {:?}", msg.term, &resp); + info!("processed {:?}: sending {:?}", msg, &resp); Ok(Some(AcceptorProposerMessage::VoteResponse(resp))) } /// Form AppendResponse from current state. fn append_response(&self) -> AppendResponse { let ar = AppendResponse { + generation: self.state.mconf.generation, term: self.state.acceptor_state.term, flush_lsn: self.flush_lsn(), commit_lsn: self.state.commit_lsn, @@ -733,6 +1098,13 @@ where self.get_last_log_term(), self.flush_lsn() ); + if self.state.mconf.generation != msg.generation { + bail!( + "refusing {:?} due to generation mismatch: sk generation {}", + msg, + self.state.mconf.generation + ); + } if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); state.acceptor_state.term = msg.term; @@ -769,9 +1141,14 @@ where // and walproposer recalculates the streaming point. OTOH repeating // error indicates a serious bug. 
if last_common_point.lsn != msg.start_streaming_at { - bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", - last_common_point, msg.start_streaming_at, - self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + bail!( + "refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", + last_common_point, + msg.start_streaming_at, + self.state.acceptor_state.term, + sk_th, + self.flush_lsn(), + msg.term_history, ); } @@ -779,8 +1156,12 @@ where assert!( msg.start_streaming_at >= self.state.inmem.commit_lsn, "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", - msg.start_streaming_at, self.state.inmem.commit_lsn, - self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + msg.start_streaming_at, + self.state.inmem.commit_lsn, + self.state.acceptor_state.term, + sk_th, + self.flush_lsn(), + msg.term_history, ); // Before first WAL write initialize its segment. It makes first segment @@ -805,18 +1186,22 @@ where // Here we learn initial LSN for the first time, set fields // interested in that. - if state.timeline_start_lsn == Lsn(0) { - // Remember point where WAL begins globally. - state.timeline_start_lsn = msg.timeline_start_lsn; - info!( - "setting timeline_start_lsn to {:?}", - state.timeline_start_lsn - ); + if let Some(start_lsn) = msg.term_history.0.first() { + if state.timeline_start_lsn == Lsn(0) { + // Remember point where WAL begins globally. In the future it + // will be initialized immediately on timeline creation. + state.timeline_start_lsn = start_lsn.lsn; + info!( + "setting timeline_start_lsn to {:?}", + state.timeline_start_lsn + ); + } } + if state.peer_horizon_lsn == Lsn(0) { // Update peer_horizon_lsn as soon as we know where timeline starts. 
// It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn. - state.peer_horizon_lsn = msg.timeline_start_lsn; + state.peer_horizon_lsn = state.timeline_start_lsn; } if state.local_start_lsn == Lsn(0) { state.local_start_lsn = msg.start_streaming_at; @@ -890,13 +1275,29 @@ where msg: &AppendRequest, require_flush: bool, ) -> Result> { + // Refuse message on generation mismatch. On reconnect wp will get full + // configuration from greeting. + if self.state.mconf.generation != msg.h.generation { + bail!( + "refusing append request due to generation mismatch: request {}, sk {}", + msg.h.generation, + self.state.mconf.generation + ); + } + if self.state.acceptor_state.term < msg.h.term { bail!("got AppendRequest before ProposerElected"); } - // If our term is higher, immediately refuse the message. + // If our term is higher, immediately refuse the message. Send term only + // response; elected walproposer can never advance the term, so it will + // figure out the refusal from it -- which is important as term change + // should cause not just reconnection but whole walproposer re-election. if self.state.acceptor_state.term > msg.h.term { - let resp = AppendResponse::term_only(self.state.acceptor_state.term); + let resp = AppendResponse::term_only( + self.state.mconf.generation, + self.state.acceptor_state.term, + ); return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } @@ -924,10 +1325,8 @@ where ); } - // Now we know that we are in the same term as the proposer, - // processing the message. - - self.state.inmem.proposer_uuid = msg.h.proposer_uuid; + // Now we know that we are in the same term as the proposer, process the + // message. 
// do the job if !msg.wal_data.is_empty() { @@ -1000,21 +1399,19 @@ where #[cfg(test)] mod tests { - use futures::future::BoxFuture; + use std::ops::Deref; + use std::str::FromStr; + use std::time::{Instant, UNIX_EPOCH}; - use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; - use safekeeper_api::{ - membership::{Configuration, MemberSet, SafekeeperGeneration, SafekeeperId}, - ServerInfo, + use futures::future::BoxFuture; + use postgres_ffi::{WAL_SEGMENT_SIZE, XLogSegNo}; + use safekeeper_api::ServerInfo; + use safekeeper_api::membership::{ + Configuration, MemberSet, SafekeeperGeneration, SafekeeperId, }; use super::*; use crate::state::{EvictionState, TimelinePersistentState}; - use std::{ - ops::Deref, - str::FromStr, - time::{Instant, UNIX_EPOCH}, - }; // fake storage for tests struct InMemoryState { @@ -1096,11 +1493,21 @@ mod tests { let wal_store = DummyWalStore { lsn: Lsn(0) }; let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); + // Vote with generation mismatch should be rejected. 
+ let gen_mismatch_vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: SafekeeperGeneration::new(42), + term: 1, + }); + assert!(sk.process_msg(&gen_mismatch_vote_request).await.is_err()); + // check voting for 1 is ok - let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); + let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: Generation::new(0), + term: 1, + }); let mut vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0), + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given), r => panic!("unexpected response: {:?}", r), } @@ -1115,7 +1522,7 @@ mod tests { // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0), + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(!resp.vote_given), r => panic!("unexpected response: {:?}", r), } } @@ -1130,13 +1537,12 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { + generation: Generation::new(0), term: 2, - term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }; let mut append_request = AppendRequest { h: ar_hdr.clone(), @@ -1144,6 +1550,7 @@ mod tests { }; let pem = ProposerElected { + generation: Generation::new(0), term: 2, start_streaming_at: Lsn(1), term_history: TermHistory(vec![ @@ -1156,8 +1563,17 @@ mod tests { lsn: Lsn(3), }, ]), - timeline_start_lsn: Lsn(1), }; + + // check that elected msg with generation mismatch is rejected + let mut pem_gen_mismatch = pem.clone(); + pem_gen_mismatch.generation = SafekeeperGeneration::new(42); + assert!( + 
sk.process_msg(&ProposerAcceptorMessage::Elected(pem_gen_mismatch)) + .await + .is_err() + ); + sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); @@ -1191,32 +1607,46 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let pem = ProposerElected { + generation: Generation::new(0), term: 1, start_streaming_at: Lsn(1), term_history: TermHistory(vec![TermLsn { term: 1, lsn: Lsn(1), }]), - timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); let ar_hdr = AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }; let append_request = AppendRequest { h: ar_hdr.clone(), wal_data: Bytes::from_static(b"b"), }; + // check that append request with generation mismatch is rejected + let mut ar_hdr_gen_mismatch = ar_hdr.clone(); + ar_hdr_gen_mismatch.generation = SafekeeperGeneration::new(42); + let append_request_gen_mismatch = AppendRequest { + h: ar_hdr_gen_mismatch, + wal_data: Bytes::from_static(b"b"), + }; + assert!( + sk.process_msg(&ProposerAcceptorMessage::AppendRequest( + append_request_gen_mismatch + )) + .await + .is_err() + ); + // do write ending at 2, it should be ok sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) .await diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index fb06339604..2b1fd7b854 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -3,23 +3,22 @@ use std::fmt::Display; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context}; -use futures::future::Either; +use anyhow::{Context, anyhow}; use futures::StreamExt; +use futures::future::Either; use pageserver_api::shard::ShardIdentity; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; -use 
postgres_ffi::waldecoder::WalDecodeError; -use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder}; +use postgres_ffi::get_current_timestamp; +use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendError; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; -use tracing::{error, info, info_span, Instrument}; +use tracing::{Instrument, error, info, info_span}; use utils::critical; use utils::lsn::Lsn; -use utils::postgres_client::Compression; -use utils::postgres_client::InterpretedFormat; +use utils::postgres_client::{Compression, InterpretedFormat}; use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords}; use wal_decoder::wire_format::ToWireFormat; @@ -100,7 +99,12 @@ struct ShardSenderState { /// State of [`InterpretedWalReader`] visible outside of the task running it. #[derive(Debug)] pub(crate) enum InterpretedWalReaderState { - Running { current_position: Lsn }, + Running { + current_position: Lsn, + /// Tracks the start of the PG WAL LSN from which the current batch of + /// interpreted records originated. 
+ current_batch_wal_start: Option, + }, Done, } @@ -122,14 +126,21 @@ pub enum InterpretedWalReaderError { } enum CurrentPositionUpdate { - Reset(Lsn), + Reset { from: Lsn, to: Lsn }, NotReset(Lsn), } impl CurrentPositionUpdate { fn current_position(&self) -> Lsn { match self { - CurrentPositionUpdate::Reset(lsn) => *lsn, + CurrentPositionUpdate::Reset { from: _, to } => *to, + CurrentPositionUpdate::NotReset(lsn) => *lsn, + } + } + + fn previous_position(&self) -> Lsn { + match self { + CurrentPositionUpdate::Reset { from, to: _ } => *from, CurrentPositionUpdate::NotReset(lsn) => *lsn, } } @@ -145,17 +156,44 @@ impl InterpretedWalReaderState { } } + #[cfg(test)] + fn current_batch_wal_start(&self) -> Option { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => *current_batch_wal_start, + InterpretedWalReaderState::Done => None, + } + } + // Reset the current position of the WAL reader if the requested starting position // of the new shard is smaller than the current value. fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate { match self { InterpretedWalReaderState::Running { - current_position, .. + current_position, + current_batch_wal_start, } => { if new_shard_start_pos < *current_position { + let from = *current_position; *current_position = new_shard_start_pos; - CurrentPositionUpdate::Reset(*current_position) + *current_batch_wal_start = None; + CurrentPositionUpdate::Reset { + from, + to: *current_position, + } } else { + // Edge case: The new shard is at the same current position as + // the reader. Note that the current position is WAL record aligned, + // so the reader might have done some partial reads and updated the + // batch start. If that's the case, adjust the batch start to match + // starting position of the new shard. It can lead to some shards + // seeing overlaps, but in that case the actual record LSNs are checked + // which should be fine based on the filtering logic. 
+ if let Some(start) = current_batch_wal_start { + *start = std::cmp::min(*start, new_shard_start_pos); + } CurrentPositionUpdate::NotReset(*current_position) } } @@ -164,6 +202,47 @@ impl InterpretedWalReaderState { } } } + + fn update_current_batch_wal_start(&mut self, lsn: Lsn) { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => { + if current_batch_wal_start.is_none() { + *current_batch_wal_start = Some(lsn); + } + } + InterpretedWalReaderState::Done => { + panic!("update_current_batch_wal_start called on finished reader") + } + } + } + + fn replace_current_batch_wal_start(&mut self, with: Lsn) -> Lsn { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => current_batch_wal_start.replace(with).unwrap(), + InterpretedWalReaderState::Done => { + panic!("take_current_batch_wal_start called on finished reader") + } + } + } + + fn update_current_position(&mut self, lsn: Lsn) { + match self { + InterpretedWalReaderState::Running { + current_position, .. + } => { + *current_position = lsn; + } + InterpretedWalReaderState::Done => { + panic!("update_current_position called on finished reader") + } + } + } } pub(crate) struct AttachShardNotification { @@ -184,6 +263,7 @@ impl InterpretedWalReader { ) -> InterpretedWalReaderHandle { let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { current_position: start_pos, + current_batch_wal_start: None, })); let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); @@ -217,7 +297,13 @@ impl InterpretedWalReader { reader .run_impl(start_pos) .await - .inspect_err(|err| critical!("failed to read WAL record: {err:?}")) + .inspect_err(|err| match err { + // TODO: we may want to differentiate these errors further. 
+ InterpretedWalReaderError::Decode(_) => { + critical!("failed to decode WAL record: {err:?}"); + } + err => error!("failed to read WAL record: {err}"), + }) } .instrument(info_span!("interpreted wal reader")), ); @@ -237,9 +323,13 @@ impl InterpretedWalReader { tx: tokio::sync::mpsc::Sender, shard: ShardIdentity, pg_version: u32, + shard_notification_rx: Option< + tokio::sync::mpsc::UnboundedReceiver, + >, ) -> InterpretedWalReader { let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { current_position: start_pos, + current_batch_wal_start: None, })); InterpretedWalReader { @@ -252,7 +342,7 @@ impl InterpretedWalReader { next_record_lsn: start_pos, }], )]), - shard_notification_rx: None, + shard_notification_rx, state: state.clone(), pg_version, } @@ -273,10 +363,12 @@ impl InterpretedWalReader { metric.dec(); } - if let Err(err) = self.run_impl(start_pos).await { - critical!("failed to read WAL record: {err:?}"); - } else { - info!("interpreted wal reader exiting"); + match self.run_impl(start_pos).await { + Err(err @ InterpretedWalReaderError::Decode(_)) => { + critical!("failed to decode WAL record: {err:?}"); + } + Err(err) => error!("failed to read WAL record: {err}"), + Ok(()) => info!("interpreted wal reader exiting"), } Err(CopyStreamHandlerEnd::Other(anyhow!( @@ -295,10 +387,6 @@ impl InterpretedWalReader { let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); - // Tracks the start of the PG WAL LSN from which the current batch of - // interpreted records originated. - let mut current_batch_wal_start_lsn: Option = None; - loop { tokio::select! { // Main branch for reading WAL and forwarding it @@ -319,11 +407,7 @@ impl InterpretedWalReader { } }; - // We will already have a value if the previous chunks of WAL - // did not decode into anything useful. 
- if current_batch_wal_start_lsn.is_none() { - current_batch_wal_start_lsn = Some(wal_start_lsn); - } + self.state.write().unwrap().update_current_batch_wal_start(wal_start_lsn); wal_decoder.feed_bytes(&wal); @@ -332,10 +416,12 @@ impl InterpretedWalReader { let shard_ids = self.shard_senders.keys().copied().collect::>(); let mut records_by_sender: HashMap> = HashMap::new(); let mut max_next_record_lsn = None; + let mut max_end_record_lsn = None; while let Some((next_record_lsn, recdata)) = wal_decoder.poll_decode()? { assert!(next_record_lsn.is_aligned()); max_next_record_lsn = Some(next_record_lsn); + max_end_record_lsn = Some(wal_decoder.lsn()); let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, @@ -346,7 +432,10 @@ impl InterpretedWalReader { .with_context(|| "Failed to interpret WAL")?; for (shard, record) in interpreted { - if record.is_empty() { + // Shard zero needs to track the start LSN of the latest record + // in addition to the LSN of the next record to ingest. The former + // is included in basebackup persisted by the compute in WAL. + if !shard.is_shard_zero() && record.is_empty() { continue; } @@ -380,16 +469,11 @@ impl InterpretedWalReader { // Update the current position such that new receivers can decide // whether to attach to us or spawn a new WAL reader. - match &mut *self.state.write().unwrap() { - InterpretedWalReaderState::Running { current_position, .. } => { - *current_position = max_next_record_lsn; - }, - InterpretedWalReaderState::Done => { - unreachable!() - } - } - - let batch_wal_start_lsn = current_batch_wal_start_lsn.take().unwrap(); + let batch_wal_start_lsn = { + let mut guard = self.state.write().unwrap(); + guard.update_current_position(max_next_record_lsn); + guard.replace_current_batch_wal_start(max_end_record_lsn.unwrap()) + }; // Send interpreted records downstream. Anything that has already been seen // by a shard is filtered out. 
@@ -480,7 +564,7 @@ impl InterpretedWalReader { // anything outside the select statement. let position_reset = self.state.write().unwrap().maybe_reset(start_pos); match position_reset { - CurrentPositionUpdate::Reset(to) => { + CurrentPositionUpdate::Reset { from: _, to } => { self.wal_stream.reset(to).await; wal_decoder = WalStreamDecoder::new(to, self.pg_version); }, @@ -488,14 +572,22 @@ impl InterpretedWalReader { }; tracing::info!( - "Added shard sender {} with start_pos={} current_pos={}", - ShardSenderId::new(shard_id, new_sender_id), start_pos, position_reset.current_position() + "Added shard sender {} with start_pos={} previous_pos={} current_pos={}", + ShardSenderId::new(shard_id, new_sender_id), + start_pos, + position_reset.previous_position(), + position_reset.current_position(), ); } } } } } + + #[cfg(test)] + fn state(&self) -> Arc> { + self.state.clone() + } } impl InterpretedWalReaderHandle { @@ -621,22 +713,20 @@ impl InterpretedWalSender<'_, IO> { } #[cfg(test)] mod tests { - use std::{collections::HashMap, str::FromStr, time::Duration}; + use std::collections::HashMap; + use std::str::FromStr; + use std::time::Duration; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; use postgres_ffi::MAX_SEND_SIZE; use tokio::sync::mpsc::error::TryRecvError; - use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, - shard::{ShardCount, ShardNumber}, - }; + use utils::id::{NodeId, TenantTimelineId}; + use utils::lsn::Lsn; + use utils::shard::{ShardCount, ShardNumber}; - use crate::{ - send_interpreted_wal::{Batch, InterpretedWalReader}, - test_utils::Env, - wal_reader_stream::StreamingWalReader, - }; + use crate::send_interpreted_wal::{AttachShardNotification, Batch, InterpretedWalReader}; + use crate::test_utils::Env; + use crate::wal_reader_stream::StreamingWalReader; #[tokio::test] async fn test_interpreted_wal_reader_fanout() { @@ -655,7 +745,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let 
end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, c"neon-file:", None) .await .unwrap(); let end_pos = end_watch.get(); @@ -738,9 +828,11 @@ mod tests { // This test uses logical messages. Those only go to shard 0. Check that the // filtering worked and shard 1 did not get any. - assert!(shard_1_interpreted_records - .iter() - .all(|recs| recs.records.is_empty())); + assert!( + shard_1_interpreted_records + .iter() + .all(|recs| recs.records.is_empty()) + ); // Shard 0 should not receive anything more since the reader is // going through wal that it has already processed. @@ -796,10 +888,16 @@ mod tests { let resident_tli = tli.wal_residence_guard().await.unwrap(); let mut next_record_lsns = Vec::default(); - let end_watch = - Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns)) - .await - .unwrap(); + let end_watch = Env::write_wal( + tli, + start_lsn, + SIZE, + MSG_COUNT, + c"neon-file:", + Some(&mut next_record_lsns), + ) + .await + .unwrap(); let end_pos = end_watch.get(); let streaming_wal_reader = StreamingWalReader::new( @@ -913,4 +1011,218 @@ mod tests { assert_eq!(sender.received_next_record_lsns, expected); } } + + #[tokio::test] + async fn test_batch_start_tracking_on_reset() { + // When the WAL stream is reset to an older LSN, + // the current batch start LSN should be invalidated. + // This test constructs such a scenario: + // 1. Shard 0 is reading somewhere ahead + // 2. Reader reads some WAL, but does not decode a full record (partial read) + // 3. Shard 1 attaches to the reader and resets it to an older LSN + // 4. 
Shard 1 should get the correct batch WAL start LSN + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 64 * 1024; + const MSG_COUNT: usize = 10; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + const WAL_READER_BATCH_SIZE: usize = 8192; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let mut next_record_lsns = Vec::default(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal( + tli, + start_lsn, + SIZE, + MSG_COUNT, + c"neon-file:", + Some(&mut next_record_lsns), + ) + .await + .unwrap(); + + assert!(next_record_lsns.len() > 3); + let shard_0_start_lsn = next_record_lsns[3]; + + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + shard_0_start_lsn, + end_pos, + end_watch, + WAL_READER_BATCH_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let shard_1 = ShardIdentity::new( + ShardNumber(1), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let mut shards = HashMap::new(); + + for shard_number in 0..SHARD_COUNT { + let shard_id = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + shards.insert(shard_id, (Some(tx), Some(rx))); + } + + let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap(); + + let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); + + let reader = InterpretedWalReader::new( + streaming_wal_reader, + shard_0_start_lsn, + shard_0_tx, + shard_0, + PG_VERSION, + Some(shard_notification_rx), + ); + + let reader_state = reader.state(); + let mut 
reader_fut = std::pin::pin!(reader.run(shard_0_start_lsn, &None)); + loop { + let poll = futures::poll!(reader_fut.as_mut()); + assert!(poll.is_pending()); + + let guard = reader_state.read().unwrap(); + if guard.current_batch_wal_start().is_some() { + break; + } + } + + shard_notification_tx + .send(AttachShardNotification { + shard_id: shard_1, + sender: shards.get_mut(&shard_1).unwrap().0.take().unwrap(), + start_pos: start_lsn, + }) + .unwrap(); + + let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap(); + loop { + let poll = futures::poll!(reader_fut.as_mut()); + assert!(poll.is_pending()); + + let try_recv_res = shard_1_rx.try_recv(); + match try_recv_res { + Ok(batch) => { + assert_eq!(batch.records.raw_wal_start_lsn.unwrap(), start_lsn); + break; + } + Err(tokio::sync::mpsc::error::TryRecvError::Empty) => {} + Err(tokio::sync::mpsc::error::TryRecvError::Disconnected) => { + unreachable!(); + } + } + } + } + + #[tokio::test] + async fn test_shard_zero_does_not_skip_empty_records() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 10; + const PG_VERSION: u32 = 17; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let mut next_record_lsns = Vec::new(); + let end_watch = Env::write_wal( + tli, + start_lsn, + SIZE, + MSG_COUNT, + // This is a logical message prefix that is not persisted to key value storage. + // We use it in order to validate that shard zero receives empty interpreted records.
+ c"test:", + Some(&mut next_record_lsns), + ) + .await + .unwrap(); + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let shard = ShardIdentity::unsharded(); + let (records_tx, mut records_rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + + let handle = InterpretedWalReader::spawn( + streaming_wal_reader, + start_lsn, + records_tx, + shard, + PG_VERSION, + &Some("pageserver".to_string()), + ); + + let mut interpreted_records = Vec::new(); + while let Some(batch) = records_rx.recv().await { + interpreted_records.push(batch.records); + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + let received_next_record_lsns = interpreted_records + .into_iter() + .flat_map(|b| b.records) + .map(|rec| rec.next_record_lsn) + .collect::>(); + + // By default this also includes the start LSN. Trim it since it shouldn't be received. + let next_record_lsns = next_record_lsns.into_iter().skip(1).collect::>(); + + assert_eq!(received_next_record_lsns, next_record_lsns); + + handle.abort(); + let mut done = false; + for _ in 0..5 { + if handle.current_position().is_none() { + done = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + assert!(done); + } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 4a4a74a0fd..33e3d0485c 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -1,6 +1,34 @@ //! This module implements the streaming side of replication protocol, starting //! with the "START_REPLICATION" message, and registry of walsenders. 
+use std::cmp::{max, min}; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::{Context as AnyhowContext, bail}; +use bytes::Bytes; +use futures::FutureExt; +use itertools::Itertools; +use parking_lot::Mutex; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; +use postgres_ffi::{MAX_SEND_SIZE, TimestampTz, get_current_timestamp}; +use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; +use safekeeper_api::Term; +use safekeeper_api::models::{ + HotStandbyFeedback, INVALID_FULL_TRANSACTION_ID, ReplicationFeedback, StandbyFeedback, + StandbyReply, +}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::watch::Receiver; +use tokio::time::timeout; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::failpoint_support; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; + use crate::handler::SafekeeperPostgresHandler; use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS}; use crate::receive_wal::WalReceivers; @@ -11,34 +39,6 @@ use crate::send_interpreted_wal::{ use crate::timeline::WalResidentTimeline; use crate::wal_reader_stream::StreamingWalReader; use crate::wal_storage::WalReader; -use anyhow::{bail, Context as AnyhowContext}; -use bytes::Bytes; -use futures::FutureExt; -use parking_lot::Mutex; -use postgres_backend::PostgresBackend; -use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; -use postgres_ffi::get_current_timestamp; -use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; -use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; -use safekeeper_api::models::{ - HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, - INVALID_FULL_TRANSACTION_ID, -}; -use safekeeper_api::Term; -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::failpoint_support; -use utils::pageserver_feedback::PageserverFeedback; -use 
utils::postgres_client::PostgresClientProtocol; - -use itertools::Itertools; -use std::cmp::{max, min}; -use std::net::SocketAddr; -use std::sync::Arc; -use std::time::Duration; -use tokio::sync::watch::Receiver; -use tokio::time::timeout; -use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn}; // See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; @@ -624,8 +624,9 @@ impl SafekeeperPostgresHandler { MAX_SEND_SIZE, ); - let reader = - InterpretedWalReader::new(wal_reader, start_pos, tx, shard, pg_version); + let reader = InterpretedWalReader::new( + wal_reader, start_pos, tx, shard, pg_version, None, + ); let sender = InterpretedWalSender { format, @@ -905,9 +906,9 @@ impl WalSender<'_, IO> { // pageserver to identify WalReceiverError::SuccessfulCompletion, // do not change this string without updating pageserver. return Err(CopyStreamHandlerEnd::ServerInitiated(format!( - "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", - self.appname, self.start_pos, - ))); + "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", + self.appname, self.start_pos, + ))); } } } diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 4d566b12a0..7533005c35 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,28 +1,24 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). 
-use std::{cmp::max, ops::Deref, time::SystemTime}; +use std::cmp::max; +use std::ops::Deref; +use std::time::SystemTime; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::{ - membership::Configuration, - models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}, - ServerInfo, Term, INITIAL_TERM, -}; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}; +use safekeeper_api::{INITIAL_TERM, ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::info; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; -use crate::{ - control_file, - safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}, - timeline::TimelineError, - wal_backup_partial::{self}, -}; +use crate::control_file; +use crate::safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}; +use crate::timeline::TimelineError; +use crate::wal_backup_partial::{self}; /// Persistent information stored on safekeeper node about timeline. /// On disk data is prefixed by magic and format version and followed by checksum. @@ -272,7 +268,7 @@ where // Is switch allowed? 
if to.generation <= self.mconf.generation { info!( - "ignoring request to switch membership conf to lower {}, current conf {}", + "ignoring request to switch membership conf to {}, current conf {}", to, self.mconf ); } else { diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 79ceddd366..618e2b59d2 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,5 +1,13 @@ +use std::ffi::CStr; use std::sync::Arc; +use camino_tempfile::Utf8TempDir; +use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; +use safekeeper_api::membership::SafekeeperGeneration as Generation; +use tokio::fs::create_dir_all; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; + use crate::rate_limit::RateLimiter; use crate::receive_wal::WalAcceptor; use crate::safekeeper::{ @@ -8,15 +16,10 @@ use crate::safekeeper::{ }; use crate::send_wal::EndWatch; use crate::state::{TimelinePersistentState, TimelineState}; -use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; +use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::remote_timeline_path; -use crate::{control_file, receive_wal, wal_storage, SafeKeeperConf}; -use camino_tempfile::Utf8TempDir; -use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; -use tokio::fs::create_dir_all; -use utils::id::{NodeId, TenantTimelineId}; -use utils::lsn::Lsn; +use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage}; /// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. pub struct Env { @@ -73,10 +76,10 @@ impl Env { // Emulate an initial election. 
safekeeper .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected { + generation: Generation::new(0), term: 1, start_streaming_at: start_lsn, term_history: TermHistory(vec![(1, start_lsn).into()]), - timeline_start_lsn: start_lsn, })) .await?; @@ -122,6 +125,7 @@ impl Env { start_lsn: Lsn, msg_size: usize, msg_count: usize, + prefix: &CStr, mut next_record_lsns: Option<&mut Vec>, ) -> anyhow::Result { let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); @@ -131,7 +135,6 @@ impl Env { WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0)); - let prefix = c"neon-file:"; let prefixlen = prefix.to_bytes_with_nul().len(); assert!(msg_size >= prefixlen); let message = vec![0; msg_size - prefixlen]; @@ -146,13 +149,12 @@ impl Env { let req = AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: start_lsn, begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: lsn, truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4341f13824..d3c841ec09 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -1,37 +1,32 @@ //! This module implements Timeline lifecycle management and has all necessary code //! to glue together SafeKeeper and all other background services. 
-use anyhow::{anyhow, bail, Result}; +use std::cmp::max; +use std::ops::{Deref, DerefMut}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::time::Duration; + +use anyhow::{Result, anyhow, bail}; use camino::{Utf8Path, Utf8PathBuf}; +use http_utils::error::ApiError; use remote_storage::RemotePath; +use safekeeper_api::Term; use safekeeper_api::membership::Configuration; use safekeeper_api::models::{ PeerInfo, TimelineMembershipSwitchResponse, TimelineTermBumpResponse, }; -use safekeeper_api::Term; +use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; use tokio::fs::{self}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard, watch}; +use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use utils::id::TenantId; +use tracing::*; +use utils::id::{NodeId, TenantId, TenantTimelineId}; +use utils::lsn::Lsn; use utils::sync::gate::Gate; -use http_utils::error::ApiError; -use std::cmp::max; -use std::ops::{Deref, DerefMut}; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::Arc; -use std::time::Duration; -use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; -use tokio::{sync::watch, time::Instant}; -use tracing::*; -use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, -}; - -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; - -use crate::control_file; +use crate::metrics::{FullTimelineInfo, MISC_OPERATION_SECONDS, WalStorageMetrics}; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; @@ -42,11 +37,8 @@ use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; - -use crate::metrics::{FullTimelineInfo, 
WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; -use crate::SafeKeeperConf; -use crate::{debug_dump, timeline_manager, wal_storage}; +use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage}; fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { PeerInfo { @@ -168,7 +160,7 @@ impl StateSK { pub fn state(&self) -> &TimelineState { match self { StateSK::Loaded(sk) => &sk.state, - StateSK::Offloaded(ref s) => s, + StateSK::Offloaded(s) => s, StateSK::Empty => unreachable!(), } } @@ -176,7 +168,7 @@ impl StateSK { pub fn state_mut(&mut self) -> &mut TimelineState { match self { StateSK::Loaded(sk) => &mut sk.state, - StateSK::Offloaded(ref mut s) => s, + StateSK::Offloaded(s) => s, StateSK::Empty => unreachable!(), } } @@ -423,6 +415,9 @@ impl From for ApiError { } } +/// We run remote deletion in a background task, this is how it sends its results back. +type RemoteDeletionReceiver = tokio::sync::watch::Receiver>>; + /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. /// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { @@ -454,6 +449,8 @@ pub struct Timeline { manager_ctl: ManagerCtl, conf: Arc, + remote_deletion: std::sync::Mutex>, + /// Hold this gate from code that depends on the Timeline's non-shut-down state. While holding /// this gate, you must respect [`Timeline::cancel`] pub(crate) gate: Gate, @@ -502,6 +499,7 @@ impl Timeline { walreceivers, gate: Default::default(), cancel: CancellationToken::default(), + remote_deletion: std::sync::Mutex::new(None), manager_ctl: ManagerCtl::new(), conf, broker_active: AtomicBool::new(false), @@ -566,11 +564,18 @@ impl Timeline { }); } - /// Background timeline activities (which hold Timeline::gate) will no - /// longer run once this function completes. 
- pub async fn shutdown(&self) { + /// Cancel the timeline, requesting background activity to stop. Closing + /// the `self.gate` waits for that. + pub async fn cancel(&self) { info!("timeline {} shutting down", self.ttid); self.cancel.cancel(); + } + + /// Background timeline activities (which hold Timeline::gate) will no + /// longer run once this function completes. `Self::cancel` must have been + /// already called. + pub async fn close(&self) { + assert!(self.cancel.is_cancelled()); // Wait for any concurrent tasks to stop using this timeline, to avoid e.g. attempts // to read deleted files. @@ -582,13 +587,13 @@ impl Timeline { /// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but /// deletion API endpoint is retriable. /// - /// Timeline must be in shut-down state (i.e. call [`Self::shutdown`] first) + /// Timeline must be in shut-down state (i.e. call [`Self::close`] first) pub async fn delete( &self, shared_state: &mut WriteGuardSharedState<'_>, only_local: bool, ) -> Result { - // Assert that [`Self::shutdown`] was already called + // Assert that [`Self::close`] was already called assert!(self.cancel.is_cancelled()); assert!(self.gate.close_complete()); @@ -599,15 +604,95 @@ impl Timeline { shared_state.sk.close_wal_store(); if !only_local && self.conf.is_wal_backup_enabled() { - // Note: we concurrently delete remote storage data from multiple - // safekeepers. That's ok, s3 replies 200 if object doesn't exist and we - // do some retries anyway. - wal_backup::delete_timeline(&self.ttid).await?; + self.remote_delete().await?; } let dir_existed = delete_dir(&self.timeline_dir).await?; Ok(dir_existed) } + /// Delete timeline content from remote storage. If the returned future is dropped, + /// deletion will continue in the background. + /// + /// This function ordinarily spawns a task and stashes a result receiver into [`Self::remote_deletion`]. If + /// deletion is already happening, it may simply wait for an existing task's result. 
+ /// + /// Note: we concurrently delete remote storage data from multiple + /// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we + /// do some retries anyway. + async fn remote_delete(&self) -> Result<()> { + // We will start a background task to do the deletion, so that it proceeds even if our + // API request is dropped. Future requests will see the existing deletion task and wait + // for it to complete. + let mut result_rx = { + let mut remote_deletion_state = self.remote_deletion.lock().unwrap(); + let result_rx = if let Some(result_rx) = remote_deletion_state.as_ref() { + if let Some(result) = result_rx.borrow().as_ref() { + if let Err(e) = result { + // A previous remote deletion failed: we will start a new one + tracing::error!("remote deletion failed, will retry ({e})"); + None + } else { + // A previous remote deletion call already succeeded + return Ok(()); + } + } else { + // Remote deletion is still in flight + Some(result_rx.clone()) + } + } else { + // Remote deletion was not attempted yet, start it now. 
+ None + }; + + match result_rx { + Some(result_rx) => result_rx, + None => self.start_remote_delete(&mut remote_deletion_state), + } + }; + + // Wait for a result + let Ok(result) = result_rx.wait_for(|v| v.is_some()).await else { + // Unexpected: sender should always send a result before dropping the channel, even if it has an error + return Err(anyhow::anyhow!( + "remote deletion task future was dropped without sending a result" + )); + }; + + result + .as_ref() + .expect("We did a wait_for on this being Some above") + .as_ref() + .map(|_| ()) + .map_err(|e| anyhow::anyhow!("remote deletion failed: {e}")) + } + + /// Spawn background task to do remote deletion, return a receiver for its outcome + fn start_remote_delete( + &self, + guard: &mut std::sync::MutexGuard>, + ) -> RemoteDeletionReceiver { + tracing::info!("starting remote deletion"); + let (result_tx, result_rx) = tokio::sync::watch::channel(None); + let ttid = self.ttid; + tokio::task::spawn( + async move { + let r = wal_backup::delete_timeline(&ttid).await; + if let Err(e) = &r { + // Log error here in case nobody ever listens for our result (e.g. dropped API request) + tracing::error!("remote deletion failed: {e}"); + } + + // Ignore send results: it's legal for the Timeline to give up waiting for us. + let _ = result_tx.send(Some(r)); + } + .instrument(info_span!("remote_delete", timeline = %self.ttid)), + ); + + **guard = Some(result_rx.clone()); + + result_rx + } + /// Returns if timeline is cancelled. pub fn is_cancelled(&self) -> bool { self.cancel.is_cancelled() @@ -1114,7 +1199,7 @@ impl ManagerTimeline { } /// Deletes directory and it's contents. Returns false if directory does not exist. 
-async fn delete_dir(path: &Utf8PathBuf) -> Result { +pub async fn delete_dir(path: &Utf8PathBuf) -> Result { match fs::remove_dir_all(path).await { Ok(_) => Ok(true), Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 303421c837..06ccb32d03 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -7,23 +7,19 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; -use tokio::{ - fs::File, - io::{AsyncRead, AsyncWriteExt}, -}; +use tokio::fs::File; +use tokio::io::{AsyncRead, AsyncWriteExt}; use tracing::{debug, info, instrument, warn}; use utils::crashsafe::durable_rename; -use crate::{ - metrics::{ - EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES, - }, - rate_limit::rand_duration, - timeline_manager::{Manager, StateSnapshot}, - wal_backup, - wal_backup_partial::{self, PartialRemoteSegment}, - wal_storage::wal_file_paths, +use crate::metrics::{ + EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, EvictionEvent, NUM_EVICTED_TIMELINES, }; +use crate::rate_limit::rand_duration; +use crate::timeline_manager::{Manager, StateSnapshot}; +use crate::wal_backup; +use crate::wal_backup_partial::{self, PartialRemoteSegment}; +use crate::wal_storage::wal_file_paths; impl Manager { /// Returns true if the timeline is ready for eviction. diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index a33994dcab..71e99a4de7 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -7,41 +7,36 @@ //! Be aware that you need to be extra careful with manager code, because it is not respawned on panic. //! Also, if it will stuck in some branch, it will prevent any further progress in the timeline. 
-use std::{ - sync::{atomic::AtomicUsize, Arc}, - time::Duration, -}; +use std::sync::Arc; +use std::sync::atomic::AtomicUsize; +use std::time::Duration; use futures::channel::oneshot; use postgres_ffi::XLogSegNo; -use safekeeper_api::{models::PeerInfo, Term}; +use safekeeper_api::Term; +use safekeeper_api::models::PeerInfo; use serde::{Deserialize, Serialize}; -use tokio::{ - task::{JoinError, JoinHandle}, - time::Instant, -}; +use tokio::task::{JoinError, JoinHandle}; +use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, instrument, warn, Instrument}; +use tracing::{Instrument, debug, info, info_span, instrument, warn}; use utils::lsn::Lsn; -use crate::{ - control_file::{FileStorage, Storage}, - metrics::{ - MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, - NUM_EVICTED_TIMELINES, - }, - rate_limit::{rand_duration, RateLimiter}, - recovery::recovery_main, - remove_wal::calc_horizon_lsn, - send_wal::WalSenders, - state::TimelineState, - timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}, - timeline_guard::{AccessService, GuardId, ResidenceGuard}, - timelines_set::{TimelineSetGuard, TimelinesSet}, - wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}, - SafeKeeperConf, +use crate::SafeKeeperConf; +use crate::control_file::{FileStorage, Storage}; +use crate::metrics::{ + MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, NUM_EVICTED_TIMELINES, }; +use crate::rate_limit::{RateLimiter, rand_duration}; +use crate::recovery::recovery_main; +use crate::remove_wal::calc_horizon_lsn; +use crate::send_wal::WalSenders; +use crate::state::TimelineState; +use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}; +use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard}; +use crate::timelines_set::{TimelineSetGuard, TimelinesSet}; +use 
crate::wal_backup::{self, WalBackupTaskHandle}; +use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}; pub(crate) struct StateSnapshot { // inmem values diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 1ff6a72bce..41abee369e 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -2,31 +2,32 @@ //! All timelines should always be present in this map, this is done by loading them //! all from the disk on startup and keeping them in memory. -use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; -use crate::rate_limit::RateLimiter; -use crate::state::TimelinePersistentState; -use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; -use crate::timelines_set::TimelinesSet; -use crate::wal_storage::Storage; -use crate::{control_file, wal_storage, SafeKeeperConf}; -use anyhow::{bail, Context, Result}; +use std::collections::HashMap; +use std::str::FromStr; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use anyhow::{Context, Result, bail}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use safekeeper_api::membership::Configuration; -use safekeeper_api::models::SafekeeperUtilization; -use safekeeper_api::ServerInfo; -use serde::Serialize; -use std::collections::HashMap; -use std::str::FromStr; -use std::sync::atomic::Ordering; -use std::sync::{Arc, Mutex}; -use std::time::{Duration, Instant}; +use safekeeper_api::models::{SafekeeperUtilization, TimelineDeleteResult}; +use safekeeper_api::{ServerInfo, membership}; use tokio::fs; use tracing::*; use utils::crashsafe::{durable_rename, fsync_async_opt}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; +use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; +use crate::http::routes::DeleteOrExcludeError; +use crate::rate_limit::RateLimiter; +use crate::state::TimelinePersistentState; +use crate::timeline::{Timeline, TimelineError, 
delete_dir, get_tenant_dir, get_timeline_dir}; +use crate::timelines_set::TimelinesSet; +use crate::wal_storage::Storage; +use crate::{SafeKeeperConf, control_file, wal_storage}; + // Timeline entry in the global map: either a ready timeline, or mark that it is // being created. #[derive(Clone)] @@ -446,23 +447,20 @@ impl GlobalTimelines { .collect() } - /// Cancels timeline, then deletes the corresponding data directory. - /// If only_local, doesn't remove WAL segments in remote storage. - pub(crate) async fn delete( + /// Delete timeline, only locally on this node or globally (also cleaning + /// remote storage WAL), depending on `action` value. + pub(crate) async fn delete_or_exclude( &self, ttid: &TenantTimelineId, - only_local: bool, - ) -> Result { + action: DeleteOrExclude, + ) -> Result { let tli_res = { let state = self.state.lock().unwrap(); if state.tombstones.contains_key(ttid) { // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do. info!("Timeline {ttid} was already deleted"); - return Ok(TimelineDeleteForceResult { - dir_existed: false, - was_active: false, - }); + return Ok(TimelineDeleteResult { dir_existed: false }); } state.get(ttid) @@ -470,32 +468,47 @@ impl GlobalTimelines { let result = match tli_res { Ok(timeline) => { - let was_active = timeline.broker_active.load(Ordering::Relaxed); + info!("deleting timeline {}, action={:?}", ttid, action); - info!("deleting timeline {}, only_local={}", ttid, only_local); - timeline.shutdown().await; + // If node is getting excluded, check the generation first. + // Then, while holding the lock cancel the timeline; it will be + // unusable after this point, and if node is added back first + // deletion must be completed and node seeded anew. + // + // We would like to avoid holding the lock while waiting for the + // gate to finish as this is deadlock prone, so for actual + // deletion will take it second time. 
+ if let DeleteOrExclude::Exclude(ref mconf) = action { + let shared_state = timeline.read_shared_state().await; + if shared_state.sk.state().mconf.generation > mconf.generation { + return Err(DeleteOrExcludeError::Conflict { + requested: mconf.clone(), + current: shared_state.sk.state().mconf.clone(), + }); + } + timeline.cancel().await; + } else { + timeline.cancel().await; + } + + timeline.close().await; info!("timeline {ttid} shut down for deletion"); // Take a lock and finish the deletion holding this mutex. let mut shared_state = timeline.write_shared_state().await; + let only_local = !matches!(action, DeleteOrExclude::Delete); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; - Ok(TimelineDeleteForceResult { - dir_existed, - was_active, // TODO: we probably should remove this field - }) + Ok(TimelineDeleteResult { dir_existed }) } Err(_) => { // Timeline is not memory, but it may still exist on disk in broken state. let dir_path = get_timeline_dir(self.state.lock().unwrap().conf.as_ref(), ttid); - let dir_existed = delete_dir(dir_path)?; + let dir_existed = delete_dir(&dir_path).await?; - Ok(TimelineDeleteForceResult { - dir_existed, - was_active: false, - }) + Ok(TimelineDeleteResult { dir_existed }) } }; @@ -513,11 +526,11 @@ impl GlobalTimelines { /// retry tenant deletion again later. /// /// If only_local, doesn't remove WAL segments in remote storage. 
- pub async fn delete_force_all_for_tenant( + pub async fn delete_all_for_tenant( &self, tenant_id: &TenantId, - only_local: bool, - ) -> Result> { + action: DeleteOrExclude, + ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); let to_delete = self.get_all_for_tenant(*tenant_id); @@ -525,7 +538,7 @@ impl GlobalTimelines { let mut deleted = HashMap::new(); for tli in &to_delete { - match self.delete(&tli.ttid, only_local).await { + match self.delete_or_exclude(&tli.ttid, action.clone()).await { Ok(result) => { deleted.insert(tli.ttid, result); } @@ -539,17 +552,15 @@ impl GlobalTimelines { // If there was an error, return it. if let Some(e) = err { - return Err(e); + return Err(anyhow::Error::from(e)); } // There may be broken timelines on disk, so delete the whole tenant dir as well. // Note that we could concurrently create new timelines while we were deleting them, // so the directory may be not empty. In this case timelines will have bad state // and timeline background jobs can panic. - delete_dir(get_tenant_dir( - self.state.lock().unwrap().conf.as_ref(), - tenant_id, - ))?; + let tenant_dir = get_tenant_dir(self.state.lock().unwrap().conf.as_ref(), tenant_id); + delete_dir(&tenant_dir).await?; Ok(deleted) } @@ -567,19 +578,16 @@ impl GlobalTimelines { } } -#[derive(Clone, Copy, Serialize)] -pub struct TimelineDeleteForceResult { - pub dir_existed: bool, - pub was_active: bool, -} - -/// Deletes directory and it's contents. Returns false if directory does not exist. -fn delete_dir(path: Utf8PathBuf) -> Result { - match std::fs::remove_dir_all(path) { - Ok(_) => Ok(true), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), - Err(e) => Err(e.into()), - } +/// Action for delete_or_exclude. +#[derive(Clone, Debug)] +pub enum DeleteOrExclude { + /// Delete timeline globally. + Delete, + /// Legacy mode until we fully migrate to generations: like exclude deletes + /// timeline only locally, but ignores generation number. 
+ DeleteLocal, + /// This node is getting excluded, delete timeline locally. + Exclude(membership::Configuration), } /// Create temp directory for a new timeline. It needs to be located on the same diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs index 096e348295..1d1abc530f 100644 --- a/safekeeper/src/timelines_set.rs +++ b/safekeeper/src/timelines_set.rs @@ -1,4 +1,5 @@ -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; +use std::sync::Arc; use utils::id::TenantTimelineId; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 2f6b91cf47..56f4a2faf9 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -1,34 +1,29 @@ -use anyhow::{Context, Result}; - -use camino::{Utf8Path, Utf8PathBuf}; -use futures::stream::FuturesOrdered; -use futures::StreamExt; -use safekeeper_api::models::PeerInfo; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; -use utils::backoff; -use utils::id::NodeId; - use std::cmp::min; use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; use std::time::Duration; +use anyhow::{Context, Result}; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use futures::stream::FuturesOrdered; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; -use postgres_ffi::XLogFileName; -use postgres_ffi::{XLogSegNo, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::{ DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata, }; +use safekeeper_api::models::PeerInfo; use tokio::fs::File; - use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::{watch, OnceCell}; +use tokio::sync::{OnceCell, watch}; +use tokio::task::JoinHandle; +use tokio_util::sync::CancellationToken; use tracing::*; - -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; +use utils::{backoff, 
pausable_failpoint}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; use crate::timeline::WalResidentTimeline; @@ -569,6 +564,12 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { // We don't currently have http requests timeout cancellation, but if/once // we have listing should get streaming interface to make progress. + pausable_failpoint!("sk-delete-timeline-remote-pause"); + + fail::fail_point!("sk-delete-timeline-remote", |_| { + Err(anyhow::anyhow!("failpoint: sk-delete-timeline-remote")) + }); + let cancel = CancellationToken::new(); // not really used backoff::retry( || async { diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 5ecb23e8e0..049852a048 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -20,23 +20,23 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. 
use camino::Utf8PathBuf; -use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::RemotePath; use safekeeper_api::Term; use serde::{Deserialize, Serialize}; - use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; -use utils::{id::NodeId, lsn::Lsn}; +use utils::id::NodeId; +use utils::lsn::Lsn; -use crate::{ - metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, - rate_limit::{rand_duration, RateLimiter}, - timeline::WalResidentTimeline, - timeline_manager::StateSnapshot, - wal_backup::{self}, - SafeKeeperConf, +use crate::SafeKeeperConf; +use crate::metrics::{ + MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS, }; +use crate::rate_limit::{RateLimiter, rand_duration}; +use crate::timeline::WalResidentTimeline; +use crate::timeline_manager::StateSnapshot; +use crate::wal_backup::{self}; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index a0dd571a34..aab82fedb5 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -1,14 +1,15 @@ -use std::{ - pin::Pin, - task::{Context, Poll}, -}; +use std::pin::Pin; +use std::task::{Context, Poll}; use bytes::Bytes; -use futures::{stream::BoxStream, Stream, StreamExt}; +use futures::stream::BoxStream; +use futures::{Stream, StreamExt}; +use safekeeper_api::Term; use utils::lsn::Lsn; -use crate::{send_wal::EndWatch, timeline::WalResidentTimeline, wal_storage::WalReader}; -use safekeeper_api::Term; +use crate::send_wal::EndWatch; +use crate::timeline::WalResidentTimeline; +use crate::wal_storage::WalReader; #[derive(PartialEq, Eq, Debug)] pub(crate) struct WalBytes { @@ -224,12 +225,11 @@ mod tests { use futures::StreamExt; use postgres_ffi::MAX_SEND_SIZE; - use utils::{ - id::{NodeId, 
TenantTimelineId}, - lsn::Lsn, - }; + use utils::id::{NodeId, TenantTimelineId}; + use utils::lsn::Lsn; - use crate::{test_utils::Env, wal_reader_stream::StreamingWalReader}; + use crate::test_utils::Env; + use crate::wal_reader_stream::StreamingWalReader; #[tokio::test] async fn test_streaming_wal_reader_reset() { @@ -246,7 +246,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, c"neon-file:", None) .await .unwrap(); let end_pos = end_watch.get(); diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index e5ccbb3230..045fa88cb0 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -2,23 +2,23 @@ //! WAL service listens for client connections and //! receive WAL from wal_proposer and send it to WAL receivers //! -use anyhow::{Context, Result}; -use postgres_backend::QueryError; -use safekeeper_api::models::ConnectionId; +use std::os::fd::AsRawFd; use std::sync::Arc; use std::time::Duration; + +use anyhow::{Context, Result}; +use postgres_backend::{AuthType, PostgresBackend, QueryError}; +use safekeeper_api::models::ConnectionId; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{auth::Scope, measured_stream::MeasuredStream}; - -use std::os::fd::AsRawFd; +use utils::auth::Scope; +use utils::measured_stream::MeasuredStream; +use crate::handler::SafekeeperPostgresHandler; use crate::metrics::TrafficMetrics; -use crate::SafeKeeperConf; -use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines}; -use postgres_backend::{AuthType, PostgresBackend}; +use crate::{GlobalTimelines, SafeKeeperConf}; /// Accept incoming TCP connections and spawn them into a background thread. 
/// diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index e338d70731..f0bac4b40a 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -7,32 +7,32 @@ //! //! Note that last file has `.partial` suffix, that's different from postgres. -use anyhow::{bail, Context, Result}; -use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; -use futures::future::BoxFuture; -use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; -use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI}; -use remote_storage::RemotePath; use std::cmp::{max, min}; use std::future::Future; use std::io::{self, SeekFrom}; use std::pin::Pin; -use tokio::fs::{self, remove_file, File, OpenOptions}; -use tokio::io::{AsyncRead, AsyncWriteExt}; -use tokio::io::{AsyncReadExt, AsyncSeekExt}; + +use anyhow::{Context, Result, bail}; +use bytes::Bytes; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::future::BoxFuture; +use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion}; +use pq_proto::SystemId; +use remote_storage::RemotePath; +use tokio::fs::{self, File, OpenOptions, remove_file}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; use tracing::*; use utils::crashsafe::durable_rename; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; use crate::metrics::{ - time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, + REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure, }; use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; -use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::XLogFileName; -use pq_proto::SystemId; -use utils::{id::TenantTimelineId, lsn::Lsn}; pub trait Storage { // Last written LSN. 
@@ -200,7 +200,12 @@ impl PhysicalStorage { ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); if flush_lsn < state.commit_lsn { - bail!("timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", ttid.timeline_id, flush_lsn, state.commit_lsn); + bail!( + "timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", + ttid.timeline_id, + flush_lsn, + state.commit_lsn + ); } if flush_lsn < state.peer_horizon_lsn { warn!( @@ -569,6 +574,7 @@ impl Storage for PhysicalStorage { } self.pending_wal_truncation = false; + info!("truncated WAL to {}", end_pos); Ok(()) } diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs index 8e5b17a143..8e54d2bb86 100644 --- a/safekeeper/tests/misc_test.rs +++ b/safekeeper/tests/misc_test.rs @@ -3,9 +3,9 @@ use std::sync::Arc; use tracing::{info, warn}; use utils::lsn::Lsn; -use crate::walproposer_sim::{ - log::{init_logger, init_tracing_logger}, - simulation::{generate_network_opts, generate_schedule, Schedule, TestAction, TestConfig}, +use crate::walproposer_sim::log::{init_logger, init_tracing_logger}; +use crate::walproposer_sim::simulation::{ + Schedule, TestAction, TestConfig, generate_network_opts, generate_schedule, }; pub mod walproposer_sim; diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs index 1a932ef699..e29b58836a 100644 --- a/safekeeper/tests/random_test.rs +++ b/safekeeper/tests/random_test.rs @@ -1,11 +1,9 @@ use rand::Rng; use tracing::{info, warn}; -use crate::walproposer_sim::{ - log::{init_logger, init_tracing_logger}, - simulation::{generate_network_opts, generate_schedule, TestConfig}, - simulation_logs::validate_events, -}; +use crate::walproposer_sim::log::{init_logger, init_tracing_logger}; +use crate::walproposer_sim::simulation::{TestConfig, generate_network_opts, generate_schedule}; +use 
crate::walproposer_sim::simulation_logs::validate_events; pub mod walproposer_sim; @@ -18,7 +16,7 @@ fn test_random_schedules() -> anyhow::Result<()> { let mut config = TestConfig::new(Some(clock)); for _ in 0..500 { - let seed: u64 = rand::thread_rng().gen(); + let seed: u64 = rand::thread_rng().r#gen(); config.network = generate_network_opts(seed); let test = config.start(seed); diff --git a/safekeeper/tests/simple_test.rs b/safekeeper/tests/simple_test.rs index 0be9d0deef..f7b266e39c 100644 --- a/safekeeper/tests/simple_test.rs +++ b/safekeeper/tests/simple_test.rs @@ -1,7 +1,8 @@ use tracing::info; use utils::lsn::Lsn; -use crate::walproposer_sim::{log::init_logger, simulation::TestConfig}; +use crate::walproposer_sim::log::init_logger; +use crate::walproposer_sim::simulation::TestConfig; pub mod walproposer_sim; diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs index 870f30de4f..e2ba3282ca 100644 --- a/safekeeper/tests/walproposer_sim/log.rs +++ b/safekeeper/tests/walproposer_sim/log.rs @@ -1,9 +1,11 @@ -use std::{fmt, sync::Arc}; +use std::fmt; +use std::sync::Arc; use desim::time::Timing; use once_cell::sync::OnceCell; use parking_lot::Mutex; -use tracing_subscriber::fmt::{format::Writer, time::FormatTime}; +use tracing_subscriber::fmt::format::Writer; +use tracing_subscriber::fmt::time::FormatTime; /// SimClock can be plugged into tracing logger to print simulation time. #[derive(Clone)] diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 0023a4d22a..0dfdafcc51 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -2,33 +2,30 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. 
-use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use bytes::{Bytes, BytesMut}; use camino::Utf8PathBuf; -use desim::{ - executor::{self, PollSome}, - network::TCP, - node_os::NodeOs, - proto::{AnyMessage, NetEvent, NodeEvent}, -}; +use desim::executor::{self, PollSome}; +use desim::network::TCP; +use desim::node_os::NodeOs; +use desim::proto::{AnyMessage, NetEvent, NodeEvent}; use http::Uri; -use safekeeper::{ - safekeeper::{ - ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION, - }, - state::{TimelinePersistentState, TimelineState}, - timeline::TimelineError, - wal_storage::Storage, - SafeKeeperConf, +use safekeeper::SafeKeeperConf; +use safekeeper::safekeeper::{ + ProposerAcceptorMessage, SK_PROTO_VERSION_3, SafeKeeper, UNKNOWN_SERVER_VERSION, }; -use safekeeper_api::{membership::Configuration, ServerInfo}; +use safekeeper::state::{TimelinePersistentState, TimelineState}; +use safekeeper::timeline::TimelineError; +use safekeeper::wal_storage::Storage; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; use tracing::{debug, info_span, warn}; -use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use super::safekeeper_disk::{DiskStateStorage, DiskWALStorage, SafekeeperDisk, TimelineDisk}; @@ -155,6 +152,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { my_id: NodeId(os.id() as u64), listen_pg_addr: String::new(), listen_http_addr: String::new(), + listen_https_addr: None, no_sync: false, broker_endpoint: "/".parse::().unwrap(), broker_keepalive_interval: Duration::from_secs(0), @@ -182,6 +180,9 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { eviction_min_resident: Duration::ZERO, wal_reader_fanout: false, 
max_delta_for_fanout: None, + ssl_key_file: Utf8PathBuf::from(""), + ssl_cert_file: Utf8PathBuf::from(""), + ssl_ca_cert: None, }; let mut global = GlobalMap::new(disk, conf.clone())?; @@ -287,7 +288,7 @@ impl ConnState { bail!("finished processing START_REPLICATION") } - let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?; + let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTO_VERSION_3)?; debug!("got msg: {:?}", msg); self.process(msg, global) } else { @@ -403,7 +404,7 @@ impl ConnState { // TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn let mut buf = BytesMut::with_capacity(128); - reply.serialize(&mut buf)?; + reply.serialize(&mut buf, SK_PROTO_VERSION_3)?; self.tcp.send(AnyMessage::Bytes(buf.into())); } diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index b854754ecf..94a849b5f0 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -1,22 +1,23 @@ use std::collections::HashMap; +use std::ops::Deref; use std::sync::Arc; - -use parking_lot::Mutex; -use safekeeper::state::TimelinePersistentState; -use utils::id::TenantTimelineId; - -use super::block_storage::BlockStorage; - -use std::{ops::Deref, time::Instant}; +use std::time::Instant; use anyhow::Result; use bytes::{Buf, BytesMut}; use futures::future::BoxFuture; -use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo}; -use safekeeper::{control_file, metrics::WalStorageMetrics, wal_storage}; +use parking_lot::Mutex; +use postgres_ffi::XLogSegNo; +use postgres_ffi::waldecoder::WalStreamDecoder; +use safekeeper::metrics::WalStorageMetrics; +use safekeeper::state::TimelinePersistentState; +use safekeeper::{control_file, wal_storage}; use tracing::{debug, info}; +use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use super::block_storage::BlockStorage; + /// All safekeeper state that is 
usually saved to disk. pub struct SafekeeperDisk { pub timelines: Mutex>>, diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs index fabf450eef..f314143952 100644 --- a/safekeeper/tests/walproposer_sim/simulation.rs +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -1,23 +1,24 @@ -use std::{cell::Cell, str::FromStr, sync::Arc}; +use std::cell::Cell; +use std::str::FromStr; +use std::sync::Arc; -use crate::walproposer_sim::{safekeeper::run_server, walproposer_api::SimulationApi}; -use desim::{ - executor::{self, ExternalHandle}, - node_os::NodeOs, - options::{Delay, NetworkOptions}, - proto::{AnyMessage, NodeEvent}, - world::Node, - world::World, -}; +use desim::executor::{self, ExternalHandle}; +use desim::node_os::NodeOs; +use desim::options::{Delay, NetworkOptions}; +use desim::proto::{AnyMessage, NodeEvent}; +use desim::world::{Node, World}; use rand::{Rng, SeedableRng}; use tracing::{debug, info_span, warn}; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; use walproposer::walproposer::{Config, Wrapper}; -use super::{ - log::SimClock, safekeeper_disk::SafekeeperDisk, walproposer_api, - walproposer_disk::DiskWalProposer, -}; +use super::log::SimClock; +use super::safekeeper_disk::SafekeeperDisk; +use super::walproposer_api; +use super::walproposer_disk::DiskWalProposer; +use crate::walproposer_sim::safekeeper::run_server; +use crate::walproposer_sim::walproposer_api::SimulationApi; /// Simulated safekeeper node. 
pub struct SafekeeperNode { diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 5578c94cf6..82e7a32881 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -1,26 +1,20 @@ -use std::{ - cell::{RefCell, RefMut, UnsafeCell}, - ffi::CStr, - sync::Arc, -}; +use std::cell::{RefCell, RefMut, UnsafeCell}; +use std::ffi::CStr; +use std::sync::Arc; use bytes::Bytes; -use desim::{ - executor::{self, PollSome}, - network::TCP, - node_os::NodeOs, - proto::{AnyMessage, NetEvent, NodeEvent}, - world::NodeId, -}; +use desim::executor::{self, PollSome}; +use desim::network::TCP; +use desim::node_os::NodeOs; +use desim::proto::{AnyMessage, NetEvent, NodeEvent}; +use desim::world::NodeId; use tracing::debug; use utils::lsn::Lsn; -use walproposer::{ - api_bindings::Level, - bindings::{ - NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, - }, - walproposer::{ApiImpl, Config}, +use walproposer::api_bindings::Level; +use walproposer::bindings::{ + NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, }; +use walproposer::walproposer::{ApiImpl, Config}; use super::walproposer_disk::DiskWalProposer; @@ -517,8 +511,7 @@ impl ApiImpl for SimulationApi { // collected quorum with lower term, then got rejected by next connected safekeeper executor::exit(1, msg.to_owned()); } - if msg.contains("collected propEpochStartLsn") && msg.contains(", but basebackup LSN ") - { + if msg.contains("collected propTermStartLsn") && msg.contains(", but basebackup LSN ") { // sync-safekeepers collected wrong quorum, walproposer collected another quorum executor::exit(1, msg.to_owned()); } @@ -535,7 +528,7 @@ impl ApiImpl for SimulationApi { } fn after_election(&self, wp: &mut walproposer::bindings::WalProposer) { - let prop_lsn = wp.propEpochStartLsn; + let prop_lsn = wp.propTermStartLsn; let 
prop_term = wp.propTerm; let mut prev_lsn: u64 = 0; @@ -578,7 +571,9 @@ impl ApiImpl for SimulationApi { let disk_lsn = disk.lock().flush_rec_ptr().0; debug!("start_streaming at {} (disk_lsn={})", startpos, disk_lsn); if startpos < disk_lsn { - debug!("startpos < disk_lsn, it means we wrote some transaction even before streaming started"); + debug!( + "startpos < disk_lsn, it means we wrote some transaction even before streaming started" + ); } assert!(startpos <= disk_lsn); let mut broadcasted = Lsn(startpos); @@ -616,7 +611,7 @@ impl ApiImpl for SimulationApi { sk: &mut walproposer::bindings::Safekeeper, ) -> bool { let mut startpos = wp.truncateLsn; - let endpos = wp.propEpochStartLsn; + let endpos = wp.propTermStartLsn; if startpos == endpos { debug!("recovery_download: nothing to download"); diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index 7dc7f48548..fe3eee8a5a 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -1,4 +1,5 @@ -use std::{ffi::CStr, sync::Arc}; +use std::ffi::CStr; +use std::sync::Arc; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::v16::wal_generator::{LogicalMessageGenerator, WalGenerator}; diff --git a/scripts/generate_image_maps.py b/scripts/generate_image_maps.py deleted file mode 100644 index 915eb33673..0000000000 --- a/scripts/generate_image_maps.py +++ /dev/null @@ -1,61 +0,0 @@ -import itertools -import json -import os - -build_tag = os.environ["BUILD_TAG"] -branch = os.environ["BRANCH"] -dev_acr = os.environ["DEV_ACR"] -prod_acr = os.environ["PROD_ACR"] -dev_aws = os.environ["DEV_AWS"] -prod_aws = os.environ["PROD_AWS"] -aws_region = os.environ["AWS_REGION"] - -components = { - "neon": ["neon"], - "compute": [ - "compute-node-v14", - "compute-node-v15", - "compute-node-v16", - "compute-node-v17", - "vm-compute-node-v14", - "vm-compute-node-v15", - "vm-compute-node-v16", - 
"vm-compute-node-v17", - ], -} - -registries = { - "dev": [ - "docker.io/neondatabase", - f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", - f"{dev_acr}.azurecr.io/neondatabase", - ], - "prod": [ - f"{prod_aws}.dkr.ecr.{aws_region}.amazonaws.com", - f"{prod_acr}.azurecr.io/neondatabase", - ], -} - -outputs: dict[str, dict[str, list[str]]] = {} - -target_tags = [build_tag, "latest"] if branch == "main" else [build_tag] -target_stages = ["dev", "prod"] if branch.startswith("release") else ["dev"] - -for component_name, component_images in components.items(): - for stage in target_stages: - outputs[f"{component_name}-{stage}"] = dict( - [ - ( - f"docker.io/neondatabase/{component_image}:{build_tag}", - [ - f"{combo[0]}/{component_image}:{combo[1]}" - for combo in itertools.product(registries[stage], target_tags) - ], - ) - for component_image in component_images - ] - ) - -with open(os.environ["GITHUB_OUTPUT"], "a") as f: - for key, value in outputs.items(): - f.write(f"{key}={json.dumps(value)}\n") diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 17d4aed63b..e4db9a317d 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_broker" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 1a6fb7fedf..86f2dd9a6c 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -1,18 +1,14 @@ -use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, Instant}; use clap::Parser; - -use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::{ - FilterTenantTimelineId, MessageType, SubscribeByFilterRequest, + FilterTenantTimelineId, MessageType, SafekeeperTimelineInfo, SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, 
TypedMessage, }; - use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT}; use tokio::time; - use tonic::Request; const ABOUT: &str = r#" diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 9d4c22484c..cc33ec20ff 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -10,7 +10,14 @@ //! //! Only safekeeper message is supported, but it is not hard to add something //! else with generics. -use clap::{command, Parser}; +use std::collections::HashMap; +use std::convert::Infallible; +use std::net::SocketAddr; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; + +use clap::{Parser, command}; use futures_core::Stream; use futures_util::StreamExt; use http_body_util::Full; @@ -19,27 +26,10 @@ use hyper::header::CONTENT_TYPE; use hyper::service::service_fn; use hyper::{Method, StatusCode}; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use parking_lot::RwLock; -use std::collections::HashMap; -use std::convert::Infallible; -use std::net::SocketAddr; -use std::pin::Pin; -use std::sync::Arc; -use std::time::Duration; -use tokio::net::TcpListener; -use tokio::sync::broadcast; -use tokio::sync::broadcast::error::RecvError; -use tokio::time; -use tonic::body::{self, empty_body, BoxBody}; -use tonic::codegen::Service; -use tonic::Code; -use tonic::{Request, Response, Status}; -use tracing::*; -use utils::signals::ShutdownSignals; - use metrics::{Encoder, TextEncoder}; +use parking_lot::RwLock; use storage_broker::metrics::{ - BROADCASTED_MESSAGES_TOTAL, BROADCAST_DROPPED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, + BROADCAST_DROPPED_MESSAGES_TOTAL, BROADCASTED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE, PROCESSED_MESSAGES_TOTAL, PUBLISHED_ONEOFF_MESSAGES_TOTAL, }; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; @@ -48,10 +38,19 @@ use storage_broker::proto::{ FilterTenantTimelineId, MessageType, 
SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage, }; -use storage_broker::{parse_proto_ttid, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR}; +use storage_broker::{DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR, parse_proto_ttid}; +use tokio::net::TcpListener; +use tokio::sync::broadcast; +use tokio::sync::broadcast::error::RecvError; +use tokio::time; +use tonic::body::{self, BoxBody, empty_body}; +use tonic::codegen::Service; +use tonic::{Code, Request, Response, Status}; +use tracing::*; use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::sentry_init::init_sentry; +use utils::signals::ShutdownSignals; use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); @@ -743,11 +742,12 @@ async fn main() -> Result<(), Box> { #[cfg(test)] mod tests { - use super::*; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::sync::broadcast::error::TryRecvError; use utils::id::{TenantId, TimelineId}; + use super::*; + fn msg(timeline_id: Vec) -> Message { Message::SafekeeperTimelineInfo(SafekeeperTimelineInfo { safekeeper_id: 1, diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 3ac40f6e14..55d411f607 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -1,12 +1,11 @@ use std::time::Duration; -use tonic::codegen::StdError; -use tonic::transport::{ClientTlsConfig, Endpoint}; -use tonic::{transport::Channel, Status}; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; -use proto::{ - broker_service_client::BrokerServiceClient, TenantTimelineId as ProtoTenantTimelineId, -}; +use proto::TenantTimelineId as ProtoTenantTimelineId; +use proto::broker_service_client::BrokerServiceClient; +use tonic::Status; +use tonic::codegen::StdError; +use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; +use utils::id::{TenantId, TenantTimelineId, 
TimelineId}; // Code generated by protobuf. pub mod proto { @@ -20,11 +19,8 @@ pub mod proto { pub mod metrics; // Re-exports to avoid direct tonic dependency in user crates. -pub use tonic::Code; -pub use tonic::Request; -pub use tonic::Streaming; - pub use hyper::Uri; +pub use tonic::{Code, Request, Streaming}; pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}"); diff --git a/storage_broker/src/metrics.rs b/storage_broker/src/metrics.rs index 1fd3dd5ad6..ecfb594eba 100644 --- a/storage_broker/src/metrics.rs +++ b/storage_broker/src/metrics.rs @@ -1,6 +1,6 @@ //! Broker metrics. -use metrics::{register_int_counter, register_int_gauge, IntCounter, IntGauge}; +use metrics::{IntCounter, IntGauge, register_int_counter, register_int_gauge}; use once_cell::sync::Lazy; pub static NUM_PUBS: Lazy = Lazy::new(|| { diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 8e82996db1..8211bdce62 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_controller" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [[bin]] @@ -19,8 +19,10 @@ bytes.workspace = true chrono.workspace = true clap.workspace = true cron.workspace = true +clashmap.workspace = true fail.workspace = true futures.workspace = true +governor.workspace = true hex.workspace = true hyper0.workspace = true humantime.workspace = true diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs index f8a2790769..7888b18aa7 100644 --- a/storage_controller/client/src/control_api.rs +++ b/storage_controller/client/src/control_api.rs @@ -1,6 +1,7 @@ use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; use reqwest::{Method, Url}; -use serde::{de::DeserializeOwned, Serialize}; +use serde::Serialize; +use serde::de::DeserializeOwned; pub struct 
Client { base_url: Url, diff --git a/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/down.sql b/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/down.sql new file mode 100644 index 0000000000..8f75e8947e --- /dev/null +++ b/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/down.sql @@ -0,0 +1,2 @@ +DROP TABLE timelines; +DROP TABLE safekeeper_timeline_pending_ops; diff --git a/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/up.sql b/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/up.sql new file mode 100644 index 0000000000..82003ab292 --- /dev/null +++ b/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/up.sql @@ -0,0 +1,19 @@ +CREATE TABLE timelines ( + tenant_id VARCHAR NOT NULL, + timeline_id VARCHAR NOT NULL, + start_lsn pg_lsn NOT NULL, + generation INTEGER NOT NULL, + sk_set BIGINT[] NOT NULL, + new_sk_set BIGINT[], + cplane_notified_generation INTEGER NOT NULL, + deleted_at timestamptz, + PRIMARY KEY(tenant_id, timeline_id) +); +CREATE TABLE safekeeper_timeline_pending_ops ( + sk_id BIGINT NOT NULL, + tenant_id VARCHAR NOT NULL, + timeline_id VARCHAR NOT NULL, + generation INTEGER NOT NULL, + op_kind VARCHAR NOT NULL, + PRIMARY KEY(tenant_id, timeline_id, sk_id) +); diff --git a/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/down.sql b/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/down.sql new file mode 100644 index 0000000000..378e9f8c16 --- /dev/null +++ b/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/down.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers DROP https_port; diff --git a/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/up.sql b/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/up.sql new file mode 100644 index 0000000000..bb47b0b256 --- /dev/null +++ 
b/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/up.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers ADD https_port INTEGER; diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index 226d4942e7..a630316f46 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -1,4 +1,5 @@ -use std::{borrow::Cow, fmt::Debug, fmt::Display}; +use std::borrow::Cow; +use std::fmt::{Debug, Display}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 5bc3c81f02..5ce4d63d77 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,7 +1,8 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::error::Error as _; use std::sync::Arc; -use std::{collections::HashMap, time::Duration}; +use std::time::Duration; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; @@ -12,11 +13,9 @@ use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShar use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; -use utils::{ - backoff::{self}, - id::{NodeId, TenantId}, -}; +use tracing::{Instrument, info_span}; +use utils::backoff::{self}; +use utils::id::{NodeId, TenantId}; use crate::service::Config; @@ -625,7 +624,16 @@ impl ComputeHook { MaybeSendResult::Transmit((request, lock)) => (request, lock), }; - let result = if let Some(notify_url) = &self.config.compute_hook_url { + let compute_hook_url = if let Some(control_plane_url) = &self.config.control_plane_url { + Some(if control_plane_url.ends_with('/') { + format!("{control_plane_url}notify-attach") + } else { + format!("{control_plane_url}/notify-attach") + 
}) + } else { + self.config.compute_hook_url.clone() + }; + let result = if let Some(notify_url) = &compute_hook_url { self.do_notify(notify_url, &request, cancel).await } else { self.do_notify_local(&request).await.map_err(|e| { diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index 8b7be88078..bd4b8ba38f 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -1,15 +1,14 @@ -use std::{ - collections::{BTreeMap, HashMap}, - sync::Arc, -}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; use pageserver_api::controller_api::{NodeSchedulingPolicy, ShardSchedulingPolicy}; -use utils::{id::NodeId, shard::TenantShardId}; +use utils::id::NodeId; +use utils::shard::TenantShardId; -use crate::{ - background_node_operations::OperationError, node::Node, scheduler::Scheduler, - tenant_shard::TenantShard, -}; +use crate::background_node_operations::OperationError; +use crate::node::Node; +use crate::scheduler::Scheduler; +use crate::tenant_shard::TenantShard; pub(crate) struct TenantShardIterator { tenants_accessor: F, @@ -188,10 +187,8 @@ impl TenantShardDrain { mod tests { use std::sync::Arc; - use utils::{ - id::TenantId, - shard::{ShardCount, ShardNumber, TenantShardId}, - }; + use utils::id::TenantId; + use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use super::TenantShardIterator; diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 52b6110667..ee4c9ef9cd 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -1,24 +1,23 @@ -use futures::{stream::FuturesUnordered, StreamExt}; +use std::collections::HashMap; +use std::fmt::Debug; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use pageserver_api::controller_api::{NodeAvailability, SkSchedulingPolicy}; +use 
pageserver_api::models::PageserverUtilization; +use reqwest::Certificate; use safekeeper_api::models::SafekeeperUtilization; use safekeeper_client::mgmt_api; -use std::{ - collections::HashMap, - fmt::Debug, - future::Future, - sync::Arc, - time::{Duration, Instant}, -}; -use tokio_util::sync::CancellationToken; - -use pageserver_api::{ - controller_api::{NodeAvailability, SkSchedulingPolicy}, - models::PageserverUtilization, -}; - use thiserror::Error; -use utils::{id::NodeId, logging::SecretString}; +use tokio_util::sync::CancellationToken; +use utils::id::NodeId; +use utils::logging::SecretString; -use crate::{node::Node, safekeeper::Safekeeper}; +use crate::node::Node; +use crate::safekeeper::Safekeeper; struct HeartbeaterTask { receiver: tokio::sync::mpsc::UnboundedReceiver>, @@ -29,6 +28,7 @@ struct HeartbeaterTask { max_offline_interval: Duration, max_warming_up_interval: Duration, jwt_token: Option, + ssl_ca_cert: Option, } #[derive(Debug, Clone)] @@ -77,6 +77,7 @@ where { pub(crate) fn new( jwt_token: Option, + ssl_ca_cert: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, cancel: CancellationToken, @@ -86,6 +87,7 @@ where let mut heartbeater = HeartbeaterTask::new( receiver, jwt_token, + ssl_ca_cert, max_offline_interval, max_warming_up_interval, cancel, @@ -121,6 +123,7 @@ where fn new( receiver: tokio::sync::mpsc::UnboundedReceiver>, jwt_token: Option, + ssl_ca_cert: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, cancel: CancellationToken, @@ -132,6 +135,7 @@ where max_offline_interval, max_warming_up_interval, jwt_token, + ssl_ca_cert, } } async fn run(&mut self) { @@ -174,6 +178,7 @@ impl HeartBeat for HeartbeaterTask let mut heartbeat_futs = FuturesUnordered::new(); for (node_id, node) in &*pageservers { heartbeat_futs.push({ + let ssl_ca_cert = self.ssl_ca_cert.clone(); let jwt_token = self.jwt_token.clone(); let cancel = self.cancel.clone(); @@ -189,6 +194,7 @@ impl HeartBeat for HeartbeaterTask 
.with_client_retries( |client| async move { client.get_utilization().await }, &jwt_token, + &ssl_ca_cert, 3, 3, Duration::from_secs(1), @@ -223,21 +229,21 @@ impl HeartBeat for HeartbeaterTask Some((*node_id, status)) } }); + } - loop { - let maybe_status = tokio::select! { - next = heartbeat_futs.next() => { - match next { - Some(result) => result, - None => { break; } - } - }, - _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } - }; + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; - if let Some((node_id, status)) = maybe_status { - new_state.insert(node_id, status); - } + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); } } @@ -327,6 +333,7 @@ impl HeartBeat for HeartbeaterTask for HeartbeaterTask for HeartbeaterTask { - match next { - Some(result) => result, - None => { break; } - } - }, - _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } - }; + loop { + let maybe_status = tokio::select! 
{ + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; - if let Some((node_id, status)) = maybe_status { - new_state.insert(node_id, status); - } + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); } } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 33b3d88c25..52e3ef5b0a 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1,61 +1,56 @@ -use crate::http; -use crate::metrics::{ - HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, - METRICS_REGISTRY, -}; -use crate::persistence::SafekeeperUpsert; -use crate::reconciler::ReconcileError; -use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; +use std::str::FromStr; +use std::sync::{Arc, LazyLock}; +use std::time::{Duration, Instant}; + use anyhow::Context; +use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use futures::Future; -use http_utils::{ - endpoint::{ - self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, - request_span, - }, - error::ApiError, - failpoints::failpoints_handler, - json::{json_request, json_response}, - request::{must_get_query_param, parse_query_param, parse_request_param}, - RequestExt, RouterBuilder, +use http_utils::endpoint::{ + self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, + request_span, }; +use http_utils::error::ApiError; +use http_utils::failpoints::failpoints_handler; +use http_utils::json::{json_request, json_response}; +use http_utils::request::{must_get_query_param, parse_query_param, parse_request_param}; +use http_utils::{RequestExt, RouterBuilder}; use hyper::header::CONTENT_TYPE; -use hyper::{Body, Request, Response}; -use hyper::{StatusCode, Uri}; +use hyper::{Body, 
Request, Response, StatusCode, Uri}; use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, + ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, }; use pageserver_api::models::{ - TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, + DetachBehavior, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; -use pageserver_client::{mgmt_api, BlockUnblock}; -use std::str::FromStr; -use std::sync::Arc; -use std::time::{Duration, Instant}; +use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; +use pageserver_client::{BlockUnblock, mgmt_api}; +use routerify::Middleware; use tokio_util::sync::CancellationToken; +use tracing::warn; use utils::auth::{Scope, SwappableJwtAuth}; use utils::id::{NodeId, TenantId, TimelineId}; -use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, - TenantShardMigrateRequest, +use crate::http; +use crate::metrics::{ + HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, METRICS_REGISTRY, + PageserverRequestLabelGroup, }; -use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; - -use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; - -use routerify::Middleware; +use crate::persistence::SafekeeperUpsert; +use crate::reconciler::ReconcileError; +use crate::service::{LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT, 
Service}; /// State available to HTTP request handlers pub struct HttpState { service: Arc, auth: Option>, + rate_limiter: governor::DefaultKeyedRateLimiter, neon_metrics: NeonMetrics, allowlist_routes: &'static [&'static str], } @@ -66,9 +61,11 @@ impl HttpState { auth: Option>, build_info: BuildInfo, ) -> Self { + let quota = governor::Quota::per_second(service.get_config().tenant_rate_limit); Self { service, auth, + rate_limiter: governor::RateLimiter::keyed(quota), neon_metrics: NeonMetrics::new(build_info), allowlist_routes: &[ "/status", @@ -89,6 +86,40 @@ fn get_state(request: &Request) -> &HttpState { .as_ref() } +/// Rate limits tenant requests. +/// +/// TODO: this should be a request middleware, but requires us to extract the tenant ID from +/// different URLs in a systematic way. +/// +/// TODO: consider returning a 429 response if these start piling up. +async fn maybe_rate_limit(request: &Request, tenant_id: TenantId) { + // Check if the tenant should be rate-limited. + let rate_limiter = &get_state(request).rate_limiter; + if rate_limiter.check_key(&tenant_id).is_ok() { + return; + } + + // Measure the rate limiting delay. + let _timer = METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_rate_limited + .start_timer(); + + // Log rate limited tenants once every 10 seconds. + static LOG_RATE_LIMITER: LazyLock> = + LazyLock::new(|| { + let quota = governor::Quota::with_period(Duration::from_secs(10)).unwrap(); + governor::RateLimiter::keyed(quota) + }); + + if LOG_RATE_LIMITER.check_key(&tenant_id).is_ok() { + warn!("tenant {tenant_id} is rate limited") + } + + // Wait for quota. 
+ rate_limiter.until_key_ready(&tenant_id).await; +} + /// Pageserver calls into this on startup, to learn which tenants it should attach async fn handle_re_attach(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; @@ -254,6 +285,7 @@ async fn handle_tenant_config_get( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -271,6 +303,7 @@ async fn handle_tenant_time_travel_remote_storage( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -318,6 +351,7 @@ async fn handle_tenant_secondary_download( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -336,6 +370,7 @@ async fn handle_tenant_delete( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -363,6 +398,7 @@ async fn handle_tenant_timeline_create( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -388,6 +424,7 @@ async fn handle_tenant_timeline_delete( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; 
check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -464,6 +501,7 @@ async fn handle_tenant_timeline_archival_config( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -487,8 +525,10 @@ async fn handle_tenant_timeline_detach_ancestor( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let behavior: Option = parse_query_param(&req, "detach_behavior")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -498,7 +538,7 @@ async fn handle_tenant_timeline_detach_ancestor( }; let res = service - .tenant_timeline_detach_ancestor(tenant_id, timeline_id) + .tenant_timeline_detach_ancestor(tenant_id, timeline_id, behavior) .await?; json_response(StatusCode::OK, res) @@ -511,6 +551,7 @@ async fn handle_tenant_timeline_block_unblock_gc( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; @@ -528,12 +569,14 @@ async fn handle_tenant_timeline_download_heatmap_layers( let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_shard_id.tenant_id).await; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; let concurrency: Option = parse_query_param(&req, "concurrency")?; + let recurse = parse_query_param(&req, 
"recurse")?.unwrap_or(false); service - .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency, recurse) .await?; json_response(StatusCode::OK, ()) @@ -554,8 +597,9 @@ async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let tenant_or_shard_id: TenantShardId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_or_shard_id.tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -569,15 +613,28 @@ async fn handle_tenant_timeline_passthrough( return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); }; - tracing::info!("Proxying request for tenant {} ({})", tenant_id, path); + tracing::info!( + "Proxying request for tenant {} ({})", + tenant_or_shard_id.tenant_id, + path + ); // Find the node that holds shard zero - let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?; + let (node, tenant_shard_id) = if tenant_or_shard_id.is_unsharded() { + service + .tenant_shard0_node(tenant_or_shard_id.tenant_id) + .await? + } else { + ( + service.tenant_shard_node(tenant_or_shard_id).await?, + tenant_or_shard_id, + ) + }; // Callers will always pass an unsharded tenant ID. Before proxying, we must // rewrite this to a shard-aware shard zero ID. 
let path = format!("{}", path); - let tenant_str = tenant_id.to_string(); + let tenant_str = tenant_or_shard_id.tenant_id.to_string(); let tenant_shard_str = format!("{}", tenant_shard_id); let path = path.replace(&tenant_str, &tenant_shard_str); @@ -601,7 +658,9 @@ async fn handle_tenant_timeline_passthrough( let client = mgmt_api::Client::new( node.base_url(), service.get_config().pageserver_jwt_token.as_deref(), - ); + service.get_config().ssl_ca_cert.clone(), + ) + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; let resp = client.get_raw(path).await.map_err(|e| // We return 503 here because if we can't successfully send a request to the pageserver, // either we aren't available or the pageserver is unavailable. @@ -617,7 +676,7 @@ async fn handle_tenant_timeline_passthrough( // Transform 404 into 503 if we raced with a migration if resp.status() == reqwest::StatusCode::NOT_FOUND { // Look up node again: if we migrated it will be different - let (new_node, _tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?; + let new_node = service.tenant_shard_node(tenant_shard_id).await?; if new_node.get_id() != node.get_id() { // Rather than retry here, send the client a 503 to prompt a retry: this matches // the pageserver's use of 503, and all clients calling this API should retry on 503. @@ -647,6 +706,7 @@ async fn handle_tenant_locate( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -662,9 +722,9 @@ async fn handle_tenant_describe( service: Arc, req: Request, ) -> Result, ApiError> { - check_permissions(&req, Scope::Scrubber)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::Scrubber)?; + // NB: don't rate limit: scrubber operation. 
match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -999,6 +1059,7 @@ async fn handle_tenant_shard_split( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1021,6 +1082,7 @@ async fn handle_tenant_shard_migrate( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1044,6 +1106,7 @@ async fn handle_tenant_shard_migrate_secondary( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1067,6 +1130,7 @@ async fn handle_tenant_shard_cancel_reconcile( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1086,6 +1150,7 @@ async fn handle_tenant_shard_cancel_reconcile( async fn handle_tenant_update_policy(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. 
let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1141,9 +1206,9 @@ async fn handle_step_down(req: Request) -> Result, ApiError } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::PageServerApi)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1158,9 +1223,9 @@ async fn handle_tenant_drop(req: Request) -> Result, ApiErr } async fn handle_tenant_import(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::PageServerApi)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1455,8 +1520,8 @@ pub fn prologue_leadership_status_check_middleware< }) } -fn prologue_metrics_middleware( -) -> Middleware { +fn prologue_metrics_middleware() +-> Middleware { Middleware::pre(move |req| async move { let meta = RequestMeta { method: req.method().clone(), @@ -1469,8 +1534,8 @@ fn prologue_metrics_middleware }) } -fn epilogue_metrics_middleware( -) -> Middleware { +fn epilogue_metrics_middleware() +-> Middleware { Middleware::post_with_info(move |resp, req_info| async move { let request_name = match req_info.context::() { Some(name) => name, @@ -1621,8 +1686,8 @@ async fn maybe_forward(req: Request) -> ForwardOutcome { Err(err) => { return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( anyhow::anyhow!( - "Failed to parse leader uri for forwarding while in stepped down state: {err}" - ), + "Failed to parse leader uri for forwarding while in stepped down state: {err}" + ), ))); } }; @@ -2155,8 +2220,23 @@ mod test { #[test] fn test_path_without_ids() { - 
assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); - assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); - assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo"), "/v1/tenant//timeline/"); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788" + ), + "/v1/tenant//timeline/" + ); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788" + ), + "/v1/tenant//timeline/" + ); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo" + ), + "/v1/tenant//timeline/" + ); } } diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index 2d8b674f86..6b0c16f0be 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -1,8 +1,7 @@ +use std::collections::HashMap; use std::fmt::Display; -use std::time::Instant; -use std::{collections::HashMap, sync::Arc}; - -use std::time::Duration; +use std::sync::Arc; +use std::time::{Duration, Instant}; use crate::service::RECONCILE_TIMEOUT; diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs index 5fae8991ec..5e1d6f3ec9 100644 --- a/storage_controller/src/leadership.rs +++ b/storage_controller/src/leadership.rs @@ -3,11 +3,9 @@ use std::sync::Arc; use hyper::Uri; use tokio_util::sync::CancellationToken; -use crate::{ - peer_client::{GlobalObservedState, PeerClient}, - persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}, - service::Config, -}; +use crate::peer_client::{GlobalObservedState, PeerClient}; +use 
crate::persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}; +use crate::service::Config; /// Helper for storage controller leadership acquisition pub(crate) struct Leadership { @@ -91,7 +89,9 @@ impl Leadership { // Special case: if this is a brand new storage controller, migrations will not // have run at this point yet, and, hence, the controllers table does not exist. // Detect this case via the error string (diesel doesn't type it) and allow it. - tracing::info!("Detected first storage controller start-up. Allowing missing controllers table ..."); + tracing::info!( + "Detected first storage controller start-up. Allowing missing controllers table ..." + ); return Ok(None); } } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 4152e40a76..6e3c70c42b 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,26 +1,28 @@ -use anyhow::{anyhow, Context}; -use clap::Parser; -use hyper0::Uri; -use metrics::launch_timestamp::LaunchTimestamp; -use metrics::BuildInfo; +use std::num::NonZeroU32; use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; + +use anyhow::{Context, anyhow}; +use clap::Parser; +use hyper0::Uri; +use metrics::BuildInfo; +use metrics::launch_timestamp::LaunchTimestamp; +use reqwest::Certificate; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, + Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service, }; use tokio::signal::unix::SignalKind; 
use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; - use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version, tcp_listener}; @@ -34,7 +36,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; #[derive(Parser)] @@ -69,6 +71,10 @@ struct Cli { #[arg(long)] compute_hook_url: Option, + /// URL to control plane storage API prefix + #[arg(long)] + control_plane_url: Option, + /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, @@ -98,6 +104,10 @@ struct Cli { #[arg(long)] priority_reconciler_concurrency: Option, + /// Tenant API rate limit, as requests per second per tenant. + #[arg(long, default_value = "10")] + tenant_rate_limit: NonZeroU32, + /// How long to wait for the initial database connection to be available. #[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, @@ -123,21 +133,33 @@ struct Cli { #[arg(long)] chaos_exit_crontab: Option, - // Maximum acceptable lag for the secondary location while draining - // a pageserver + /// Maximum acceptable lag for the secondary location while draining + /// a pageserver #[arg(long)] max_secondary_lag_bytes: Option, - // Period with which to send heartbeats to registered nodes + /// Period with which to send heartbeats to registered nodes #[arg(long)] heartbeat_interval: Option, #[arg(long)] long_reconcile_threshold: Option, - // Flag to use https for requests to pageserver API. 
+ /// Flag to use https for requests to pageserver API. #[arg(long, default_value = "false")] use_https_pageserver_api: bool, + + // Whether to put timelines onto safekeepers + #[arg(long, default_value = "false")] + timelines_onto_safekeepers: bool, + + /// Flag to use https for requests to safekeeper API. + #[arg(long, default_value = "false")] + use_https_safekeeper_api: bool, + + /// Trusted root CA certificate to use in https APIs. + #[arg(long)] + ssl_ca_file: Option, } enum StrictMode { @@ -281,30 +303,27 @@ async fn async_main() -> anyhow::Result<()> { let secrets = Secrets::load(&args).await?; - // TODO: once we've rolled out the safekeeper JWT token everywhere, put it into the validation code below - tracing::info!( - "safekeeper_jwt_token set: {:?}", - secrets.safekeeper_jwt_token.is_some() - ); - // Validate required secrets and arguments are provided in strict mode match strict_mode { StrictMode::Strict if (secrets.public_key.is_none() || secrets.pageserver_jwt_token.is_none() - || secrets.control_plane_jwt_token.is_none()) => + || secrets.control_plane_jwt_token.is_none() + || secrets.safekeeper_jwt_token.is_none()) => { // Production systems should always have secrets configured: if public_key was not set // then we would implicitly disable auth. anyhow::bail!( - "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" - ); + "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" + ); } - StrictMode::Strict if args.compute_hook_url.is_none() => { - // Production systems should always have a compute hook set, to prevent falling + StrictMode::Strict + if args.compute_hook_url.is_none() && args.control_plane_url.is_none() => + { + // Production systems should always have a control plane URL set, to prevent falling // back to trying to use neon_local. 
anyhow::bail!( - "`--compute-hook-url` is not set: this is only permitted in `--dev` mode" + "neither `--compute-hook-url` nor `--control-plane-url` are set: this is only permitted in `--dev` mode" ); } StrictMode::Strict => { @@ -315,12 +334,22 @@ async fn async_main() -> anyhow::Result<()> { } } + let ssl_ca_cert = match args.ssl_ca_file.as_ref() { + Some(ssl_ca_file) => { + tracing::info!("Using ssl root CA file: {ssl_ca_file:?}"); + let buf = tokio::fs::read(ssl_ca_file).await?; + Some(Certificate::from_pem(&buf)?) + } + None => None, + }; + let config = Config { pageserver_jwt_token: secrets.pageserver_jwt_token, safekeeper_jwt_token: secrets.safekeeper_jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, peer_jwt_token: secrets.peer_jwt_token, compute_hook_url: args.compute_hook_url, + control_plane_url: args.control_plane_url, max_offline_interval: args .max_offline_interval .map(humantime::Duration::into) @@ -335,6 +364,7 @@ async fn async_main() -> anyhow::Result<()> { priority_reconciler_concurrency: args .priority_reconciler_concurrency .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), + tenant_rate_limit: args.tenant_rate_limit, split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, @@ -350,6 +380,9 @@ async fn async_main() -> anyhow::Result<()> { start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, use_https_pageserver_api: args.use_https_pageserver_api, + use_https_safekeeper_api: args.use_https_safekeeper_api, + ssl_ca_cert, + timelines_onto_safekeepers: args.timelines_onto_safekeepers, }; // Validate that we can connect to the database diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 6d67e0d130..ea390df726 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -7,17 +7,18 @@ //! //! 
The rest of the code defines label group types and deals with converting outer types to labels. //! +use std::sync::Mutex; + use bytes::Bytes; -use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use measured::label::LabelValue; +use measured::metric::histogram; +use measured::{FixedCardinalityLabel, MetricGroup}; use metrics::NeonMetrics; use once_cell::sync::Lazy; -use std::sync::Mutex; use strum::IntoEnumIterator; -use crate::{ - persistence::{DatabaseError, DatabaseOperation}, - service::LeadershipStatus, -}; +use crate::persistence::{DatabaseError, DatabaseOperation}; +use crate::service::LeadershipStatus; pub(crate) static METRICS_REGISTRY: Lazy = Lazy::new(StorageControllerMetrics::default); @@ -75,6 +76,10 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_http_request_latency: measured::HistogramVec, + /// HTTP rate limiting latency across all tenants and endpoints + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 10.0))] + pub(crate) storage_controller_http_request_rate_limited: measured::Histogram<10>, + /// Count of HTTP requests to the pageserver that resulted in an error, /// broken down by the pageserver node id, request name and method pub(crate) storage_controller_pageserver_request_error: diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 3762d13c10..40f3c7c58e 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,22 +1,21 @@ -use std::{str::FromStr, time::Duration}; +use std::str::FromStr; +use std::time::Duration; -use anyhow::anyhow; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, - NodeSchedulingPolicy, TenantLocateResponseShard, - }, - shard::TenantShardId, +use pageserver_api::controller_api::{ + AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, + NodeSchedulingPolicy, 
TenantLocateResponseShard, }; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; -use reqwest::StatusCode; +use reqwest::{Certificate, StatusCode}; use serde::Serialize; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::NodeId}; +use utils::backoff; +use utils::id::NodeId; -use crate::{ - pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule, -}; +use crate::pageserver_client::PageserverClient; +use crate::persistence::NodePersistence; +use crate::scheduler::MaySchedule; /// Represents the in-memory description of a Node. /// @@ -211,7 +210,10 @@ impl Node { use_https: bool, ) -> anyhow::Result { if use_https && listen_https_port.is_none() { - return Err(anyhow!("https is enabled, but node has no https port")); + anyhow::bail!( + "cannot create node {id}: \ + https is enabled, but https port is not specified" + ); } Ok(Self { @@ -244,7 +246,11 @@ impl Node { pub(crate) fn from_persistent(np: NodePersistence, use_https: bool) -> anyhow::Result { if use_https && np.listen_https_port.is_none() { - return Err(anyhow!("https is enabled, but node has no https port")); + anyhow::bail!( + "cannot load node {} from persistent: \ + https is enabled, but https port is not specified", + np.node_id, + ); } Ok(Self { @@ -270,10 +276,12 @@ impl Node { /// This will return None to indicate cancellation. Cancellation may happen from /// the cancellation token passed in, or from Self's cancellation token (i.e. node /// going offline). + #[allow(clippy::too_many_arguments)] pub(crate) async fn with_client_retries( &self, mut op: O, jwt: &Option, + ssl_ca_cert: &Option, warn_threshold: u32, max_retries: u32, timeout: Duration, @@ -292,19 +300,26 @@ impl Node { | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, ApiError(_, _) => true, Cancelled => true, + CreateClient(_) => true, } } + // TODO: refactor PageserverClient and with_client_retires (#11113). 
+ let mut http_client = reqwest::ClientBuilder::new().timeout(timeout); + if let Some(ssl_ca_cert) = ssl_ca_cert.as_ref() { + http_client = http_client.add_root_certificate(ssl_ca_cert.clone()) + } + + let http_client = match http_client.build() { + Ok(http_client) => http_client, + Err(err) => return Some(Err(mgmt_api::Error::CreateClient(err))), + }; + backoff::retry( || { - let http_client = reqwest::ClientBuilder::new() - .timeout(timeout) - .build() - .expect("Failed to construct HTTP client"); - let client = PageserverClient::from_client( self.get_id(), - http_client, + http_client.clone(), self.base_url(), jwt.as_deref(), ); diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 645cbdfce1..05e7aa88c6 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,18 +1,14 @@ -use pageserver_api::{ - models::{ - detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, - PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, - TenantShardSplitRequest, TenantShardSplitResponse, TenantWaitLsnRequest, - TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, - TopTenantShardsResponse, - }, - shard::TenantShardId, +use pageserver_api::models::detach_ancestor::AncestorDetached; +use pageserver_api::models::{ + DetachBehavior, LocationConfig, LocationConfigListResponse, PageserverUtilization, + SecondaryProgress, TenantScanRemoteStorageResponse, TenantShardSplitRequest, + TenantShardSplitResponse, TenantWaitLsnRequest, TimelineArchivalConfigRequest, + TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, }; -use pageserver_client::{ - mgmt_api::{Client, Result}, - BlockUnblock, -}; -use reqwest::StatusCode; +use pageserver_api::shard::TenantShardId; +use pageserver_client::BlockUnblock; +use pageserver_client::mgmt_api::{Client, Result}; +use 
reqwest::{Certificate, StatusCode}; use utils::id::{NodeId, TenantId, TimelineId}; /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage @@ -50,11 +46,16 @@ macro_rules! measured_request { } impl PageserverClient { - pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { - Self { - inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), + pub(crate) fn new( + node_id: NodeId, + mgmt_api_endpoint: String, + jwt: Option<&str>, + ssl_ca_cert: Option, + ) -> Result { + Ok(Self { + inner: Client::new(mgmt_api_endpoint, jwt, ssl_ca_cert)?, node_id_label: node_id.0.to_string(), - } + }) } pub(crate) fn from_client( @@ -251,13 +252,14 @@ impl PageserverClient { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + behavior: Option, ) -> Result { measured_request!( "timeline_detach_ancestor", crate::metrics::Method::Put, &self.node_id_label, self.inner - .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .timeline_detach_ancestor(tenant_shard_id, timeline_id, behavior) .await ) } @@ -285,13 +287,19 @@ impl PageserverClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<()> { measured_request!( "download_heatmap_layers", crate::metrics::Method::Post, &self.node_id_label, self.inner - .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .timeline_download_heatmap_layers( + tenant_shard_id, + timeline_id, + concurrency, + recurse + ) .await ) } diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index 1a15bae365..f3f275dee0 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -1,16 +1,17 @@ -use crate::tenant_shard::ObservedState; -use pageserver_api::shard::TenantShardId; -use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::error::Error as _; use std::time::Duration; -use 
tokio_util::sync::CancellationToken; use http_utils::error::HttpErrorBody; use hyper::Uri; +use pageserver_api::shard::TenantShardId; use reqwest::{StatusCode, Url}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use utils::backoff; +use crate::tenant_shard::ObservedState; + #[derive(Debug, Clone)] pub(crate) struct PeerClient { uri: Uri, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 459c11add9..85d9c574a1 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1,46 +1,45 @@ pub(crate) mod split_state; use std::collections::HashMap; +use std::io::Write; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; +use std::time::{Duration, Instant}; -use self::split_state::SplitState; +use diesel::deserialize::{FromSql, FromSqlRow}; +use diesel::expression::AsExpression; +use diesel::pg::Pg; use diesel::prelude::*; +use diesel::serialize::{IsNull, ToSql}; use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; use diesel_async::pooled_connection::bb8::Pool; -use diesel_async::pooled_connection::AsyncDieselConnectionManager; -use diesel_async::pooled_connection::ManagerConfig; -use diesel_async::AsyncPgConnection; -use diesel_async::RunQueryDsl; -use futures::future::BoxFuture; +use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig}; +use diesel_async::{AsyncPgConnection, RunQueryDsl}; +use diesel_migrations::{EmbeddedMigrations, embed_migrations}; use futures::FutureExt; +use futures::future::BoxFuture; use itertools::Itertools; -use pageserver_api::controller_api::AvailabilityZone; -use pageserver_api::controller_api::MetadataHealthRecord; -use pageserver_api::controller_api::SafekeeperDescribeResponse; -use pageserver_api::controller_api::ShardSchedulingPolicy; -use pageserver_api::controller_api::SkSchedulingPolicy; -use 
pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; +use pageserver_api::controller_api::{ + AvailabilityZone, MetadataHealthRecord, NodeSchedulingPolicy, PlacementPolicy, + SafekeeperDescribeResponse, ShardSchedulingPolicy, SkSchedulingPolicy, +}; use pageserver_api::models::TenantConfig; -use pageserver_api::shard::ShardConfigError; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::shard::ShardStripeSize; -use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; -use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; +use pageserver_api::shard::{ + ShardConfigError, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; use rustls::client::WebPkiServerVerifier; +use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; +use self::split_state::SplitState; use crate::metrics::{ DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY, }; use crate::node::Node; - -use diesel_migrations::{embed_migrations, EmbeddedMigrations}; const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// ## What do we store? 
@@ -122,6 +121,11 @@ pub(crate) enum DatabaseOperation { GetLeader, UpdateLeader, SetPreferredAzs, + InsertTimeline, + GetTimeline, + InsertTimelineReconcile, + RemoveTimelineReconcile, + ListTimelineReconcile, } #[must_use] @@ -479,8 +483,7 @@ impl Persistence { &self, shards: Vec, ) -> DatabaseResult<()> { - use crate::schema::metadata_health; - use crate::schema::tenant_shards; + use crate::schema::{metadata_health, tenant_shards}; let now = chrono::Utc::now(); @@ -554,8 +557,7 @@ impl Persistence { &self, input_node_id: NodeId, ) -> DatabaseResult> { - use crate::schema::nodes::dsl::scheduling_policy; - use crate::schema::nodes::dsl::*; + use crate::schema::nodes::dsl::{scheduling_policy, *}; use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { @@ -965,10 +967,26 @@ impl Persistence { &self, split_tenant_id: TenantId, old_shard_count: ShardCount, + new_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { Box::pin(async move { + // Sanity: child shards must still exist, as we're deleting parent shards + let child_shards_query = tenant_shards + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)); + let child_shards = child_shards_query + .load::(conn) + .await?; + if child_shards.len() != new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected child shard count {} while completing split to \ + count {new_shard_count:?} on tenant {split_tenant_id}", + child_shards.len() + ))); + } + // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) @@ -1283,6 +1301,166 @@ impl Persistence { }) .await } + + /// Persist timeline. Returns if the timeline was newly inserted. If it wasn't, we haven't done any writes. 
+ pub(crate) async fn insert_timeline(&self, entry: TimelinePersistence) -> DatabaseResult { + use crate::schema::timelines; + + let entry = &entry; + self.with_measured_conn(DatabaseOperation::InsertTimeline, move |conn| { + Box::pin(async move { + let inserted_updated = diesel::insert_into(timelines::table) + .values(entry) + .on_conflict((timelines::tenant_id, timelines::timeline_id)) + .do_nothing() + .execute(conn) + .await?; + + match inserted_updated { + 0 => Ok(false), + 1 => Ok(true), + _ => Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))), + } + }) + }) + .await + } + + /// Load timeline from db. Returns `None` if not present. + pub(crate) async fn get_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult> { + use crate::schema::timelines::dsl; + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + let timeline_from_db = self + .with_measured_conn(DatabaseOperation::GetTimeline, move |conn| { + Box::pin(async move { + let mut from_db: Vec = dsl::timelines + .filter( + dsl::tenant_id + .eq(&tenant_id.to_string()) + .and(dsl::timeline_id.eq(&timeline_id.to_string())), + ) + .load(conn) + .await?; + if from_db.is_empty() { + return Ok(None); + } + if from_db.len() != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + from_db.len() + ))); + } + + Ok(Some(from_db.pop().unwrap().into_persistence())) + }) + }) + .await?; + + Ok(timeline_from_db) + } + /// Persist pending op. Returns if it was newly inserted. If it wasn't, we haven't done any writes. 
+ pub(crate) async fn insert_pending_op( + &self, + entry: TimelinePendingOpPersistence, + ) -> DatabaseResult { + use crate::schema::safekeeper_timeline_pending_ops as skpo; + // This overrides the `filter` fn used in other functions, so contain the mayhem via a function-local use + use diesel::query_dsl::methods::FilterDsl; + + let entry = &entry; + self.with_measured_conn(DatabaseOperation::InsertTimelineReconcile, move |conn| { + Box::pin(async move { + // For simplicity it makes sense to keep only the last operation + // per (tenant, timeline, sk) tuple: if we migrated a timeline + // from node and adding it back it is not necessary to remove + // data on it. Hence, generation is not part of primary key and + // we override any rows with lower generations here. + let inserted_updated = diesel::insert_into(skpo::table) + .values(entry) + .on_conflict((skpo::tenant_id, skpo::timeline_id, skpo::sk_id)) + .do_update() + .set(entry) + .filter(skpo::generation.lt(entry.generation)) + .execute(conn) + .await?; + + match inserted_updated { + 0 => Ok(false), + 1 => Ok(true), + _ => Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))), + } + }) + }) + .await + } + /// Remove persisted pending op. 
+ pub(crate) async fn remove_pending_op( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + sk_id: NodeId, + generation: u32, + ) -> DatabaseResult<()> { + use crate::schema::safekeeper_timeline_pending_ops::dsl; + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + self.with_measured_conn(DatabaseOperation::RemoveTimelineReconcile, move |conn| { + Box::pin(async move { + diesel::delete(dsl::safekeeper_timeline_pending_ops) + .filter(dsl::tenant_id.eq(tenant_id.to_string())) + .filter(dsl::timeline_id.eq(timeline_id.to_string())) + .filter(dsl::sk_id.eq(sk_id.0 as i64)) + .filter(dsl::generation.eq(generation as i32)) + .execute(conn) + .await?; + Ok(()) + }) + }) + .await + } + + /// Load pending operations from db. + pub(crate) async fn list_pending_ops( + &self, + filter_for_sk: Option, + ) -> DatabaseResult> { + use crate::schema::safekeeper_timeline_pending_ops::dsl; + + const FILTER_VAL_1: i64 = 1; + const FILTER_VAL_2: i64 = 2; + let filter_opt = filter_for_sk.map(|id| id.0 as i64); + let timeline_from_db = self + .with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| { + Box::pin(async move { + let from_db: Vec = + dsl::safekeeper_timeline_pending_ops + .filter( + dsl::sk_id + .eq(filter_opt.unwrap_or(FILTER_VAL_1)) + .and(dsl::sk_id.eq(filter_opt.unwrap_or(FILTER_VAL_2))), + ) + .load(conn) + .await?; + Ok(from_db) + }) + }) + .await?; + + Ok(timeline_from_db) + } } pub(crate) fn load_certs() -> anyhow::Result> { @@ -1451,23 +1629,49 @@ pub(crate) struct TenantShardPersistence { } impl TenantShardPersistence { + fn get_shard_count(&self) -> Result { + self.shard_count + .try_into() + .map(ShardCount) + .map_err(|_| ShardConfigError::InvalidCount) + } + + fn get_shard_number(&self) -> Result { + self.shard_number + .try_into() + .map(ShardNumber) + .map_err(|_| ShardConfigError::InvalidNumber) + } + + fn get_stripe_size(&self) -> Result { + self.shard_stripe_size + .try_into() + .map(ShardStripeSize) + .map_err(|_| 
ShardConfigError::InvalidStripeSize) + } + pub(crate) fn get_shard_identity(&self) -> Result { if self.shard_count == 0 { - Ok(ShardIdentity::unsharded()) + // NB: carry over the stripe size from the persisted record, to avoid consistency check + // failures if the persisted value differs from the default stripe size. The stripe size + // doesn't really matter for unsharded tenants anyway. + Ok(ShardIdentity::unsharded_with_stripe_size( + self.get_stripe_size()?, + )) } else { Ok(ShardIdentity::new( - ShardNumber(self.shard_number as u8), - ShardCount::new(self.shard_count as u8), - ShardStripeSize(self.shard_stripe_size as u32), + self.get_shard_number()?, + self.get_shard_count()?, + self.get_stripe_size()?, )?) } } - pub(crate) fn get_tenant_shard_id(&self) -> Result { + pub(crate) fn get_tenant_shard_id(&self) -> anyhow::Result { Ok(TenantShardId { tenant_id: TenantId::from_str(self.tenant_id.as_str())?, - shard_number: ShardNumber(self.shard_number as u8), - shard_count: ShardCount::new(self.shard_count as u8), + shard_number: self.get_shard_number()?, + shard_count: self.get_shard_count()?, }) } } @@ -1565,7 +1769,34 @@ pub(crate) struct SafekeeperPersistence { pub(crate) port: i32, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, - pub(crate) scheduling_policy: String, + pub(crate) scheduling_policy: SkSchedulingPolicyFromSql, + pub(crate) https_port: Option, +} + +/// Wrapper struct around [`SkSchedulingPolicy`] because both it and [`FromSql`] are from foreign crates, +/// and we don't want to make [`safekeeper_api`] depend on [`diesel`]. 
+#[derive(Serialize, Deserialize, FromSqlRow, Eq, PartialEq, Debug, Copy, Clone)] +pub(crate) struct SkSchedulingPolicyFromSql(pub(crate) SkSchedulingPolicy); + +impl From for SkSchedulingPolicyFromSql { + fn from(value: SkSchedulingPolicy) -> Self { + SkSchedulingPolicyFromSql(value) + } +} + +impl FromSql for SkSchedulingPolicyFromSql { + fn from_sql( + bytes: ::RawValue<'_>, + ) -> diesel::deserialize::Result { + let bytes = bytes.as_bytes(); + match core::str::from_utf8(bytes) { + Ok(s) => match SkSchedulingPolicy::from_str(s) { + Ok(policy) => Ok(SkSchedulingPolicyFromSql(policy)), + Err(e) => Err(format!("can't parse: {e}").into()), + }, + Err(e) => Err(format!("invalid UTF-8 for scheduling policy: {e}").into()), + } + } } impl SafekeeperPersistence { @@ -1580,15 +1811,12 @@ impl SafekeeperPersistence { host: upsert.host, port: upsert.port, http_port: upsert.http_port, + https_port: upsert.https_port, availability_zone_id: upsert.availability_zone_id, - scheduling_policy: String::from(scheduling_policy), + scheduling_policy: SkSchedulingPolicyFromSql(scheduling_policy), } } pub(crate) fn as_describe_response(&self) -> Result { - let scheduling_policy = - SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { - DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}")) - })?; Ok(SafekeeperDescribeResponse { id: NodeId(self.id as u64), region_id: self.region_id.clone(), @@ -1596,8 +1824,9 @@ impl SafekeeperPersistence { host: self.host.clone(), port: self.port, http_port: self.http_port, + https_port: self.https_port, availability_zone_id: self.availability_zone_id.clone(), - scheduling_policy, + scheduling_policy: self.scheduling_policy.0, }) } } @@ -1616,6 +1845,7 @@ pub(crate) struct SafekeeperUpsert { /// The active flag will not be stored in the database and will be ignored. 
pub(crate) active: Option, pub(crate) http_port: i32, + pub(crate) https_port: Option, pub(crate) availability_zone_id: String, } @@ -1631,6 +1861,7 @@ impl SafekeeperUpsert { host: &self.host, port: self.port, http_port: self.http_port, + https_port: self.https_port, availability_zone_id: &self.availability_zone_id, // None means a wish to not update this column. We expose abilities to update it via other means. scheduling_policy: None, @@ -1647,6 +1878,143 @@ struct InsertUpdateSafekeeper<'a> { host: &'a str, port: i32, http_port: i32, + https_port: Option, availability_zone_id: &'a str, scheduling_policy: Option<&'a str>, } + +#[derive(Serialize, Deserialize, FromSqlRow, AsExpression, Eq, PartialEq, Debug, Copy, Clone)] +#[diesel(sql_type = crate::schema::sql_types::PgLsn)] +pub(crate) struct LsnWrapper(pub(crate) Lsn); + +impl From for LsnWrapper { + fn from(value: Lsn) -> Self { + LsnWrapper(value) + } +} + +impl FromSql for LsnWrapper { + fn from_sql( + bytes: ::RawValue<'_>, + ) -> diesel::deserialize::Result { + let byte_arr: diesel::deserialize::Result<[u8; 8]> = bytes + .as_bytes() + .try_into() + .map_err(|_| "Can't obtain lsn from sql".into()); + Ok(LsnWrapper(Lsn(u64::from_be_bytes(byte_arr?)))) + } +} + +impl ToSql for LsnWrapper { + fn to_sql<'b>( + &'b self, + out: &mut diesel::serialize::Output<'b, '_, Pg>, + ) -> diesel::serialize::Result { + out.write_all(&u64::to_be_bytes(self.0.0)) + .map(|_| IsNull::No) + .map_err(Into::into) + } +} + +#[derive(Insertable, AsChangeset, Queryable, Selectable, Clone)] +#[diesel(table_name = crate::schema::timelines)] +pub(crate) struct TimelinePersistence { + pub(crate) tenant_id: String, + pub(crate) timeline_id: String, + pub(crate) start_lsn: LsnWrapper, + pub(crate) generation: i32, + pub(crate) sk_set: Vec, + pub(crate) new_sk_set: Option>, + pub(crate) cplane_notified_generation: i32, + pub(crate) deleted_at: Option>, +} + +/// This is separate from [TimelinePersistence] only because postgres allows NULLs 
+/// in arrays and there is no way to forbid that at schema level. Hence diesel +/// wants `sk_set` to be `Vec>` instead of `Vec` for +/// Queryable/Selectable. It does however allow insertions without redundant +/// Option(s), so [TimelinePersistence] doesn't have them. +#[derive(Queryable, Selectable)] +#[diesel(table_name = crate::schema::timelines)] +pub(crate) struct TimelineFromDb { + pub(crate) tenant_id: String, + pub(crate) timeline_id: String, + pub(crate) start_lsn: LsnWrapper, + pub(crate) generation: i32, + pub(crate) sk_set: Vec>, + pub(crate) new_sk_set: Option>>, + pub(crate) cplane_notified_generation: i32, + pub(crate) deleted_at: Option>, +} + +impl TimelineFromDb { + fn into_persistence(self) -> TimelinePersistence { + // We should never encounter null entries in the sets, but we need to filter them out. + // There is no way to forbid this in the schema that diesel recognizes (to our knowledge). + let sk_set = self.sk_set.into_iter().flatten().collect::>(); + let new_sk_set = self + .new_sk_set + .map(|s| s.into_iter().flatten().collect::>()); + TimelinePersistence { + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, + start_lsn: self.start_lsn, + generation: self.generation, + sk_set, + new_sk_set, + cplane_notified_generation: self.cplane_notified_generation, + deleted_at: self.deleted_at, + } + } +} + +#[derive(Insertable, AsChangeset, Queryable, Selectable, Clone)] +#[diesel(table_name = crate::schema::safekeeper_timeline_pending_ops)] +pub(crate) struct TimelinePendingOpPersistence { + pub(crate) sk_id: i64, + pub(crate) tenant_id: String, + pub(crate) timeline_id: String, + pub(crate) generation: i32, + pub(crate) op_kind: SafekeeperTimelineOpKind, +} + +#[derive(Serialize, Deserialize, FromSqlRow, AsExpression, Eq, PartialEq, Debug, Copy, Clone)] +#[diesel(sql_type = diesel::sql_types::VarChar)] +pub(crate) enum SafekeeperTimelineOpKind { + Pull, + Exclude, + Delete, +} + +impl FromSql for SafekeeperTimelineOpKind { + fn 
from_sql( + bytes: ::RawValue<'_>, + ) -> diesel::deserialize::Result { + let bytes = bytes.as_bytes(); + match core::str::from_utf8(bytes) { + Ok(s) => match s { + "pull" => Ok(SafekeeperTimelineOpKind::Pull), + "exclude" => Ok(SafekeeperTimelineOpKind::Exclude), + "delete" => Ok(SafekeeperTimelineOpKind::Delete), + _ => Err(format!("can't parse: {s}").into()), + }, + Err(e) => Err(format!("invalid UTF-8 for op_kind: {e}").into()), + } + } +} + +impl ToSql for SafekeeperTimelineOpKind { + fn to_sql<'b>( + &'b self, + out: &mut diesel::serialize::Output<'b, '_, Pg>, + ) -> diesel::serialize::Result { + let kind_str = match self { + SafekeeperTimelineOpKind::Pull => "pull", + SafekeeperTimelineOpKind::Exclude => "exclude", + SafekeeperTimelineOpKind::Delete => "delete", + }; + out.write_all(kind_str.as_bytes()) + .map(|_| IsNull::No) + .map_err(Into::into) + } +} diff --git a/storage_controller/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs index bce1a75843..f83191038a 100644 --- a/storage_controller/src/persistence/split_state.rs +++ b/storage_controller/src/persistence/split_state.rs @@ -1,8 +1,8 @@ +use diesel::deserialize::{FromSql, FromSqlRow}; +use diesel::expression::AsExpression; use diesel::pg::{Pg, PgValue}; -use diesel::{ - deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql, - sql_types::Int2, -}; +use diesel::serialize::ToSql; +use diesel::sql_types::Int2; use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)] diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 4f0f170284..9f0b789f19 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,8 @@ -use crate::pageserver_client::PageserverClient; -use crate::persistence::Persistence; -use crate::{compute_hook, service}; +use std::borrow::Cow; +use 
std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use json_structural_diff::JsonDiff; use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ @@ -9,10 +11,6 @@ use pageserver_api::models::{ use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; use reqwest::StatusCode; -use std::borrow::Cow; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::backoff::exponential_backoff; use utils::generation::Generation; @@ -23,7 +21,10 @@ use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; +use crate::pageserver_client::PageserverClient; +use crate::persistence::Persistence; use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation}; +use crate::{compute_hook, service}; const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60); @@ -298,6 +299,7 @@ impl Reconciler { .await }, &self.service_config.pageserver_jwt_token, + &self.service_config.ssl_ca_cert, 1, 3, timeout, @@ -419,7 +421,8 @@ impl Reconciler { node.get_id(), node.base_url(), self.service_config.pageserver_jwt_token.as_deref(), - ); + self.service_config.ssl_ca_cert.clone(), + )?; client .wait_lsn( @@ -442,7 +445,8 @@ impl Reconciler { node.get_id(), node.base_url(), self.service_config.pageserver_jwt_token.as_deref(), - ); + self.service_config.ssl_ca_cert.clone(), + )?; let timelines = client.timeline_list(&tenant_shard_id).await?; Ok(timelines @@ -480,6 +484,7 @@ impl Reconciler { .await }, &self.service_config.pageserver_jwt_token, + &self.service_config.ssl_ca_cert, 1, 3, request_download_timeout * 2, @@ -511,7 +516,8 @@ impl Reconciler { } else if status == StatusCode::ACCEPTED { let total_runtime = started_at.elapsed(); if total_runtime > total_download_timeout { - tracing::warn!("Timed 
out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", + tracing::warn!( + "Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", total_runtime.as_millis(), progress.layers_downloaded, progress.layers_total, @@ -773,6 +779,7 @@ impl Reconciler { .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, &self.service_config.pageserver_jwt_token, + &self.service_config.ssl_ca_cert, 1, 1, Duration::from_secs(5), @@ -1121,6 +1128,7 @@ impl Reconciler { .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, &self.service_config.pageserver_jwt_token, + &self.service_config.ssl_ca_cert, 1, 3, Duration::from_secs(5), diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index 53cd8a908b..2bd28f29af 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -1,16 +1,16 @@ -use std::{str::FromStr, time::Duration}; +use std::time::Duration; use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; -use reqwest::StatusCode; +use reqwest::{Certificate, StatusCode}; use safekeeper_client::mgmt_api; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::NodeId, logging::SecretString}; +use utils::backoff; +use utils::id::NodeId; +use utils::logging::SecretString; -use crate::{ - heartbeater::SafekeeperState, - persistence::{DatabaseError, SafekeeperPersistence}, - safekeeper_client::SafekeeperClient, -}; +use crate::heartbeater::SafekeeperState; +use crate::persistence::{DatabaseError, SafekeeperPersistence}; +use crate::safekeeper_client::SafekeeperClient; #[derive(Clone)] pub struct Safekeeper { @@ -18,26 +18,56 @@ pub struct Safekeeper { cancel: CancellationToken, listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, scheduling_policy: SkSchedulingPolicy, id: NodeId, + /// Heartbeating 
result. availability: SafekeeperState, + + // Flag from storcon's config to use https for safekeeper API. + // Invariant: if |true|, listen_https_port should contain a value. + use_https: bool, } impl Safekeeper { - pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self { - let scheduling_policy = SkSchedulingPolicy::from_str(&skp.scheduling_policy).unwrap(); - Self { + pub(crate) fn from_persistence( + skp: SafekeeperPersistence, + cancel: CancellationToken, + use_https: bool, + ) -> anyhow::Result { + if use_https && skp.https_port.is_none() { + anyhow::bail!( + "cannot load safekeeper {} from persistence: \ + https is enabled, but https port is not specified", + skp.id, + ); + } + + let scheduling_policy = skp.scheduling_policy.0; + Ok(Self { cancel, listen_http_addr: skp.host.clone(), listen_http_port: skp.http_port as u16, + listen_https_port: skp.https_port.map(|x| x as u16), id: NodeId(skp.id as u64), skp, availability: SafekeeperState::Offline, scheduling_policy, - } + use_https, + }) } + pub(crate) fn base_url(&self) -> String { - format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + if self.use_https { + format!( + "https://{}:{}", + self.listen_http_addr, + self.listen_https_port + .expect("https port should be specified if use_https is on"), + ) + } else { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } } pub(crate) fn get_id(&self) -> NodeId { @@ -54,13 +84,18 @@ impl Safekeeper { } pub(crate) fn set_scheduling_policy(&mut self, scheduling_policy: SkSchedulingPolicy) { self.scheduling_policy = scheduling_policy; - self.skp.scheduling_policy = String::from(scheduling_policy); + self.skp.scheduling_policy = scheduling_policy.into(); + } + pub(crate) fn availability(&self) -> SafekeeperState { + self.availability.clone() } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries + #[allow(clippy::too_many_arguments)] pub(crate) async fn 
with_client_retries( &self, mut op: O, jwt: &Option, + ssl_ca_cert: &Option, warn_threshold: u32, max_retries: u32, timeout: Duration, @@ -79,19 +114,22 @@ impl Safekeeper { | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, ApiError(_, _) => true, Cancelled => true, + CreateClient(_) => true, } } + // TODO: refactor SafekeeperClient and with_client_retires (#11113). + let mut http_client = reqwest::Client::builder().timeout(timeout); + if let Some(ssl_ca_cert) = ssl_ca_cert.as_ref() { + http_client = http_client.add_root_certificate(ssl_ca_cert.clone()); + } + let http_client = http_client.build().map_err(mgmt_api::Error::CreateClient)?; + backoff::retry( || { - let http_client = reqwest::ClientBuilder::new() - .timeout(timeout) - .build() - .expect("Failed to construct HTTP client"); - - let client = SafekeeperClient::from_client( + let client = SafekeeperClient::new( self.get_id(), - http_client, + http_client.clone(), self.base_url(), jwt.clone(), ); @@ -112,8 +150,9 @@ impl Safekeeper { warn_threshold, max_retries, &format!( - "Call to safekeeper {} ({}:{}) management API", - self.id, self.listen_http_addr, self.listen_http_port + "Call to safekeeper {} ({}) management API", + self.id, + self.base_url(), ), cancel, ) @@ -121,12 +160,16 @@ impl Safekeeper { .unwrap_or(Err(mgmt_api::Error::Cancelled)) } - pub(crate) fn update_from_record(&mut self, record: crate::persistence::SafekeeperUpsert) { + pub(crate) fn update_from_record( + &mut self, + record: crate::persistence::SafekeeperUpsert, + ) -> anyhow::Result<()> { let crate::persistence::SafekeeperUpsert { active: _, availability_zone_id: _, host, http_port, + https_port, id, port: _, region_id: _, @@ -139,9 +182,17 @@ impl Safekeeper { self.id.0 ); } + if self.use_https && https_port.is_none() { + anyhow::bail!( + "cannot update safekeeper {id}: \ + https is enabled, but https port is not specified" + ); + } self.skp = crate::persistence::SafekeeperPersistence::from_upsert(record, self.scheduling_policy); 
self.listen_http_port = http_port as u16; + self.listen_https_port = https_port.map(|x| x as u16); self.listen_http_addr = host; + Ok(()) } } diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index f234ab3429..a44fcc27d2 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -1,13 +1,12 @@ -use crate::metrics::PageserverRequestLabelGroup; use safekeeper_api::models::{ - PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, TimelineStatus, }; use safekeeper_client::mgmt_api::{Client, Result}; -use utils::{ - id::{NodeId, TenantId, TimelineId}, - logging::SecretString, -}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::logging::SecretString; + +use crate::metrics::PageserverRequestLabelGroup; /// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. @@ -46,31 +45,18 @@ macro_rules! 
measured_request { } impl SafekeeperClient { - #[allow(dead_code)] pub(crate) fn new( - node_id: NodeId, - mgmt_api_endpoint: String, - jwt: Option, - ) -> Self { - Self { - inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), - node_id_label: node_id.0.to_string(), - } - } - - pub(crate) fn from_client( node_id: NodeId, raw_client: reqwest::Client, mgmt_api_endpoint: String, jwt: Option, ) -> Self { Self { - inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + inner: Client::new(raw_client, mgmt_api_endpoint, jwt), node_id_label: node_id.0.to_string(), } } - #[allow(dead_code)] pub(crate) async fn create_timeline( &self, req: &TimelineCreateRequest, @@ -83,12 +69,28 @@ impl SafekeeperClient { ) } - #[allow(dead_code)] + #[allow(unused)] + pub(crate) async fn exclude_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + measured_request!( + "exclude_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .exclude_timeline(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn delete_timeline( &self, tenant_id: TenantId, timeline_id: TimelineId, - ) -> Result { + ) -> Result { measured_request!( "delete_timeline", crate::metrics::Method::Delete, @@ -97,7 +99,6 @@ impl SafekeeperClient { ) } - #[allow(dead_code)] pub(crate) async fn pull_timeline( &self, req: &PullTimelineRequest, @@ -110,6 +111,23 @@ impl SafekeeperClient { ) } + #[allow(unused)] + pub(crate) async fn bump_timeline_term( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineTermBumpRequest, + ) -> Result { + measured_request!( + "term_bump", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .bump_timeline_term(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/scheduler.rs 
b/storage_controller/src/scheduler.rs index 44936d018a..3d5f36fb98 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,11 +1,17 @@ -use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard}; +use std::collections::HashMap; +use std::fmt::Debug; + use http_utils::error::ApiError; use itertools::Itertools; -use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; +use pageserver_api::controller_api::AvailabilityZone; +use pageserver_api::models::PageserverUtilization; use serde::Serialize; -use std::{collections::HashMap, fmt::Debug}; use utils::id::NodeId; +use crate::metrics::NodeLabelGroup; +use crate::node::Node; +use crate::tenant_shard::TenantShard; + /// Scenarios in which we cannot find a suitable location for a tenant shard #[derive(thiserror::Error, Debug)] pub enum ScheduleError { @@ -403,13 +409,14 @@ impl ScheduleContext { } } -pub(crate) enum RefCountUpdate { +pub(crate) enum RefCountUpdate<'a> { PromoteSecondary, Attach, Detach, DemoteAttached, AddSecondary, RemoveSecondary, + ChangePreferredAzFrom(Option<&'a AvailabilityZone>), } impl Scheduler { @@ -572,6 +579,14 @@ impl Scheduler { node.home_shard_count -= 1; } } + RefCountUpdate::ChangePreferredAzFrom(old_az) => { + if Some(&node.az) == old_az { + node.home_shard_count -= 1; + } + if is_home_az { + node.home_shard_count += 1; + } + } } // Maybe update PageserverUtilization @@ -588,7 +603,8 @@ impl Scheduler { RefCountUpdate::PromoteSecondary | RefCountUpdate::Detach | RefCountUpdate::RemoveSecondary - | RefCountUpdate::DemoteAttached => { + | RefCountUpdate::DemoteAttached + | RefCountUpdate::ChangePreferredAzFrom(_) => { // De-referencing the node: leave the utilization's shard_count at a stale higher // value until some future heartbeat after we have physically removed this shard // from the node: this prevents the scheduler over-optimistically trying to schedule @@ -775,10 +791,10 @@ impl Scheduler { if 
!matches!(context.mode, ScheduleMode::Speculative) { tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", - scores.iter().map(|i| i.node_id().0).collect::>(), - preferred_az, - ); + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", + scores.iter().map(|i| i.node_id().0).collect::>(), + preferred_az, + ); } // Note that we do not update shard count here to reflect the scheduling: that @@ -906,14 +922,14 @@ impl Scheduler { #[cfg(test)] pub(crate) mod test_utils { - use crate::node::Node; - use pageserver_api::{ - controller_api::{AvailabilityZone, NodeAvailability}, - models::utilization::test_utilization, - }; use std::collections::HashMap; + + use pageserver_api::controller_api::{AvailabilityZone, NodeAvailability}; + use pageserver_api::models::utilization::test_utilization; use utils::id::NodeId; + use crate::node::Node; + /// Test helper: synthesize the requested number of nodes, all in active state. /// /// Node IDs start at one. 
@@ -951,17 +967,13 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use pageserver_api::{ - controller_api::NodeAvailability, models::utilization::test_utilization, - shard::ShardIdentity, - }; - use utils::{ - id::TenantId, - shard::{ShardCount, ShardNumber, TenantShardId}, - }; + use pageserver_api::controller_api::NodeAvailability; + use pageserver_api::models::utilization::test_utilization; + use pageserver_api::shard::ShardIdentity; + use utils::id::TenantId; + use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use super::*; - use crate::tenant_shard::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { @@ -1533,4 +1545,67 @@ mod tests { shard.intent.clear(&mut scheduler); } } + + #[test] + fn change_preferred_az() { + let az_a = AvailabilityZone("az-a".to_string()); + let az_b = AvailabilityZone("az-b".to_string()); + + // 2 nodes: 1 az_a and 1 az_b. + let nodes = test_utils::make_test_nodes(2, &[az_a.clone(), az_b.clone()]); + let mut scheduler = Scheduler::new(nodes.values()); + + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::generate(), + shard_number: ShardNumber(0), + shard_count: ShardCount(1), + }; + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + // 1 attached and 1 secondary. + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + Some(az_a.clone()), + ); + + let mut context = ScheduleContext::default(); + shard.schedule(&mut scheduler, &mut context).unwrap(); + eprintln!("Scheduled shard at {:?}", shard.intent); + + for node in scheduler.nodes.values() { + // Only 2 nodes, one tenant shard should be scheduled on each of them. 
+ assert_eq!(node.shard_count, 1); + if node.az == az_a { + assert_eq!(node.home_shard_count, 1); + } else { + assert_eq!(node.home_shard_count, 0); + } + } + + shard.set_preferred_az(&mut scheduler, Some(az_b.clone())); + // Home AZ flipped. + for node in scheduler.nodes.values() { + assert_eq!(node.shard_count, 1); + if node.az == az_a { + assert_eq!(node.home_shard_count, 0); + } else { + assert_eq!(node.home_shard_count, 1); + } + } + + shard.set_preferred_az(&mut scheduler, None); + // No home AZ. + for node in scheduler.nodes.values() { + assert_eq!(node.shard_count, 1); + assert_eq!(node.home_shard_count, 0); + } + + shard.intent.clear(&mut scheduler); + } } diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 361253bd19..9b36376fcb 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -1,5 +1,11 @@ // @generated automatically by Diesel CLI. +pub mod sql_types { + #[derive(diesel::query_builder::QueryId, diesel::sql_types::SqlType)] + #[diesel(postgres_type(name = "pg_lsn", schema = "pg_catalog"))] + pub struct PgLsn; +} + diesel::table! { controllers (address, started_at) { address -> Varchar, @@ -30,6 +36,16 @@ diesel::table! { } } +diesel::table! { + safekeeper_timeline_pending_ops (tenant_id, timeline_id, sk_id) { + sk_id -> Int8, + tenant_id -> Varchar, + timeline_id -> Varchar, + generation -> Int4, + op_kind -> Varchar, + } +} + diesel::table! { safekeepers (id) { id -> Int8, @@ -40,6 +56,7 @@ diesel::table! { http_port -> Int4, availability_zone_id -> Text, scheduling_policy -> Varchar, + https_port -> Nullable, } } @@ -59,10 +76,28 @@ diesel::table! { } } +diesel::table! 
{ + use diesel::sql_types::*; + use super::sql_types::PgLsn; + + timelines (tenant_id, timeline_id) { + tenant_id -> Varchar, + timeline_id -> Varchar, + start_lsn -> PgLsn, + generation -> Int4, + sk_set -> Array>, + new_sk_set -> Nullable>>, + cplane_notified_generation -> Int4, + deleted_at -> Nullable, + } +} + diesel::allow_tables_to_appear_in_same_query!( controllers, metadata_health, nodes, + safekeeper_timeline_pending_ops, safekeepers, tenant_shards, + timelines, ); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index b9c2711192..4e00136e1b 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,112 +1,104 @@ pub mod chaos_injector; mod context_iterator; +pub(crate) mod safekeeper_reconciler; -use hyper::Uri; -use safekeeper_api::models::SafekeeperUtilization; -use std::{ - borrow::Cow, - cmp::Ordering, - collections::{BTreeMap, HashMap, HashSet}, - error::Error, - ops::Deref, - path::PathBuf, - str::FromStr, - sync::Arc, - time::{Duration, Instant}, -}; +use std::borrow::Cow; +use std::cmp::Ordering; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::error::Error; +use std::num::NonZeroU32; +use std::ops::{Deref, DerefMut}; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant}; -use crate::{ - background_node_operations::{ - Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, - }, - compute_hook::{self, NotifyError}, - drain_utils::{self, TenantShardDrain, TenantShardIterator}, - heartbeater::SafekeeperState, - id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, - leadership::Leadership, - metrics, - peer_client::GlobalObservedState, - persistence::{ - AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, - ShardGenerationState, TenantFilter, - }, - reconciler::{ - ReconcileError, ReconcileUnits, ReconcilerConfig, 
ReconcilerConfigBuilder, - ReconcilerPriority, - }, - safekeeper::Safekeeper, - scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, - tenant_shard::{ - MigrateAttachment, ObservedStateDelta, ReconcileNeeded, ReconcilerStatus, - ScheduleOptimization, ScheduleOptimizationAction, - }, -}; use anyhow::Context; +use context_iterator::TenantShardContextIterator; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, }; use diesel::result::DatabaseErrorKind; -use futures::{stream::FuturesUnordered, StreamExt}; -use itertools::Itertools; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, - NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, - SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, - TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, - TenantShardMigrateResponse, - }, - models::{ - SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest, - TimelineArchivalConfigRequest, TopTenantShardsRequest, - }, -}; -use reqwest::StatusCode; -use tracing::{instrument, Instrument}; - -use crate::pageserver_client::PageserverClient; +use futures::StreamExt; +use futures::stream::FuturesUnordered; use http_utils::error::ApiError; -use pageserver_api::{ - models::{ - self, LocationConfig, LocationConfigListResponse, LocationConfigMode, - PageserverUtilization, ShardParameters, TenantConfig, TenantLocationConfigRequest, - TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, - TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, - }, - shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, 
TenantShardId}, - upcall_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateResponse, ValidateResponseTenant, - }, +use hyper::Uri; +use itertools::Itertools; +use pageserver_api::controller_api::{ + AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, + NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, + SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, }; -use pageserver_client::{mgmt_api, BlockUnblock}; -use tokio::sync::{mpsc::error::TrySendError, TryAcquireError}; +use pageserver_api::models::{ + self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, + PageserverUtilization, SafekeeperInfo, SafekeepersInfo, SecondaryProgress, ShardParameters, + TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, + TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantSorting, TenantTimeTravelRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateResponseStorcon, + TimelineInfo, TopTenantShardItem, TopTenantShardsRequest, +}; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; +use pageserver_api::upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, + ValidateResponseTenant, +}; +use pageserver_client::{BlockUnblock, mgmt_api}; +use reqwest::{Certificate, StatusCode}; +use safekeeper_api::membership::{MemberSet, SafekeeperId}; +use safekeeper_api::models::SafekeeperUtilization; +use 
safekeeper_reconciler::{SafekeeperReconcilers, ScheduleRequest}; +use tokio::sync::TryAcquireError; +use tokio::sync::mpsc::error::TrySendError; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use utils::{ - completion::Barrier, - failpoint_support, - generation::Generation, - id::{NodeId, TenantId, TimelineId}, - pausable_failpoint, - sync::gate::Gate, -}; +use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; +use utils::completion::Barrier; +use utils::generation::Generation; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::logging::SecretString; +use utils::sync::gate::Gate; +use utils::{failpoint_support, pausable_failpoint}; -use crate::{ - compute_hook::ComputeHook, - heartbeater::{Heartbeater, PageserverState}, - node::{AvailabilityTransition, Node}, - persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, - reconciler::attached_location_conf, - scheduler::Scheduler, - tenant_shard::{ - IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, - ReconcilerWaiter, TenantShard, - }, +use crate::background_node_operations::{ + Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler, +}; +use crate::compute_hook::{self, ComputeHook, NotifyError}; +use crate::drain_utils::{self, TenantShardDrain, TenantShardIterator}; +use crate::heartbeater::{Heartbeater, PageserverState, SafekeeperState}; +use crate::id_lock_map::{ + IdLockMap, TracingExclusiveGuard, trace_exclusive_lock, trace_shared_lock, +}; +use crate::leadership::Leadership; +use crate::metrics; +use crate::node::{AvailabilityTransition, Node}; +use crate::pageserver_client::PageserverClient; +use crate::peer_client::GlobalObservedState; +use crate::persistence::split_state::SplitState; +use crate::persistence::{ + AbortShardSplitStatus, ControllerPersistence, DatabaseError, DatabaseResult, + MetadataHealthPersistence, Persistence, 
SafekeeperTimelineOpKind, ShardGenerationState, + TenantFilter, TenantShardPersistence, TimelinePendingOpPersistence, TimelinePersistence, +}; +use crate::reconciler::{ + ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, ReconcilerPriority, + attached_location_conf, +}; +use crate::safekeeper::Safekeeper; +use crate::scheduler::{ + AttachedShardTag, MaySchedule, ScheduleContext, ScheduleError, ScheduleMode, Scheduler, +}; +use crate::tenant_shard::{ + IntentState, MigrateAttachment, ObservedState, ObservedStateDelta, ObservedStateLocation, + ReconcileNeeded, ReconcileResult, ReconcileWaitError, ReconcilerStatus, ReconcilerWaiter, + ScheduleOptimization, ScheduleOptimizationAction, TenantShard, }; - -use context_iterator::TenantShardContextIterator; const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); @@ -216,6 +208,8 @@ struct ServiceState { safekeepers: Arc>, + safekeeper_reconcilers: SafekeeperReconcilers, + scheduler: Scheduler, /// Ongoing background operation on the cluster if any is running. @@ -276,6 +270,7 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { ApiError::Conflict(format!("{node} {status}: {status} {msg}")) } mgmt_api::Error::Cancelled => ApiError::ShuttingDown, + mgmt_api::Error::CreateClient(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), } } @@ -287,6 +282,7 @@ impl ServiceState { scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, initial_leadership_status: LeadershipStatus, + reconcilers_cancel: CancellationToken, ) -> Self { metrics::update_leadership_status(initial_leadership_status); @@ -295,6 +291,7 @@ impl ServiceState { tenants, nodes: Arc::new(nodes), safekeepers: Arc::new(safekeepers), + safekeeper_reconcilers: SafekeeperReconcilers::new(reconcilers_cancel), scheduler, ongoing_operation: None, delayed_reconcile_rx, @@ -366,6 +363,15 @@ pub struct Config { /// assume it is running in a test environment and try to update neon_local. 
pub compute_hook_url: Option, + /// Prefix for storage API endpoints of the control plane. We use this prefix to compute + /// URLs that we use to send pageserver and safekeeper attachment locations. + /// If this is None, the compute hook will assume it is running in a test environment + /// and try to invoke neon_local instead. + /// + /// For now, there is also `compute_hook_url` which allows configuration of the pageserver + /// specific endpoint, but it is in the process of being phased out. + pub control_plane_url: Option, + /// Grace period within which a pageserver does not respond to heartbeats, but is still /// considered active. Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. @@ -382,8 +388,16 @@ pub struct Config { /// How many high-priority Reconcilers may be spawned concurrently pub priority_reconciler_concurrency: usize, - /// How large must a shard grow in bytes before we split it? - /// None disables auto-splitting. + /// How many API requests per second to allow per tenant, across all + /// tenant-scoped API endpoints. Further API requests queue until ready. + pub tenant_rate_limit: NonZeroU32, + + /// The size at which an unsharded tenant should be split (into 8 shards). This uses the logical + /// size of the largest timeline in the shard (i.e. max_logical_size). + /// + /// None or 0 disables auto-splitting. + /// + /// TODO: consider using total logical size of all timelines instead. 
pub split_threshold: Option, // TODO: make this cfg(feature = "testing") @@ -406,6 +420,12 @@ pub struct Config { pub long_reconcile_threshold: Duration, pub use_https_pageserver_api: bool, + + pub use_https_safekeeper_api: bool, + + pub ssl_ca_cert: Option, + + pub timelines_onto_safekeepers: bool, } impl From for ApiError { @@ -744,7 +764,27 @@ impl Service { std::process::exit(1); } - self.inner.write().unwrap().become_leader(); + let safekeepers = self.inner.read().unwrap().safekeepers.clone(); + let sk_schedule_requests = + match safekeeper_reconciler::load_schedule_requests(self, &safekeepers).await { + Ok(v) => v, + Err(e) => { + tracing::warn!( + "Failed to load safekeeper pending ops at startup: {e}." // Don't abort for now: " Aborting start-up..." + ); + // std::process::exit(1); + Vec::new() + } + }; + + { + let mut locked = self.inner.write().unwrap(); + locked.become_leader(); + + locked + .safekeeper_reconcilers + .schedule_request_vec(self, sk_schedule_requests); + } // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that // generation_pageserver in the database. 
@@ -787,7 +827,9 @@ impl Service { }); } - tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + tracing::info!( + "Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)" + ); } async fn initial_heartbeat_round<'a>( @@ -888,6 +930,7 @@ impl Service { .with_client_retries( |client| async move { client.list_location_config().await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 5, timeout, @@ -985,11 +1028,20 @@ impl Service { break; } - let client = PageserverClient::new( + let client = match PageserverClient::new( node.get_id(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - ); + self.config.ssl_ca_cert.clone(), + ) { + Ok(client) => client, + Err(e) => { + tracing::error!( + "Failed to create client to detach unknown shard {tenant_shard_id} on pageserver {node_id}: {e}" + ); + continue; + } + }; match client .location_config( tenant_shard_id, @@ -1016,7 +1068,7 @@ impl Service { // Non-fatal error: leaving a tenant shard behind that we are not managing shouldn't // break anything. tracing::error!( - "Failed to detach unknkown shard {tenant_shard_id} on pageserver {node_id}: {e}" + "Failed to detach unknown shard {tenant_shard_id} on pageserver {node_id}: {e}" ); } } @@ -1049,6 +1101,7 @@ impl Service { } } } + /// Heartbeat all storage nodes once in a while. 
#[instrument(skip_all)] async fn spawn_heartbeat_driver(&self) { self.startup_complete.clone().wait().await; @@ -1182,7 +1235,9 @@ impl Service { let mut safekeepers = (*locked.safekeepers).clone(); for (id, state) in deltas.0 { let Some(sk) = safekeepers.get_mut(&id) else { - tracing::info!("Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}"); + tracing::info!( + "Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}" + ); continue; }; sk.set_availability(state); @@ -1422,8 +1477,14 @@ impl Service { .list_safekeepers() .await? .into_iter() - .map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new())) - .collect::>(); + .map(|skp| { + Safekeeper::from_persistence( + skp, + CancellationToken::new(), + config.use_https_safekeeper_api, + ) + }) + .collect::>>()?; let safekeepers: HashMap = safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); @@ -1537,7 +1598,9 @@ impl Service { // If a node was removed before being completely drained, it is legal for it to leave behind a `generation_pageserver` referring // to a non-existent node, because node deletion doesn't block on completing the reconciliations that will issue new generations // on different pageservers. 
- tracing::warn!("Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled"); + tracing::warn!( + "Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled" + ); } } let new_tenant = TenantShard::from_persistent(tsp, intent)?; @@ -1559,6 +1622,7 @@ impl Service { let heartbeater_ps = Heartbeater::new( config.pageserver_jwt_token.clone(), + config.ssl_ca_cert.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), @@ -1566,6 +1630,7 @@ impl Service { let heartbeater_sk = Heartbeater::new( config.safekeeper_jwt_token.clone(), + config.ssl_ca_cert.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), @@ -1585,6 +1650,7 @@ impl Service { scheduler, delayed_reconcile_rx, initial_leadership_status, + reconcilers_cancel.clone(), ))), config: config.clone(), persistence, @@ -1867,7 +1933,7 @@ impl Service { } Ok(AttachHookResponse { - gen: attach_req + generation: attach_req .node_id .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) @@ -1913,6 +1979,7 @@ impl Service { .with_client_retries( |client| async move { client.list_location_config().await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -1937,21 +2004,41 @@ impl Service { tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len()); let mut cleanup = Vec::new(); + let mut mismatched_locations = 0; { let mut locked = self.inner.write().unwrap(); - for (tenant_shard_id, observed_loc) in configs.tenant_shards { + for (tenant_shard_id, reported) in configs.tenant_shards { let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else { cleanup.push(tenant_shard_id); continue; }; - tenant_shard + + let on_record = &mut tenant_shard .observed .locations - .insert(node.get_id(), 
ObservedStateLocation { conf: observed_loc }); + .entry(node.get_id()) + .or_insert_with(|| ObservedStateLocation { conf: None }) + .conf; + + // If the location reported by the node does not match our observed state, + // then we mark it as uncertain and let the background reconciliation loop + // deal with it. + // + // Note that this also covers net new locations reported by the node. + if *on_record != reported { + mismatched_locations += 1; + *on_record = None; + } } } + if mismatched_locations > 0 { + tracing::info!( + "Set observed state to None for {mismatched_locations} mismatched locations" + ); + } + for tenant_shard_id in cleanup { tracing::info!("Detaching {tenant_shard_id}"); match node @@ -1971,6 +2058,7 @@ impl Service { .await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -2039,7 +2127,7 @@ impl Service { let new_gen = *new_gen; response.tenants.push(ReAttachResponseTenant { id: *tenant_shard_id, - gen: Some(new_gen.into().unwrap()), + r#gen: Some(new_gen.into().unwrap()), // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`] // execution. 
If a pageserver is restarted during that process, then the reconcile pass will // fail, and start from scratch, so it doesn't make sense for us to try and preserve @@ -2076,7 +2164,7 @@ impl Service { response.tenants.push(ReAttachResponseTenant { id: *tenant_shard_id, - gen: None, + r#gen: None, mode: LocationConfigMode::Secondary, }); @@ -2138,15 +2226,19 @@ impl Service { let locked = self.inner.read().unwrap(); for req_tenant in validate_req.tenants { if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.r#gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, - req_tenant.gen, + req_tenant.r#gen, tenant_shard.generation ); - in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid)); + in_memory_result.push(( + req_tenant.id, + Generation::new(req_tenant.r#gen), + valid, + )); } else { // This is legal: for example during a shard split the pageserver may still // have deletions in its queue from the old pre-split shard, or after deletion @@ -2165,13 +2257,11 @@ impl Service { // in case of controller split-brain, where some other controller process might have incremented the generation. let db_generations = self .persistence - .shard_generations(in_memory_result.iter().filter_map(|i| { - if i.2 { - Some(&i.0) - } else { - None - } - })) + .shard_generations( + in_memory_result + .iter() + .filter_map(|i| if i.2 { Some(&i.0) } else { None }), + ) .await?; let db_generations = db_generations.into_iter().collect::>(); @@ -2323,7 +2413,9 @@ impl Service { // Unique key violation: this is probably a retry. Because the shard count is part of the unique key, // if we see a unique key violation it means that the creation request's shard count matches the previous // creation's shard count. 
- tracing::info!("Tenant shards already present in database, proceeding with idempotent creation..."); + tracing::info!( + "Tenant shards already present in database, proceeding with idempotent creation..." + ); } // Any other database error is unexpected and a bug. Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), @@ -3004,7 +3096,7 @@ impl Service { None => { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), - )) + )); } } }; @@ -3071,7 +3163,9 @@ impl Service { }) .find(|(_, _, mode)| *mode != LocationConfigMode::Detached); if let Some((node_id, _observed_location, mode)) = maybe_attached { - return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}"))); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}" + ))); } } let scheduler = &mut locked.scheduler; @@ -3108,7 +3202,9 @@ impl Service { node.get_id(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - ); + self.config.ssl_ca_cert.clone(), + ) + .map_err(|e| passthrough_api_error(&node, e))?; tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); @@ -3169,7 +3265,9 @@ impl Service { node.get_id(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - ); + self.config.ssl_ca_cert.clone(), + ) + .map_err(|e| passthrough_api_error(&node, e))?; futs.push(async move { let result = client .tenant_secondary_download(tenant_shard_id, wait) @@ -3292,6 +3390,7 @@ impl Service { .await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 3, RECONCILE_TIMEOUT, @@ -3413,7 +3512,7 @@ impl Service { Ok(()) } - pub(crate) async fn tenant_timeline_create( + pub(crate) async fn tenant_timeline_create_pageservers( &self, tenant_id: TenantId, mut create_req: TimelineCreateRequest, @@ -3424,14 +3523,6 @@ impl Service { 
create_req.new_timeline_id, ); - let _tenant_lock = trace_shared_lock( - &self.tenant_op_locks, - tenant_id, - TenantOperations::TimelineCreate, - ) - .await; - failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock"); - self.tenant_remote_mutation(tenant_id, move |mut targets| async move { if targets.0.is_empty() { return Err(ApiError::NotFound( @@ -3447,6 +3538,7 @@ impl Service { tenant_shard_id: TenantShardId, locations: ShardMutationLocations, jwt: Option, + ssl_ca_cert: Option, create_req: TimelineCreateRequest, ) -> Result { let latest = locations.latest.node; @@ -3459,7 +3551,8 @@ impl Service { ); let client = - PageserverClient::new(latest.get_id(), latest.base_url(), jwt.as_deref()); + PageserverClient::new(latest.get_id(), latest.base_url(), jwt.as_deref(), ssl_ca_cert.clone()) + .map_err(|e| passthrough_api_error(&latest, e))?; let timeline_info = client .timeline_create(tenant_shard_id, &create_req) @@ -3482,7 +3575,9 @@ impl Service { location.node.get_id(), location.node.base_url(), jwt.as_deref(), - ); + ssl_ca_cert.clone(), + ) + .map_err(|e| passthrough_api_error(&location.node, e))?; let res = client .timeline_create(tenant_shard_id, &create_req) @@ -3511,6 +3606,7 @@ impl Service { shard_zero_tid, shard_zero_locations, self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), create_req.clone(), ) .await?; @@ -3540,6 +3636,7 @@ impl Service { tenant_shard_id, mutation_locations, jwt.clone(), + self.config.ssl_ca_cert.clone(), create_req, )) }, @@ -3552,6 +3649,323 @@ impl Service { .await? } + /// Timeline creation on safekeepers + /// + /// Returns `Ok(left)` if the timeline has been created on a quorum of safekeepers, + /// where `left` contains the list of safekeepers that didn't have a successful response. + /// Assumes tenant lock is held while calling this function. 
+ async fn tenant_timeline_create_safekeepers_quorum( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: u32, + timeline_persistence: &TimelinePersistence, + ) -> Result, ApiError> { + // If quorum is reached, return if we are outside of a specified timeout + let jwt = self + .config + .safekeeper_jwt_token + .clone() + .map(SecretString::from); + let mut joinset = JoinSet::new(); + + let safekeepers = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + + let mut members = Vec::new(); + for sk_id in timeline_persistence.sk_set.iter() { + let sk_id = NodeId(*sk_id as u64); + let Some(safekeeper) = safekeepers.get(&sk_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "couldn't find entry for safekeeper with id {sk_id}" + )))?; + }; + members.push(SafekeeperId { + id: sk_id, + host: safekeeper.skp.host.clone(), + pg_port: safekeeper.skp.port as u16, + }); + } + let mset = MemberSet::new(members).map_err(ApiError::InternalServerError)?; + let mconf = safekeeper_api::membership::Configuration::new(mset); + + let req = safekeeper_api::models::TimelineCreateRequest { + commit_lsn: None, + mconf, + pg_version, + start_lsn: timeline_persistence.start_lsn.0, + system_id: None, + tenant_id, + timeline_id, + wal_seg_size: None, + }; + const SK_CREATE_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + for sk in timeline_persistence.sk_set.iter() { + let sk_id = NodeId(*sk as u64); + let safekeepers = safekeepers.clone(); + let jwt = jwt.clone(); + let ssl_ca_cert = self.config.ssl_ca_cert.clone(); + let req = req.clone(); + joinset.spawn(async move { + // Unwrap is fine as we already would have returned error above + let sk_p = safekeepers.get(&sk_id).unwrap(); + let res = sk_p + .with_client_retries( + |client| { + let req = req.clone(); + async move { client.create_timeline(&req).await } + }, + &jwt, + &ssl_ca_cert, + 3, + 3, + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, + 
&CancellationToken::new(), + ) + .await; + (sk_id, sk_p.skp.host.clone(), res) + }); + } + // After we have built the joinset, we now wait for the tasks to complete, + // but with a specified timeout to make sure we return swiftly, either with + // a failure or success. + let reconcile_deadline = tokio::time::Instant::now() + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT; + + // Wait until all tasks finish or timeout is hit, whichever occurs + // first. + let mut reconcile_results = Vec::new(); + loop { + if let Ok(res) = tokio::time::timeout_at(reconcile_deadline, joinset.join_next()).await + { + let Some(res) = res else { break }; + match res { + Ok(res) => { + tracing::info!( + "response from safekeeper id:{} at {}: {:?}", + res.0, + res.1, + res.2 + ); + reconcile_results.push(res); + } + Err(join_err) => { + tracing::info!("join_err for task in joinset: {join_err}"); + } + } + } else { + tracing::info!( + "timeout for creation call after {} responses", + reconcile_results.len() + ); + break; + } + } + + // Now check now if quorum was reached in reconcile_results. + let total_result_count = reconcile_results.len(); + let remaining = reconcile_results + .into_iter() + .filter_map(|res| res.2.is_err().then_some(res.0)) + .collect::>(); + tracing::info!( + "Got {} non-successful responses from initial creation request of total {total_result_count} responses", + remaining.len() + ); + if remaining.len() >= 2 { + // Failure + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "not enough successful reconciliations to reach quorum, please retry: {} errored", + remaining.len() + ))); + } + + Ok(remaining) + } + + /// Create timeline in controller database and on safekeepers. + /// `timeline_info` is result of timeline creation on pageserver. + /// + /// All actions must be idempotent as the call is retried until success. 
It + /// tries to create timeline in the db and on at least majority of + /// safekeepers + queue creation for safekeepers which missed it in the db + /// for infinite retries; after that, call returns Ok. + /// + /// The idea is that once this is reached as long as we have alive majority + /// of safekeepers it is expected to get eventually operational as storcon + /// will be able to seed timeline on nodes which missed creation by making + /// pull_timeline from peers. On the other hand we don't want to fail + /// timeline creation if one safekeeper is down. + async fn tenant_timeline_create_safekeepers( + self: &Arc, + tenant_id: TenantId, + timeline_info: &TimelineInfo, + create_mode: models::TimelineCreateRequestMode, + ) -> Result { + let timeline_id = timeline_info.timeline_id; + let pg_version = timeline_info.pg_version; + // Initially start_lsn is determined by last_record_lsn in pageserver + // response as it does initdb. However, later we persist it and in sk + // creation calls replace with the value from the timeline row if it + // previously existed as on retries in theory endpoint might have + // already written some data and advanced last_record_lsn, while we want + // safekeepers to have consistent start_lsn. + let start_lsn = match create_mode { + models::TimelineCreateRequestMode::Bootstrap { .. } => timeline_info.last_record_lsn, + models::TimelineCreateRequestMode::Branch { .. } => timeline_info.last_record_lsn, + models::TimelineCreateRequestMode::ImportPgdata { .. 
} => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "import pgdata doesn't specify the start lsn, aborting creation on safekeepers" + )))?; + } + }; + // Choose initial set of safekeepers respecting affinity + let sks = self.safekeepers_for_new_timeline().await?; + let sks_persistence = sks.iter().map(|sk| sk.id.0 as i64).collect::>(); + // Add timeline to db + let mut timeline_persist = TimelinePersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + start_lsn: start_lsn.into(), + generation: 0, + sk_set: sks_persistence.clone(), + new_sk_set: None, + cplane_notified_generation: 0, + deleted_at: None, + }; + let inserted = self + .persistence + .insert_timeline(timeline_persist.clone()) + .await?; + if !inserted { + if let Some(existent_persist) = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await? + { + // Replace with what we have in the db, to get stuff like the generation right. + // We do still repeat the http calls to the safekeepers. After all, we could have + // crashed right after the wrote to the DB. 
+ timeline_persist = existent_persist; + } else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "insertion said timeline already in db, but looking it up, it was gone" + ))); + } + } + // Create the timeline on a quorum of safekeepers + let remaining = self + .tenant_timeline_create_safekeepers_quorum( + tenant_id, + timeline_id, + pg_version, + &timeline_persist, + ) + .await?; + + // For the remaining safekeepers, take care of their reconciliation asynchronously + for &remaining_id in remaining.iter() { + let pending_op = TimelinePendingOpPersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: timeline_persist.generation, + op_kind: crate::persistence::SafekeeperTimelineOpKind::Pull, + sk_id: remaining_id.0 as i64, + }; + tracing::info!("writing pending op for sk id {remaining_id}"); + self.persistence.insert_pending_op(pending_op).await?; + } + if !remaining.is_empty() { + let mut locked = self.inner.write().unwrap(); + for remaining_id in remaining { + let Some(sk) = locked.safekeepers.get(&remaining_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {remaining_id}" + ))); + }; + let Ok(host_list) = sks + .iter() + .map(|sk| { + Ok(( + sk.id, + locked + .safekeepers + .get(&sk.id) + .ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {remaining_id} to pull from" + )) + })? 
+ .base_url(), + )) + }) + .collect::>() + else { + continue; + }; + let req = ScheduleRequest { + safekeeper: Box::new(sk.clone()), + host_list, + tenant_id, + timeline_id, + generation: timeline_persist.generation as u32, + kind: crate::persistence::SafekeeperTimelineOpKind::Pull, + }; + locked.safekeeper_reconcilers.schedule_request(self, req); + } + } + + Ok(SafekeepersInfo { + generation: timeline_persist.generation as u32, + safekeepers: sks, + tenant_id, + timeline_id, + }) + } + + pub(crate) async fn tenant_timeline_create( + self: &Arc, + tenant_id: TenantId, + create_req: TimelineCreateRequest, + ) -> Result { + let safekeepers = self.config.timelines_onto_safekeepers; + tracing::info!( + %safekeepers, + "Creating timeline {}/{}", + tenant_id, + create_req.new_timeline_id, + ); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineCreate, + ) + .await; + failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock"); + let create_mode = create_req.mode.clone(); + + let timeline_info = self + .tenant_timeline_create_pageservers(tenant_id, create_req) + .await?; + + let safekeepers = if safekeepers { + let res = self + .tenant_timeline_create_safekeepers(tenant_id, &timeline_info, create_mode) + .instrument(tracing::info_span!("timeline_create_safekeepers", %tenant_id, timeline_id=%timeline_info.timeline_id)) + .await?; + Some(res) + } else { + None + }; + + Ok(TimelineCreateResponseStorcon { + timeline_info, + safekeepers, + }) + } + pub(crate) async fn tenant_timeline_archival_config( &self, tenant_id: TenantId, @@ -3581,13 +3995,15 @@ impl Service { timeline_id: TimelineId, node: Node, jwt: Option, + ssl_ca_cert: Option, req: TimelineArchivalConfigRequest, ) -> Result<(), ApiError> { tracing::info!( "Setting archival config of timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = PageserverClient::new(node.get_id(), node.base_url(), 
jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref(), ssl_ca_cert) + .map_err(|e| passthrough_api_error(&node, e))?; client .timeline_archival_config(tenant_shard_id, timeline_id, &req) @@ -3610,6 +4026,7 @@ impl Service { timeline_id, node, self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), req.clone(), )) }) @@ -3624,6 +4041,7 @@ impl Service { &self, tenant_id: TenantId, timeline_id: TimelineId, + behavior: Option, ) -> Result { tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",); @@ -3646,15 +4064,18 @@ impl Service { timeline_id: TimelineId, node: Node, jwt: Option, + ssl_ca_cert: Option, + behavior: Option, ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { tracing::info!( "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref(), ssl_ca_cert) + .map_err(|e| passthrough_api_error(&node, e))?; client - .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .timeline_detach_ancestor(tenant_shard_id, timeline_id, behavior) .await .map_err(|e| { use mgmt_api::Error; @@ -3691,6 +4112,8 @@ impl Service { timeline_id, node, self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), + behavior, )) }) .await?; @@ -3743,9 +4166,16 @@ impl Service { timeline_id: TimelineId, node: Node, jwt: Option, + ssl_ca_cert: Option, dir: BlockUnblock, ) -> Result<(), ApiError> { - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + jwt.as_deref(), + ssl_ca_cert, + ) + .map_err(|e| passthrough_api_error(&node, e))?; client .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir) @@ -3765,6 +4195,7 @@ impl Service { timeline_id, node, 
self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), dir, )) }) @@ -3779,6 +4210,7 @@ impl Service { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<(), ApiError> { let _tenant_lock = trace_shared_lock( &self.tenant_op_locks, @@ -3816,7 +4248,12 @@ impl Service { targets, |tenant_shard_id, client| async move { client - .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .timeline_download_heatmap_layers( + tenant_shard_id, + timeline_id, + concurrency, + recurse, + ) .await }, 1, @@ -3831,7 +4268,8 @@ impl Service { /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// - /// On success, the returned vector contains exactly the same number of elements as the input `locations`. + /// On success, the returned vector contains exactly the same number of elements as the input `locations` + /// and returned element at index `i` is the result for `req_fn(op(locations[i])`. async fn tenant_for_shards( &self, locations: Vec<(TenantShardId, Node)>, @@ -3847,18 +4285,23 @@ impl Service { let mut futs = FuturesUnordered::new(); let mut results = Vec::with_capacity(locations.len()); - for (tenant_shard_id, node) in locations { - futs.push(req_fn(tenant_shard_id, node)); + for (idx, (tenant_shard_id, node)) in locations.into_iter().enumerate() { + let fut = req_fn(tenant_shard_id, node); + futs.push(async move { (idx, fut.await) }); } - while let Some(r) = futs.next().await { - results.push(r?); + while let Some((idx, r)) = futs.next().await { + results.push((idx, r?)); } - Ok(results) + results.sort_by_key(|(idx, _)| *idx); + Ok(results.into_iter().map(|(_, r)| r).collect()) } - /// Concurrently invoke a pageserver API call on many shards at once + /// Concurrently invoke a pageserver API call on many shards at once. 
+ /// + /// The returned Vec has the same length as the `locations` Vec, + /// and returned element at index `i` is the result for `op(locations[i])`. pub(crate) async fn tenant_for_shards_api( &self, locations: Vec<(TenantShardId, Node)>, @@ -3875,26 +4318,29 @@ impl Service { let mut futs = FuturesUnordered::new(); let mut results = Vec::with_capacity(locations.len()); - for (tenant_shard_id, node) in locations { + for (idx, (tenant_shard_id, node)) in locations.into_iter().enumerate() { futs.push(async move { - node.with_client_retries( - |client| op(tenant_shard_id, client), - &self.config.pageserver_jwt_token, - warn_threshold, - max_retries, - timeout, - cancel, - ) - .await + let r = node + .with_client_retries( + |client| op(tenant_shard_id, client), + &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, + warn_threshold, + max_retries, + timeout, + cancel, + ) + .await; + (idx, r) }); } - while let Some(r) = futs.next().await { - let r = r.unwrap_or(Err(mgmt_api::Error::Cancelled)); - results.push(r); + while let Some((idx, r)) = futs.next().await { + results.push((idx, r.unwrap_or(Err(mgmt_api::Error::Cancelled)))); } - results + results.sort_by_key(|(idx, _)| *idx); + results.into_iter().map(|(_, r)| r).collect() } /// Helper for safely working with the shards in a tenant remotely on pageservers, for example @@ -3944,7 +4390,9 @@ impl Service { // This can only happen if there is a split brain controller modifying the database. This should // never happen when testing, and if it happens in production we can only log the issue. debug_assert!(false); - tracing::error!("Shard {shard_id} not found in generation state! Is another rogue controller running?"); + tracing::error!( + "Shard {shard_id} not found in generation state! Is another rogue controller running?" 
+ ); continue; }; let (generation, generation_pageserver) = generation; @@ -3953,13 +4401,17 @@ impl Service { // This is legitimate only in a very narrow window where the shard was only just configured into // Attached mode after being created in Secondary or Detached mode, and it has had its generation // set but not yet had a Reconciler run (reconciler is the only thing that sets generation_pageserver). - tracing::warn!("Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?"); + tracing::warn!( + "Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?" + ); } } else { // This should never happen: a shard with no generation is only permitted when it was created in some state // other than PlacementPolicy::Attached (and generation is always written to DB before setting Attached in memory) debug_assert!(false); - tracing::error!("Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!"); + tracing::error!( + "Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!" 
+ ); continue; } } @@ -4070,7 +4522,7 @@ impl Service { } pub(crate) async fn tenant_timeline_delete( - &self, + self: &Arc, tenant_id: TenantId, timeline_id: TimelineId, ) -> Result { @@ -4082,7 +4534,7 @@ impl Service { ) .await; - self.tenant_remote_mutation(tenant_id, move |mut targets| async move { + let status_code = self.tenant_remote_mutation(tenant_id, move |mut targets| async move { if targets.0.is_empty() { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), @@ -4097,12 +4549,14 @@ impl Service { timeline_id: TimelineId, node: Node, jwt: Option, + ssl_ca_cert: Option, ) -> Result { tracing::info!( "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref(), ssl_ca_cert) + .map_err(|e| passthrough_api_error(&node, e))?; let res = client .timeline_delete(tenant_shard_id, timeline_id) .await; @@ -4129,6 +4583,7 @@ impl Service { timeline_id, node, self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), )) }) .await?; @@ -4151,22 +4606,81 @@ impl Service { timeline_id, shard_zero_locations.latest.node, self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), ) .await?; Ok(shard_zero_status) - }).await? + }).await?; + + self.tenant_timeline_delete_safekeepers(tenant_id, timeline_id) + .await?; + + status_code + } + /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. 
+ async fn tenant_timeline_delete_safekeepers( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result<(), ApiError> { + let tl = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + let Some(tl) = tl else { + tracing::info!( + "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table, no deletions on safekeepers needed" + ); + return Ok(()); + }; + let all_sks = tl + .new_sk_set + .iter() + .flat_map(|sks| { + sks.iter() + .map(|sk| (*sk, SafekeeperTimelineOpKind::Exclude)) + }) + .chain( + tl.sk_set + .iter() + .map(|v| (*v, SafekeeperTimelineOpKind::Delete)), + ) + .collect::>(); + + // Schedule reconciliations + { + let mut locked = self.inner.write().unwrap(); + for (sk_id, kind) in all_sks { + let sk_id = NodeId(sk_id as u64); + let Some(sk) = locked.safekeepers.get(&sk_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {sk_id}" + ))); + }; + + let req = ScheduleRequest { + safekeeper: Box::new(sk.clone()), + // we don't use this for this kind, put a dummy value + host_list: Vec::new(), + tenant_id, + timeline_id, + generation: tl.generation as u32, + kind, + }; + locked.safekeeper_reconcilers.schedule_request(self, req); + } + } + Ok(()) } - /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this - /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound) + /// When you know the TenantId but not a specific shard, and would like to get the node holding shard 0. pub(crate) async fn tenant_shard0_node( &self, tenant_id: TenantId, ) -> Result<(Node, TenantShardId), ApiError> { - // Look up in-memory state and maybe use the node from there. 
- { + let tenant_shard_id = { let locked = self.inner.read().unwrap(); - let Some((tenant_shard_id, shard)) = locked + let Some((tenant_shard_id, _shard)) = locked .tenants .range(TenantShardId::tenant_range(tenant_id)) .next() @@ -4176,6 +4690,29 @@ impl Service { )); }; + *tenant_shard_id + }; + + self.tenant_shard_node(tenant_shard_id) + .await + .map(|node| (node, tenant_shard_id)) + } + + /// When you need to send an HTTP request to the pageserver that holds a shard of a tenant, this + /// function looks up and returns node. If the shard isn't found, returns Err(ApiError::NotFound) + pub(crate) async fn tenant_shard_node( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + // Look up in-memory state and maybe use the node from there. + { + let locked = self.inner.read().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard {tenant_shard_id} not found").into(), + )); + }; + let Some(intent_node_id) = shard.intent.get_attached() else { tracing::warn!( tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), @@ -4196,7 +4733,7 @@ impl Service { "Shard refers to nonexistent node" ))); }; - return Ok((node.clone(), *tenant_shard_id)); + return Ok(node.clone()); } }; @@ -4204,29 +4741,34 @@ impl Service { // generation state: this will reflect the progress of any ongoing migration. // Note that it is not guaranteed to _stay_ here, our caller must still handle // the case where they call through to the pageserver and get a 404. 
- let db_result = self.persistence.tenant_generations(tenant_id).await?; + let db_result = self + .persistence + .tenant_generations(tenant_shard_id.tenant_id) + .await?; let Some(ShardGenerationState { - tenant_shard_id, + tenant_shard_id: _, generation: _, generation_pageserver: Some(node_id), - }) = db_result.first() + }) = db_result + .into_iter() + .find(|s| s.tenant_shard_id == tenant_shard_id) else { // This can happen if we raced with a tenant deletion or a shard split. On a retry // the caller will either succeed (shard split case), get a proper 404 (deletion case), // or a conflict response (case where tenant was detached in background) return Err(ApiError::ResourceUnavailable( - "Shard {} not found in database, or is not attached".into(), + format!("Shard {tenant_shard_id} not found in database, or is not attached").into(), )); }; let locked = self.inner.read().unwrap(); - let Some(node) = locked.nodes.get(node_id) else { + let Some(node) = locked.nodes.get(&node_id) else { // This should never happen return Err(ApiError::InternalServerError(anyhow::anyhow!( "Shard refers to nonexistent node" ))); }; - Ok((node.clone(), *tenant_shard_id)) + Ok(node.clone()) } pub(crate) fn tenant_locate( @@ -4326,7 +4868,7 @@ impl Service { is_reconciling: shard.reconciler.is_some(), is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), - scheduling_policy: *shard.get_scheduling_policy(), + scheduling_policy: shard.get_scheduling_policy(), preferred_az_id: shard.preferred_az().map(ToString::to_string), }) } @@ -4492,13 +5034,17 @@ impl Service { // if the original attachment location is offline. 
if let Some(node_id) = shard.intent.get_attached() { if !nodes.get(node_id).unwrap().is_available() { - tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}"); + tracing::info!( + "Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}" + ); shard.intent.demote_attached(scheduler, *node_id); } } for node_id in shard.intent.get_secondary().clone() { if !nodes.get(&node_id).unwrap().is_available() { - tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}"); + tracing::info!( + "Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}" + ); shard.intent.remove_secondary(scheduler, node_id); } } @@ -4526,7 +5072,9 @@ impl Service { // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have // removed child shards from our in-memory state and database, the reconciliation will implicitly remove // them from the node. - tracing::warn!("Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated."); + tracing::warn!( + "Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated." + ); continue; } @@ -4550,6 +5098,7 @@ impl Service { client.location_config(child_id, config, None, false).await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 10, Duration::from_secs(5), @@ -4971,7 +5520,10 @@ impl Service { // applies the new stripe size to the children. 
let mut shard_ident = shard_ident.unwrap(); if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { - return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", + shard_ident.stripe_size + ))); } shard_ident.stripe_size = new_stripe_size; @@ -5150,7 +5702,9 @@ impl Service { node.get_id(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - ); + self.config.ssl_ca_cert.clone(), + ) + .map_err(|e| passthrough_api_error(node, e))?; let response = client .tenant_shard_split( *parent_id, @@ -5199,7 +5753,7 @@ impl Service { // it doesn't match, but that requires more retry logic on this side) self.persistence - .complete_shard_split(tenant_id, old_shard_count) + .complete_shard_split(tenant_id, old_shard_count, new_shard_count) .await?; fail::fail_point!("shard-split-post-complete", |_| Err( @@ -5226,8 +5780,11 @@ impl Service { ) .await { - tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", - child_id, child_ps); + tracing::warn!( + "Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", + child_id, + child_ps + ); failed_notifications.push(child_id); } } @@ -5245,12 +5802,93 @@ impl Service { Ok((response, waiters)) } + /// A graceful migration: update the preferred node and let optimisation handle the migration + /// in the background (may take a long time as it will fully warm up a location before cutting over) + /// + /// Our external API calls this a 'prewarm=true' migration, but internally it isn't a special prewarm step: it's + /// just a migration that uses the same graceful procedure as our background scheduling optimisations would use. 
+ fn tenant_shard_migrate_with_prewarm( + &self, + migrate_req: &TenantShardMigrateRequest, + shard: &mut TenantShard, + scheduler: &mut Scheduler, + schedule_context: ScheduleContext, + ) -> Result, ApiError> { + shard.set_preferred_node(Some(migrate_req.node_id)); + + // Generate whatever the initial change to the intent is: this could be creation of a secondary, or + // cutting over to an existing secondary. Caller is responsible for validating this before applying it, + // e.g. by checking secondary is warm enough. + Ok(shard.optimize_attachment(scheduler, &schedule_context)) + } + + /// Immediate migration: directly update the intent state and kick off a reconciler + fn tenant_shard_migrate_immediate( + &self, + migrate_req: &TenantShardMigrateRequest, + nodes: &Arc>, + shard: &mut TenantShard, + scheduler: &mut Scheduler, + ) -> Result, ApiError> { + // Non-graceful migration: update the intent state immediately + let old_attached = *shard.intent.get_attached(); + match shard.policy { + PlacementPolicy::Attached(n) => { + // If our new attached node was a secondary, it no longer should be. 
+ shard + .intent + .remove_secondary(scheduler, migrate_req.node_id); + + shard + .intent + .set_attached(scheduler, Some(migrate_req.node_id)); + + // If we were already attached to something, demote that to a secondary + if let Some(old_attached) = old_attached { + if n > 0 { + // Remove other secondaries to make room for the location we'll demote + while shard.intent.get_secondary().len() >= n { + shard.intent.pop_secondary(scheduler); + } + + shard.intent.push_secondary(scheduler, old_attached); + } + } + } + PlacementPolicy::Secondary => { + shard.intent.clear(scheduler); + shard.intent.push_secondary(scheduler, migrate_req.node_id); + } + PlacementPolicy::Detached => { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first" + ))); + } + } + + tracing::info!("Migrating: new intent {:?}", shard.intent); + shard.sequence = shard.sequence.next(); + shard.set_preferred_node(None); // Abort any in-flight graceful migration + Ok(self.maybe_configured_reconcile_shard( + shard, + nodes, + (&migrate_req.migration_config).into(), + )) + } + pub(crate) async fn tenant_shard_migrate( &self, tenant_shard_id: TenantShardId, migrate_req: TenantShardMigrateRequest, ) -> Result { - let waiter = { + // Depending on whether the migration is a change and whether it's graceful or immediate, we might + // get a different outcome to handle + enum MigrationOutcome { + Optimization(Option), + Reconcile(Option), + } + + let outcome = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -5261,67 +5899,139 @@ impl Service { ))); }; + // Migration to unavavailable node requires force flag if !node.is_available() { - // Warn but proceed: the caller may intend to manually adjust the placement of - // a shard even if the node is down, e.g. if intervening during an incident. 
- tracing::warn!("Migrating to unavailable node {node}"); + if migrate_req.migration_config.override_scheduler { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. + tracing::warn!("Forcibly migrating to unavailable node {node}"); + } else { + tracing::warn!("Node {node} is unavailable, refusing migration"); + return Err(ApiError::PreconditionFailed( + format!("Node {node} is unavailable").into_boxed_str(), + )); + } } + // Calculate the ScheduleContext for this tenant + let mut schedule_context = ScheduleContext::default(); + for (_shard_id, shard) in + tenants.range(TenantShardId::tenant_range(tenant_shard_id.tenant_id)) + { + schedule_context.avoid(&shard.intent.all_pageservers()); + } + + // Look up the specific shard we will migrate let Some(shard) = tenants.get_mut(&tenant_shard_id) else { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant shard not found").into(), )); }; + // Migration to a node with unfavorable scheduling score requires a force flag, because it might just + // be migrated back by the optimiser. 
+ if let Some(better_node) = shard.find_better_location::( + scheduler, + &schedule_context, + migrate_req.node_id, + &[], + ) { + if !migrate_req.migration_config.override_scheduler { + return Err(ApiError::PreconditionFailed( + "Migration to a worse-scoring node".into(), + )); + } else { + tracing::info!( + "Migrating to a worse-scoring node {} (optimiser would prefer {better_node})", + migrate_req.node_id + ); + } + } + + if let Some(origin_node_id) = migrate_req.origin_node_id { + if shard.intent.get_attached() != &Some(origin_node_id) { + return Err(ApiError::PreconditionFailed( + format!( + "Migration expected to originate from {} but shard is on {:?}", + origin_node_id, + shard.intent.get_attached() + ) + .into(), + )); + } + } + if shard.intent.get_attached() == &Some(migrate_req.node_id) { // No-op case: we will still proceed to wait for reconciliation in case it is // incomplete from an earlier update to the intent. tracing::info!("Migrating: intent is unchanged {:?}", shard.intent); + + // An instruction to migrate to the currently attached node should + // cancel any pending graceful migration + shard.set_preferred_node(None); + + MigrationOutcome::Reconcile(self.maybe_configured_reconcile_shard( + shard, + nodes, + (&migrate_req.migration_config).into(), + )) + } else if migrate_req.migration_config.prewarm { + MigrationOutcome::Optimization(self.tenant_shard_migrate_with_prewarm( + &migrate_req, + shard, + scheduler, + schedule_context, + )?) } else { - let old_attached = *shard.intent.get_attached(); - - match shard.policy { - PlacementPolicy::Attached(n) => { - // If our new attached node was a secondary, it no longer should be. 
- shard.intent.remove_secondary(scheduler, migrate_req.node_id); - - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); - - // If we were already attached to something, demote that to a secondary - if let Some(old_attached) = old_attached { - if n > 0 { - // Remove other secondaries to make room for the location we'll demote - while shard.intent.get_secondary().len() >= n { - shard.intent.pop_secondary(scheduler); - } - - shard.intent.push_secondary(scheduler, old_attached); - } - } - } - PlacementPolicy::Secondary => { - shard.intent.clear(scheduler); - shard.intent.push_secondary(scheduler, migrate_req.node_id); - } - PlacementPolicy::Detached => { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first" - ))) - } - } - - tracing::info!("Migrating: new intent {:?}", shard.intent); - shard.sequence = shard.sequence.next(); + MigrationOutcome::Reconcile(self.tenant_shard_migrate_immediate( + &migrate_req, + nodes, + shard, + scheduler, + )?) } - - let reconciler_config = match migrate_req.migration_config { - Some(cfg) => (&cfg).into(), - None => ReconcilerConfig::new(ReconcilerPriority::High), - }; - - self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config) }; + // We may need to validate + apply an optimisation, or we may need to just retrive a reconcile waiter + let waiter = match outcome { + MigrationOutcome::Optimization(Some(optimization)) => { + // Validate and apply the optimization -- this would happen anyway in background reconcile loop, but + // we might as well do it more promptly as this is a direct external request. 
+ let mut validated = self + .optimize_all_validate(vec![(tenant_shard_id, optimization)]) + .await; + if let Some((_shard_id, optimization)) = validated.pop() { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + // Rare but possible: tenant is removed between generating optimisation and validating it. + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + if !shard.apply_optimization(scheduler, optimization) { + // This can happen but is unusual enough to warn on: something else changed in the shard that made the optimisation stale + // and therefore not applied. + tracing::warn!( + "Schedule optimisation generated during graceful migration was not applied, shard changed?" + ); + } + self.maybe_configured_reconcile_shard( + shard, + nodes, + (&migrate_req.migration_config).into(), + ) + } else { + None + } + } + MigrationOutcome::Optimization(None) => None, + MigrationOutcome::Reconcile(waiter) => waiter, + }; + + // Finally, wait for any reconcile we started to complete. In the case of immediate-mode migrations to cold + // locations, this has a good chance of timing out. 
if let Some(waiter) = waiter { waiter.wait_timeout(RECONCILE_TIMEOUT).await?; } else { @@ -5367,7 +6077,9 @@ impl Service { shard.intent ); } else if shard.intent.get_attached() == &Some(migrate_req.node_id) { - tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary"); + tracing::info!( + "Migrating secondary to {node}: already attached where we were asked to create a secondary" + ); } else { let old_secondaries = shard.intent.get_secondary().clone(); for secondary in old_secondaries { @@ -5476,7 +6188,9 @@ impl Service { node.get_id(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - ); + self.config.ssl_ca_cert.clone(), + ) + .map_err(|e| passthrough_api_error(&node, e))?; let scan_result = client .tenant_scan_remote_storage(tenant_id) @@ -5880,7 +6594,7 @@ impl Service { return Err(ApiError::InternalServerError(anyhow::anyhow!( "{} attached as primary+secondary on the same node", tid - ))) + ))); } (true, false) => Some(false), (false, true) => Some(true), @@ -6899,6 +7613,10 @@ impl Service { ShardSchedulingPolicy::Active => { // Ok to do optimization } + ShardSchedulingPolicy::Essential if shard.get_preferred_node().is_some() => { + // Ok to do optimization: we are executing a graceful migration that + // has set preferred_node + } ShardSchedulingPolicy::Essential | ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => { @@ -6923,12 +7641,16 @@ impl Service { // Check that maybe_optimizable doesn't disagree with the actual optimization functions. // Only do this in testing builds because it is not a correctness-critical check, so we shouldn't // panic in prod if we hit this, or spend cycles on it in prod. 
- assert!(shard - .optimize_attachment(scheduler, &schedule_context) - .is_none()); - assert!(shard - .optimize_secondary(scheduler, &schedule_context) - .is_none()); + assert!( + shard + .optimize_attachment(scheduler, &schedule_context) + .is_none() + ); + assert!( + shard + .optimize_secondary(scheduler, &schedule_context) + .is_none() + ); } continue; } @@ -6984,7 +7706,9 @@ impl Service { } Some(node) => { if !node.is_available() { - tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable"); + tracing::info!( + "Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable" + ); } else { // Accumulate optimizations that require fetching secondary status, so that we can execute these // remote API requests concurrently. @@ -7030,7 +7754,9 @@ impl Service { { match secondary_status { Err(e) => { - tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}"); + tracing::info!( + "Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}" + ); } Ok(progress) => { // We require secondary locations to have less than 10GiB of downloads pending before we will use @@ -7043,7 +7769,9 @@ impl Service { || progress.bytes_total - progress.bytes_downloaded > DOWNLOAD_FRESHNESS_THRESHOLD { - tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + tracing::info!( + "Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}" + ); #[cfg(feature = "testing")] if progress.heatmap_mtime.is_none() { @@ -7104,6 +7832,7 @@ impl Service { .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7140,6 +7869,7 @@ impl Service { .await }, &self.config.pageserver_jwt_token, + 
&self.config.ssl_ca_cert, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7149,99 +7879,89 @@ impl Service { { Some(Err(e)) => { tracing::info!( - "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" - ); + "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" + ); } None => { - tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}"); + tracing::info!( + "Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}" + ); } Some(Ok(progress)) => { - tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}"); + tracing::info!( + "Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}" + ); } } } } - /// Look for shards which are oversized and in need of splitting + /// Asynchronously split a tenant that's eligible for automatic splits: + /// + /// * The tenant is unsharded. + /// * The logical size of its largest timeline exceeds split_threshold. + /// * The tenant's scheduling policy is active. + /// + /// At most one tenant will be split per call: the one with the largest max logical size. It + /// will split 1 → 8 shards. + /// + /// An unsharded tenant will get DEFAULT_STRIPE_SIZE, regardless of what its ShardIdentity says. + /// A sharded tenant will retain its stripe size, as splits do not allow changing it. + /// + /// TODO: consider splitting based on total logical size rather than max logical size. + /// + /// TODO: consider spawning multiple splits in parallel: this is only called once every 20 + /// seconds, so a large backlog can take a long time, and if a tenant fails to split it will + /// block all other splits. 
async fn autosplit_tenants(self: &Arc) { let Some(split_threshold) = self.config.split_threshold else { - // Auto-splitting is disabled + return; // auto-splits are disabled + }; + if split_threshold == 0 { return; - }; - - let nodes = self.inner.read().unwrap().nodes.clone(); - - const SPLIT_TO_MAX: ShardCount = ShardCount::new(8); - - let mut top_n = Vec::new(); - - // Call into each node to look for big tenants - let top_n_request = TopTenantShardsRequest { - // We currently split based on logical size, for simplicity: logical size is a signal of - // the user's intent to run a large database, whereas physical/resident size can be symptoms - // of compaction issues. Eventually we should switch to using resident size to bound the - // disk space impact of one shard. - order_by: models::TenantSorting::MaxLogicalSize, - limit: 10, - where_shards_lt: Some(SPLIT_TO_MAX), - where_gt: Some(split_threshold), - }; - for node in nodes.values() { - let request_ref = &top_n_request; - match node - .with_client_retries( - |client| async move { - let request = request_ref.clone(); - client.top_tenant_shards(request.clone()).await - }, - &self.config.pageserver_jwt_token, - 3, - 3, - Duration::from_secs(5), - &self.cancel, - ) - .await - { - Some(Ok(node_top_n)) => { - top_n.extend(node_top_n.shards.into_iter()); - } - Some(Err(mgmt_api::Error::Cancelled)) => { - continue; - } - Some(Err(e)) => { - tracing::warn!("Failed to fetch top N tenants from {node}: {e}"); - continue; - } - None => { - // Node is shutting down - continue; - } - }; } - // Pick the biggest tenant to split first - top_n.sort_by_key(|i| i.resident_size); + // Fetch the largest eligible shards by logical size. 
+ const MAX_SHARDS: ShardCount = ShardCount::new(8); - // Filter out tenants in a prohibiting scheduling mode + let mut top_n = self + .get_top_tenant_shards(&TopTenantShardsRequest { + order_by: TenantSorting::MaxLogicalSize, + limit: 10, + where_shards_lt: Some(MAX_SHARDS), + where_gt: Some(split_threshold), + }) + .await; + + // Filter out tenants in a prohibiting scheduling mode. { - let locked = self.inner.read().unwrap(); + let state = self.inner.read().unwrap(); top_n.retain(|i| { - if let Some(shard) = locked.tenants.get(&i.id) { - matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) - } else { - false - } + let policy = state.tenants.get(&i.id).map(|s| s.get_scheduling_policy()); + policy == Some(ShardSchedulingPolicy::Active) }); } let Some(split_candidate) = top_n.into_iter().next() else { - tracing::debug!("No split-elegible shards found"); + debug!("No split-elegible shards found"); return; }; - // We spawn a task to run this, so it's exactly like some external API client requesting it. We don't - // want to block the background reconcile loop on this. - tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}"); + // We spawn a task to run this, so it's exactly like some external API client requesting it. + // We don't want to block the background reconcile loop on this. + info!( + "Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}" + ); + + // Retain the stripe size of sharded tenants, as splits don't allow changing it. Otherwise, + // use DEFAULT_STRIPE_SIZE for unsharded tenants -- their stripe size doesn't really matter, + // and if we change the default stripe size we want to use the new default rather than an + // old, persisted stripe size. + let new_stripe_size = match split_candidate.id.shard_count.count() { + 0 => panic!("invalid shard count 0"), + 1 => Some(ShardParameters::DEFAULT_STRIPE_SIZE), + 2.. 
=> None, + }; let this = self.clone(); tokio::spawn( @@ -7250,29 +7970,71 @@ impl Service { .tenant_shard_split( split_candidate.id.tenant_id, TenantShardSplitRequest { - // Always split to the max number of shards: this avoids stepping through - // intervening shard counts and encountering the overrhead of a split+cleanup - // each time as a tenant grows, and is not too expensive because our max shard - // count is relatively low anyway. - // This policy will be adjusted in future once we support higher shard count. - new_shard_count: SPLIT_TO_MAX.literal(), - new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE), + // Always split to the max number of shards: this avoids stepping + // through intervening shard counts and encountering the overhead of a + // split+cleanup each time as a tenant grows, and is not too expensive + // because our max shard count is relatively low anyway. This policy + // will be adjusted in future once we support higher shard count. + new_shard_count: MAX_SHARDS.literal(), + new_stripe_size, }, ) .await { - Ok(_) => { - tracing::info!("Successful auto-split"); - } - Err(e) => { - tracing::error!("Auto-split failed: {e}"); - } + Ok(_) => info!("Successful auto-split"), + Err(err) => error!("Auto-split failed: {err}"), } } - .instrument(tracing::info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)), + .instrument(info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)), ); } + /// Fetches the top tenant shards from every node, in descending order of + /// max logical size. Any node errors will be logged and ignored. 
+ async fn get_top_tenant_shards( + &self, + request: &TopTenantShardsRequest, + ) -> Vec { + let nodes = self + .inner + .read() + .unwrap() + .nodes + .values() + .cloned() + .collect_vec(); + + let mut futures = FuturesUnordered::new(); + for node in nodes { + futures.push(async move { + node.with_client_retries( + |client| async move { client.top_tenant_shards(request.clone()).await }, + &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, + 3, + 3, + Duration::from_secs(5), + &self.cancel, + ) + .await + }); + } + + let mut top = Vec::new(); + while let Some(output) = futures.next().await { + match output { + Some(Ok(response)) => top.extend(response.shards), + Some(Err(mgmt_api::Error::Cancelled)) => {} + Some(Err(err)) => warn!("failed to fetch top tenants: {err}"), + None => {} // node is shutting down + } + } + + top.sort_by_key(|i| i.max_logical_size); + top.reverse(); + top + } + /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should /// put the system into a quiescent state where future background reconciliations won't do anything. @@ -7368,6 +8130,7 @@ impl Service { .with_client_retries( |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 3, Duration::from_millis(250), @@ -7956,6 +8719,68 @@ impl Service { global_observed } + /// Choose safekeepers for the new timeline: 3 in different azs. + pub(crate) async fn safekeepers_for_new_timeline( + &self, + ) -> Result, ApiError> { + let mut all_safekeepers = { + let locked = self.inner.read().unwrap(); + locked + .safekeepers + .iter() + .filter_map(|sk| { + if sk.1.scheduling_policy() != SkSchedulingPolicy::Active { + // If we don't want to schedule stuff onto the safekeeper, respect that. 
+ return None; + } + let utilization_opt = if let SafekeeperState::Available { + last_seen_at: _, + utilization, + } = sk.1.availability() + { + Some(utilization) + } else { + // non-available safekeepers still get a chance for new timelines, + // but put them last in the list. + None + }; + let info = SafekeeperInfo { + hostname: sk.1.skp.host.clone(), + id: NodeId(sk.1.skp.id as u64), + }; + Some((utilization_opt, info, sk.1.skp.availability_zone_id.clone())) + }) + .collect::>() + }; + all_safekeepers.sort_by_key(|sk| { + ( + sk.0.as_ref() + .map(|ut| ut.timeline_count) + .unwrap_or(u64::MAX), + // Use the id to decide on equal scores for reliability + sk.1.id.0, + ) + }); + let mut sks = Vec::new(); + let mut azs = HashSet::new(); + for (_sk_util, sk_info, az_id) in all_safekeepers.iter() { + if !azs.insert(az_id) { + continue; + } + sks.push(sk_info.clone()); + if sks.len() == 3 { + break; + } + } + if sks.len() == 3 { + Ok(sks) + } else { + Err(ApiError::InternalServerError(anyhow::anyhow!( + "couldn't find three safekeepers in different AZs for new timeline" + ))) + } + } + pub(crate) async fn safekeepers_list( &self, ) -> Result, DatabaseError> { @@ -7984,24 +8809,41 @@ impl Service { pub(crate) async fn upsert_safekeeper( &self, record: crate::persistence::SafekeeperUpsert, - ) -> Result<(), DatabaseError> { + ) -> Result<(), ApiError> { let node_id = NodeId(record.id as u64); + let use_https = self.config.use_https_safekeeper_api; + + if use_https && record.https_port.is_none() { + return Err(ApiError::PreconditionFailed( + format!( + "cannot upsert safekeeper {node_id}: \ + https is enabled, but https port is not specified" + ) + .into(), + )); + } + self.persistence.safekeeper_upsert(record.clone()).await?; { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); match safekeepers.entry(node_id) { - std::collections::hash_map::Entry::Occupied(mut entry) => { - entry.get_mut().update_from_record(record); - } + 
std::collections::hash_map::Entry::Occupied(mut entry) => entry + .get_mut() + .update_from_record(record) + .expect("all preconditions should be checked before upsert to database"), std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(Safekeeper::from_persistence( - crate::persistence::SafekeeperPersistence::from_upsert( - record, - SkSchedulingPolicy::Pause, - ), - CancellationToken::new(), - )); + entry.insert( + Safekeeper::from_persistence( + crate::persistence::SafekeeperPersistence::from_upsert( + record, + SkSchedulingPolicy::Pause, + ), + CancellationToken::new(), + use_https, + ) + .expect("all preconditions should be checked before upsert to database"), + ); } } locked.safekeepers = Arc::new(safekeepers); @@ -8027,6 +8869,13 @@ impl Service { .ok_or(DatabaseError::Logical("Not found".to_string()))?; sk.set_scheduling_policy(scheduling_policy); + match scheduling_policy { + SkSchedulingPolicy::Active => (), + SkSchedulingPolicy::Decomissioned | SkSchedulingPolicy::Pause => { + locked.safekeeper_reconcilers.cancel_safekeeper(node_id); + } + } + locked.safekeepers = Arc::new(safekeepers); } Ok(()) @@ -8050,10 +8899,11 @@ impl Service { let mut updated_in_mem_and_db = Vec::default(); let mut locked = self.inner.write().unwrap(); + let state = locked.deref_mut(); for (tid, az_id) in updated { - let shard = locked.tenants.get_mut(&tid); + let shard = state.tenants.get_mut(&tid); if let Some(shard) = shard { - shard.set_preferred_az(az_id); + shard.set_preferred_az(&mut state.scheduler, az_id); updated_in_mem_and_db.push(tid); } } diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 25a0fab5ca..a0419e0205 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -1,8 +1,6 @@ -use std::{ - collections::{BTreeMap, HashMap}, - sync::Arc, - time::Duration, -}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; 
+use std::time::Duration; use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; @@ -48,48 +46,51 @@ impl ChaosInjector { } } + fn get_cron_interval_sleep_future(&self) -> Option { + if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { + match cron_to_next_duration(chaos_exit_crontab) { + Ok(interval_exit) => Some(interval_exit), + Err(e) => { + tracing::error!("Error processing the cron schedule: {e}"); + None + } + } + } else { + None + } + } + pub async fn run(&mut self, cancel: CancellationToken) { let mut interval = tokio::time::interval(self.interval); - let cron_interval = { - if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { - match cron_to_next_duration(chaos_exit_crontab) { - Ok(interval_exit) => Some(interval_exit), - Err(e) => { - tracing::error!("Error processing the cron schedule: {e}"); - None - } - } - } else { - None - } - }; + #[derive(Debug)] enum ChaosEvent { ShuffleTenant, ForceKill, } - let chaos_type = tokio::select! { - _ = interval.tick() => { - ChaosEvent::ShuffleTenant - } - Some(_) = maybe_sleep(cron_interval) => { - ChaosEvent::ForceKill - } - _ = cancel.cancelled() => { - tracing::info!("Shutting down"); - return; - } - }; - - match chaos_type { - ChaosEvent::ShuffleTenant => { - self.inject_chaos().await; - } - ChaosEvent::ForceKill => { - self.force_kill().await; + loop { + let cron_interval = self.get_cron_interval_sleep_future(); + let chaos_type = tokio::select! 
{ + _ = interval.tick() => { + ChaosEvent::ShuffleTenant + } + Some(_) = maybe_sleep(cron_interval) => { + ChaosEvent::ForceKill + } + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + }; + tracing::info!("Chaos iteration: {chaos_type:?}..."); + match chaos_type { + ChaosEvent::ShuffleTenant => { + self.inject_chaos().await; + } + ChaosEvent::ForceKill => { + self.force_kill().await; + } } } - - tracing::info!("Chaos iteration..."); } /// If a shard has a secondary and attached location, then re-assign the secondary to be @@ -176,12 +177,19 @@ impl ChaosInjector { let mut victims = Vec::with_capacity(batch_size); if out_of_home_az.len() >= batch_size { - tracing::info!("Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", out_of_home_az.len()); + tracing::info!( + "Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", + out_of_home_az.len() + ); out_of_home_az.shuffle(&mut thread_rng()); victims.extend(out_of_home_az.into_iter().take(batch_size)); } else { - tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", out_of_home_az.len(), std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len())); + tracing::info!( + "Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", + out_of_home_az.len(), + std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len()) + ); victims.extend(out_of_home_az); in_home_az.shuffle(&mut thread_rng()); diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs index dd6913e988..c4784e5e36 100644 --- a/storage_controller/src/service/context_iterator.rs +++ b/storage_controller/src/service/context_iterator.rs @@ -54,17 +54,16 @@ impl<'a> Iterator for TenantShardContextIterator<'a> { #[cfg(test)] mod tests { - use std::{collections::BTreeMap, 
str::FromStr}; + use std::collections::BTreeMap; + use std::str::FromStr; use pageserver_api::controller_api::PlacementPolicy; use utils::shard::{ShardCount, ShardNumber}; - use crate::{ - scheduler::test_utils::make_test_nodes, service::Scheduler, - tenant_shard::tests::make_test_tenant_with_id, - }; - use super::*; + use crate::scheduler::test_utils::make_test_nodes; + use crate::service::Scheduler; + use crate::tenant_shard::tests::make_test_tenant_with_id; #[test] fn test_context_iterator() { diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs new file mode 100644 index 0000000000..4fa465c307 --- /dev/null +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -0,0 +1,340 @@ +use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration}; + +use clashmap::{ClashMap, Entry}; +use safekeeper_api::models::PullTimelineRequest; +use safekeeper_client::mgmt_api; +use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender}; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; +use utils::{ + id::{NodeId, TenantId, TimelineId}, + logging::SecretString, +}; + +use crate::{ + persistence::SafekeeperTimelineOpKind, safekeeper::Safekeeper, + safekeeper_client::SafekeeperClient, +}; + +use super::Service; + +pub(crate) struct SafekeeperReconcilers { + cancel: CancellationToken, + reconcilers: HashMap, +} + +impl SafekeeperReconcilers { + pub fn new(cancel: CancellationToken) -> Self { + SafekeeperReconcilers { + cancel, + reconcilers: HashMap::new(), + } + } + pub(crate) fn schedule_request_vec( + &mut self, + service: &Arc, + reqs: Vec, + ) { + for req in reqs { + self.schedule_request(service, req); + } + } + pub(crate) fn schedule_request(&mut self, service: &Arc, req: ScheduleRequest) { + let node_id = req.safekeeper.get_id(); + let reconciler_handle = self.reconcilers.entry(node_id).or_insert_with(|| { + 
SafekeeperReconciler::spawn(self.cancel.child_token(), service.clone()) + }); + reconciler_handle.schedule_reconcile(req); + } + pub(crate) fn cancel_safekeeper(&mut self, node_id: NodeId) { + if let Some(handle) = self.reconcilers.remove(&node_id) { + handle.cancel.cancel(); + } + } +} + +/// Initial load of the pending operations from the db +pub(crate) async fn load_schedule_requests( + service: &Arc, + safekeepers: &HashMap, +) -> anyhow::Result> { + let pending_ops = service.persistence.list_pending_ops(None).await?; + let mut res = Vec::with_capacity(pending_ops.len()); + for op_persist in pending_ops { + let node_id = NodeId(op_persist.sk_id as u64); + let Some(sk) = safekeepers.get(&node_id) else { + // This shouldn't happen, at least the safekeeper should exist as decomissioned. + tracing::warn!( + tenant_id = op_persist.tenant_id, + timeline_id = op_persist.timeline_id, + "couldn't find safekeeper with pending op id {node_id} in list of stored safekeepers" + ); + continue; + }; + let sk = Box::new(sk.clone()); + let tenant_id = TenantId::from_str(&op_persist.tenant_id)?; + let timeline_id = TimelineId::from_str(&op_persist.timeline_id)?; + let host_list = match op_persist.op_kind { + SafekeeperTimelineOpKind::Delete => Vec::new(), + SafekeeperTimelineOpKind::Exclude => Vec::new(), + SafekeeperTimelineOpKind::Pull => { + // TODO this code is super hacky, it doesn't take migrations into account + let timeline_persist = service + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + let Some(timeline_persist) = timeline_persist else { + // This shouldn't happen, the timeline should still exist + tracing::warn!( + tenant_id = op_persist.tenant_id, + timeline_id = op_persist.timeline_id, + "couldn't find timeline for corresponding pull op" + ); + continue; + }; + timeline_persist + .sk_set + .iter() + .filter_map(|sk_id| { + let other_node_id = NodeId(*sk_id as u64); + if node_id == other_node_id { + // We obviously don't want to pull from 
ourselves + return None; + } + let Some(sk) = safekeepers.get(&other_node_id) else { + tracing::warn!( + "couldnt find safekeeper with pending op id {other_node_id}, not pulling from it" + ); + return None; + }; + Some((other_node_id, sk.base_url())) + }) + .collect::>() + } + }; + let req = ScheduleRequest { + safekeeper: sk, + host_list, + tenant_id, + timeline_id, + generation: op_persist.generation as u32, + kind: op_persist.op_kind, + }; + res.push(req); + } + Ok(res) +} + +pub(crate) struct ScheduleRequest { + pub(crate) safekeeper: Box, + pub(crate) host_list: Vec<(NodeId, String)>, + pub(crate) tenant_id: TenantId, + pub(crate) timeline_id: TimelineId, + pub(crate) generation: u32, + pub(crate) kind: SafekeeperTimelineOpKind, +} + +struct ReconcilerHandle { + tx: UnboundedSender<(ScheduleRequest, Arc)>, + ongoing_tokens: Arc>>, + cancel: CancellationToken, +} + +impl ReconcilerHandle { + /// Obtain a new token slot, cancelling any existing reconciliations for that timeline + fn new_token_slot( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Arc { + let entry = self.ongoing_tokens.entry((tenant_id, timeline_id)); + if let Entry::Occupied(entry) = &entry { + let cancel: &CancellationToken = entry.get(); + cancel.cancel(); + } + entry.insert(Arc::new(self.cancel.child_token())).clone() + } + fn schedule_reconcile(&self, req: ScheduleRequest) { + let cancel = self.new_token_slot(req.tenant_id, req.timeline_id); + let hostname = req.safekeeper.skp.host.clone(); + if let Err(err) = self.tx.send((req, cancel)) { + tracing::info!("scheduling request onto {hostname} returned error: {err}"); + } + } +} + +pub(crate) struct SafekeeperReconciler { + service: Arc, + rx: UnboundedReceiver<(ScheduleRequest, Arc)>, + cancel: CancellationToken, +} + +impl SafekeeperReconciler { + fn spawn(cancel: CancellationToken, service: Arc) -> ReconcilerHandle { + // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be 
blocking. + let (tx, rx) = mpsc::unbounded_channel(); + let mut reconciler = SafekeeperReconciler { + service, + rx, + cancel: cancel.clone(), + }; + let handle = ReconcilerHandle { + tx, + ongoing_tokens: Arc::new(ClashMap::new()), + cancel, + }; + tokio::spawn(async move { reconciler.run().await }); + handle + } + async fn run(&mut self) { + loop { + // TODO add parallelism with semaphore here + let req = tokio::select! { + req = self.rx.recv() => req, + _ = self.cancel.cancelled() => break, + }; + let Some((req, req_cancel)) = req else { break }; + if req_cancel.is_cancelled() { + continue; + } + + let kind = req.kind; + let tenant_id = req.tenant_id; + let timeline_id = req.timeline_id; + self.reconcile_one(req, req_cancel) + .instrument(tracing::info_span!( + "reconcile_one", + ?kind, + %tenant_id, + %timeline_id + )) + .await; + } + } + async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: Arc) { + let req_host = req.safekeeper.skp.host.clone(); + match req.kind { + SafekeeperTimelineOpKind::Pull => { + let our_id = req.safekeeper.get_id(); + let http_hosts = req + .host_list + .iter() + .filter(|(node_id, _hostname)| *node_id != our_id) + .map(|(_, hostname)| hostname.clone()) + .collect::>(); + let pull_req = PullTimelineRequest { + http_hosts, + tenant_id: req.tenant_id, + timeline_id: req.timeline_id, + }; + self.reconcile_inner( + req, + async |client| client.pull_timeline(&pull_req).await, + |resp| { + tracing::info!( + "pulled timeline from {} onto {req_host}", + resp.safekeeper_host, + ); + }, + req_cancel, + ) + .await; + } + SafekeeperTimelineOpKind::Exclude => { + // TODO actually exclude instead of delete here + let tenant_id = req.tenant_id; + let timeline_id = req.timeline_id; + self.reconcile_inner( + req, + async |client| client.delete_timeline(tenant_id, timeline_id).await, + |_resp| { + tracing::info!("deleted timeline from {req_host}"); + }, + req_cancel, + ) + .await; + } + SafekeeperTimelineOpKind::Delete => { + let tenant_id = 
req.tenant_id; + let timeline_id = req.timeline_id; + self.reconcile_inner( + req, + async |client| client.delete_timeline(tenant_id, timeline_id).await, + |_resp| { + tracing::info!("deleted timeline from {req_host}"); + }, + req_cancel, + ) + .await; + } + } + } + async fn reconcile_inner( + &self, + req: ScheduleRequest, + closure: impl Fn(SafekeeperClient) -> F, + log_success: impl FnOnce(T) -> U, + req_cancel: Arc, + ) where + F: Future>, + { + let jwt = self + .service + .config + .safekeeper_jwt_token + .clone() + .map(SecretString::from); + let ssl_ca_cert = self.service.config.ssl_ca_cert.clone(); + loop { + let res = req + .safekeeper + .with_client_retries( + |client| { + let closure = &closure; + async move { closure(client).await } + }, + &jwt, + &ssl_ca_cert, + 3, + 10, + Duration::from_secs(10), + &req_cancel, + ) + .await; + match res { + Ok(resp) => { + log_success(resp); + let res = self + .service + .persistence + .remove_pending_op( + req.tenant_id, + req.timeline_id, + req.safekeeper.get_id(), + req.generation, + ) + .await; + if let Err(err) = res { + tracing::info!( + "couldn't remove reconciliation request onto {} from persistence: {err:?}", + req.safekeeper.skp.host + ); + } + return; + } + Err(mgmt_api::Error::Cancelled) => { + // On cancellation, the code that issued it will take care of removing db entries (if needed) + return; + } + Err(e) => { + tracing::info!( + "Reconcile attempt for safekeeper {} failed, retrying after sleep: {e:?}", + req.safekeeper.skp.host + ); + const SLEEP_TIME: Duration = Duration::from_secs(1); + tokio::time::sleep(SLEEP_TIME).await; + } + } + } + } +} diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 56a36dc2df..80f42e04a9 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1,50 +1,39 @@ -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; 
+use std::sync::Arc; +use std::time::Duration; -use crate::{ - metrics::{ - self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, - }, - persistence::TenantShardPersistence, - reconciler::{ReconcileUnits, ReconcilerConfig}, - scheduler::{ - AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, - RefCountUpdate, ScheduleContext, SecondaryShardTag, ShardTag, - }, - service::ReconcileResultRequest, -}; use futures::future::{self, Either}; use itertools::Itertools; use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy, ShardSchedulingPolicy}; -use pageserver_api::{ - models::{LocationConfig, LocationConfigMode, TenantConfig}, - shard::{ShardIdentity, TenantShardId}, -}; +use pageserver_api::models::{LocationConfig, LocationConfigMode, TenantConfig}; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use serde::{Deserialize, Serialize}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; -use tracing::{instrument, Instrument}; -use utils::{ - generation::Generation, - id::NodeId, - seqwait::{SeqWait, SeqWaitError}, - shard::ShardCount, - sync::gate::GateGuard, -}; +use tracing::{Instrument, instrument}; +use utils::generation::Generation; +use utils::id::NodeId; +use utils::seqwait::{SeqWait, SeqWaitError}; +use utils::shard::ShardCount; +use utils::sync::gate::GateGuard; -use crate::{ - compute_hook::ComputeHook, - node::Node, - persistence::{split_state::SplitState, Persistence}, - reconciler::{ - attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState, - }, - scheduler::{ScheduleError, Scheduler}, - service, Sequence, +use crate::compute_hook::ComputeHook; +use crate::metrics::{ + self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, }; +use crate::node::Node; +use crate::persistence::split_state::SplitState; +use crate::persistence::{Persistence, TenantShardPersistence}; +use crate::reconciler::{ + 
ReconcileError, ReconcileUnits, Reconciler, ReconcilerConfig, TargetState, + attached_location_conf, secondary_location_conf, +}; +use crate::scheduler::{ + AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, + RefCountUpdate, ScheduleContext, ScheduleError, Scheduler, SecondaryShardTag, ShardTag, +}; +use crate::service::ReconcileResultRequest; +use crate::{Sequence, service}; /// Serialization helper fn read_last_error(v: &std::sync::Mutex>, serializer: S) -> Result @@ -143,6 +132,10 @@ pub(crate) struct TenantShard { /// of state that we publish externally in an eventually consistent way. pub(crate) pending_compute_notification: bool, + /// To do a graceful migration, set this field to the destination pageserver, and optimization + /// functions will consider this node the best location and react appropriately. + preferred_node: Option, + // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. 
scheduling_policy: ShardSchedulingPolicy, @@ -335,6 +328,37 @@ impl IntentState { false } } + + pub(crate) fn set_preferred_az( + &mut self, + scheduler: &mut Scheduler, + preferred_az: Option, + ) { + let new_az = preferred_az.as_ref(); + let old_az = self.preferred_az_id.as_ref(); + + if old_az != new_az { + if let Some(node_id) = self.attached { + scheduler.update_node_ref_counts( + node_id, + new_az, + RefCountUpdate::ChangePreferredAzFrom(old_az), + ); + } + for node_id in &self.secondary { + scheduler.update_node_ref_counts( + *node_id, + new_az, + RefCountUpdate::ChangePreferredAzFrom(old_az), + ); + } + self.preferred_az_id = preferred_az; + } + } + + pub(crate) fn get_preferred_az(&self) -> Option<&AvailabilityZone> { + self.preferred_az_id.as_ref() + } } impl Drop for IntentState { @@ -566,6 +590,7 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), + preferred_node: None, } } @@ -820,6 +845,15 @@ impl TenantShard { return None; }; + // If the candidate is our preferred node, then it is better than the current location, as long + // as it is online -- the online check is part of the score calculation we did above, so it's + // important that this check comes after that one. + if let Some(preferred) = self.preferred_node.as_ref() { + if preferred == &candidate { + return Some(true); + } + } + match scheduler.compute_node_score::( current, &self.intent.preferred_az_id, @@ -835,7 +869,9 @@ impl TenantShard { let current_score = current_score.for_optimization(); if candidate_score < current_score { - tracing::info!("Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})"); + tracing::info!( + "Found a lower scoring location! 
{candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})" + ); Some(true) } else { // The candidate node is no better than our current location, so don't migrate @@ -856,13 +892,22 @@ impl TenantShard { } } - fn find_better_location( + pub(crate) fn find_better_location( &self, scheduler: &mut Scheduler, schedule_context: &ScheduleContext, current: NodeId, hard_exclude: &[NodeId], ) -> Option { + // If we have a migration hint, then that is our better location + if let Some(hint) = self.preferred_node.as_ref() { + if hint == ¤t { + return None; + } + + return Some(*hint); + } + // Look for a lower-scoring location to attach to let Ok(candidate_node) = scheduler.schedule_shard::( hard_exclude, @@ -896,6 +941,13 @@ impl TenantShard { scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> bool { + // Tenant with preferred node: check if it is not already at the preferred node + if let Some(preferred) = self.preferred_node.as_ref() { + if Some(preferred) != self.intent.get_attached().as_ref() { + return true; + } + } + // Sharded tenant: check if any locations have a nonzero affinity score if self.shard.count >= ShardCount(1) { let schedule_context = schedule_context.project_detach(self); @@ -936,6 +988,9 @@ impl TenantShard { /// Optimize attachments: if a shard has a secondary location that is preferable to /// its primary location based on soft constraints, switch that secondary location /// to be attached. + /// + /// `schedule_context` should have been populated with all shards in the tenant, including + /// the one we're trying to optimize (this function will subtract its own contribution before making scoring decisions) #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn optimize_attachment( &self, @@ -1005,7 +1060,7 @@ impl TenantShard { // most cases, even if some nodes are offline or have scheduling=pause set. 
debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this - // logic presumes we are in a mode where we want secondaries to be in non-home AZ + // logic presumes we are in a mode where we want secondaries to be in non-home AZ if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| { let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id; let is_available = secondary_scores @@ -1029,7 +1084,8 @@ impl TenantShard { } // Fall through: we didn't identify one to remove. This ought to be rare. - tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", + tracing::warn!( + "Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", self.intent.get_secondary() ); } else { @@ -1063,7 +1119,8 @@ impl TenantShard { // // This should be a transient state, there should always be capacity eventually in our preferred AZ (even if nodes // there are too overloaded for scheduler to suggest them, more should be provisioned eventually). - if self.intent.preferred_az_id.is_some() + if self.preferred_node.is_none() + && self.intent.preferred_az_id.is_some() && scheduler.get_node_az(&replacement) != self.intent.preferred_az_id { tracing::debug!( @@ -1169,6 +1226,27 @@ impl TenantShard { None } + /// Start or abort a graceful migration of this shard to another pageserver. This works on top of the + /// other optimisation functions, to bias them to move to the destination node. + pub(crate) fn set_preferred_node(&mut self, node: Option) { + if let Some(hint) = self.preferred_node.as_ref() { + if Some(hint) != node.as_ref() { + // This is legal but a bit surprising: we expect that administrators wouldn't usually + // change their mind about where to migrate something. 
+ tracing::warn!( + "Changing migration destination from {hint} to {node:?} (current intent {:?})", + self.intent + ); + } + } + + self.preferred_node = node; + } + + pub(crate) fn get_preferred_node(&self) -> Option { + self.preferred_node + } + /// Return true if the optimization was really applied: it will not be applied if the optimization's /// sequence is behind this tenant shard's pub(crate) fn apply_optimization( @@ -1193,6 +1271,14 @@ impl TenantShard { self.intent.demote_attached(scheduler, old_attached_node_id); self.intent .promote_attached(scheduler, new_attached_node_id); + + if let Some(hint) = self.preferred_node.as_ref() { + if hint == &new_attached_node_id { + // The migration target is not a long term pin: once we are done with the migration, clear it. + tracing::info!("Graceful migration to {hint} complete"); + self.preferred_node = None; + } + } } ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id, @@ -1711,6 +1797,10 @@ impl TenantShard { debug_assert!(!self.intent.all_pageservers().contains(&node_id)); + if self.preferred_node == Some(node_id) { + self.preferred_node = None; + } + intent_modified } @@ -1718,8 +1808,8 @@ impl TenantShard { self.scheduling_policy = p; } - pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy { - &self.scheduling_policy + pub(crate) fn get_scheduling_policy(&self) -> ShardSchedulingPolicy { + self.scheduling_policy } pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) { @@ -1758,6 +1848,7 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), + preferred_node: None, }) } @@ -1778,11 +1869,15 @@ impl TenantShard { } pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> { - self.intent.preferred_az_id.as_ref() + self.intent.get_preferred_az() } - pub(crate) fn set_preferred_az(&mut self, preferred_az_id: Option) { - 
self.intent.preferred_az_id = preferred_az_id; + pub(crate) fn set_preferred_az( + &mut self, + scheduler: &mut Scheduler, + preferred_az_id: Option, + ) { + self.intent.set_preferred_az(scheduler, preferred_az_id); } /// Returns all the nodes to which this tenant shard is attached according to the @@ -1798,8 +1893,8 @@ impl TenantShard { let conf = observed.conf.as_ref()?; match (conf.generation, conf.mode) { - (Some(gen), AttachedMulti | AttachedSingle | AttachedStale) => { - Some((*node_id, gen)) + (Some(gen_), AttachedMulti | AttachedSingle | AttachedStale) => { + Some((*node_id, gen_)) } _ => None, } @@ -1807,7 +1902,7 @@ impl TenantShard { .sorted_by(|(_lhs_node_id, lhs_gen), (_rhs_node_id, rhs_gen)| { lhs_gen.cmp(rhs_gen).reverse() }) - .map(|(node_id, gen)| (node_id, Generation::new(gen))) + .map(|(node_id, gen_)| (node_id, Generation::new(gen_))) .collect() } @@ -1839,7 +1934,10 @@ impl TenantShard { (Some(crnt), Some(new)) if crnt_gen > new_gen => { tracing::warn!( "Skipping observed state update {}: {:?} and using None due to stale generation ({} > {})", - node_id, loc, crnt, new + node_id, + loc, + crnt, + new ); self.observed @@ -1896,18 +1994,17 @@ impl Drop for TenantShard { #[cfg(test)] pub(crate) mod tests { - use std::{cell::RefCell, rc::Rc}; + use std::cell::RefCell; + use std::rc::Rc; - use pageserver_api::{ - controller_api::NodeAvailability, - shard::{ShardCount, ShardNumber}, - }; - use rand::{rngs::StdRng, SeedableRng}; + use pageserver_api::controller_api::NodeAvailability; + use pageserver_api::shard::{ShardCount, ShardNumber}; + use rand::SeedableRng; + use rand::rngs::StdRng; use utils::id::TenantId; - use crate::scheduler::test_utils::make_test_nodes; - use super::*; + use crate::scheduler::test_utils::make_test_nodes; fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard { let tenant_id = TenantId::generate(); @@ -2085,16 +2182,20 @@ pub(crate) mod tests { // In pause mode, schedule() shouldn't do anything 
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause; - assert!(tenant_shard - .schedule(&mut scheduler, &mut ScheduleContext::default()) - .is_ok()); + assert!( + tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok() + ); assert!(tenant_shard.intent.all_pageservers().is_empty()); // In active mode, schedule() works tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active; - assert!(tenant_shard - .schedule(&mut scheduler, &mut ScheduleContext::default()) - .is_ok()); + assert!( + tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok() + ); assert!(!tenant_shard.intent.all_pageservers().is_empty()); tenant_shard.intent.clear(&mut scheduler); @@ -2272,6 +2373,85 @@ pub(crate) mod tests { Ok(()) } + #[test] + /// How the optimisation code handles a shard with a preferred node set; this is an example + /// of the multi-step migration, but driven by a different input. + fn optimize_attachment_multi_preferred_node() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 4, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-b".to_string()), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Two shards of a tenant that wants to be in AZ A + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + + // Initially attached in a stable location + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + + // Set the preferred node to node 2, an equally high scoring node to its current location + shard_a.preferred_node = Some(NodeId(2)); + + fn make_schedule_context(shard_a: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + 
schedule_context + } + + let schedule_context = make_schedule_context(&shard_a); + let optimization_a_prepare = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(2)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + // The first step of the optimisation should not have cleared the preferred node + assert_eq!(shard_a.preferred_node, Some(NodeId(2))); + + let schedule_context = make_schedule_context(&shard_a); + let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(2) + }) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + // The cutover step of the optimisation should have cleared the preferred node + assert_eq!(shard_a.preferred_node, None); + + let schedule_context = make_schedule_context(&shard_a); + let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(1)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + shard_a.intent.clear(&mut scheduler); + + Ok(()) + } + #[test] /// Check that multi-step migration works when moving to somewhere that is only better by /// 1 AffinityScore -- this ensures that we don't have a bug like the intermediate secondary @@ -2621,9 +2801,11 @@ pub(crate) mod tests { ); let mut schedule_context = ScheduleContext::default(); for shard in &mut shards { - assert!(shard - 
.schedule(&mut scheduler, &mut schedule_context) - .is_ok()); + assert!( + shard + .schedule(&mut scheduler, &mut schedule_context) + .is_ok() + ); } // Initial: attached locations land in the tenant's home AZ. diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 609f3bf009..7f6544b894 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_scrubber" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index b42709868b..f0ba632fd4 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,12 +1,19 @@ use std::collections::{HashMap, HashSet}; use std::time::SystemTime; +use futures_util::StreamExt; use itertools::Itertools; +use pageserver::tenant::IndexPart; use pageserver::tenant::checks::check_valid_layermap; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::manifest::TenantManifest; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; +use pageserver::tenant::storage_layer::LayerName; use pageserver_api::shard::ShardIndex; +use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; @@ -15,14 +22,7 @@ use utils::shard::TenantShardId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; -use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; -use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::{ - parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, -}; -use pageserver::tenant::storage_layer::LayerName; -use 
pageserver::tenant::IndexPart; -use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use crate::{RootTarget, TenantShardTimelineId, download_object_with_retries}; pub(crate) struct TimelineAnalysis { /// Anomalies detected @@ -329,11 +329,11 @@ pub(crate) enum BlobDataParseResult { pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? - Some((layer_filename, gen)) if gen.len() == 8 => { + Some((layer_filename, gen_)) if gen_.len() == 8 => { let layer = layer_filename.parse::()?; - let gen = - Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; - Ok((layer, gen)) + let gen_ = + Generation::parse_suffix(gen_).ok_or("Malformed generation suffix".to_string())?; + Ok((layer, gen_)) } _ => Ok((name.parse::()?, Generation::none())), } @@ -423,9 +423,9 @@ async fn list_timeline_blobs_impl( tracing::info!("initdb archive preserved {key}"); } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { - Ok((new_layer, gen)) => { - tracing::debug!("Parsed layer key: {new_layer} {gen:?}"); - s3_layers.insert((new_layer, gen)); + Ok((new_layer, gen_)) => { + tracing::debug!("Parsed layer key: {new_layer} {gen_:?}"); + s3_layers.insert((new_layer, gen_)); } Err(e) => { tracing::info!("Error parsing {maybe_layer_name} as layer name: {e}"); @@ -465,7 +465,7 @@ async fn list_timeline_blobs_impl( .max_by_key(|i| i.1) .map(|(k, g)| (k.clone(), g)) { - Some((key, gen)) => (Some::(key.to_owned()), gen), + Some((key, gen_)) => (Some::(key.to_owned()), gen_), None => { // Legacy/missing case: one or zero index parts, which did not have a generation (index_part_keys.pop(), Generation::none()) @@ -521,7 +521,7 @@ async fn list_timeline_blobs_impl( }, unused_index_keys: index_part_keys, unknown_keys, - })) + })); } Err(index_parse_error) => errors.push(format!( "index_part.json body parsing error: 
{index_parse_error}" @@ -631,7 +631,7 @@ pub(crate) async fn list_tenant_manifests( .map(|(g, obj)| (*g, obj.clone())) .unwrap(); - manifests.retain(|(gen, _obj)| gen != &latest_generation); + manifests.retain(|(gen_, _obj)| gen_ != &latest_generation); let manifest_bytes = match download_object_with_retries(remote_client, &latest_listing_object.key).await { diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index b1dfe3a53f..5cf286c662 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -3,11 +3,9 @@ use std::error::Error as _; use chrono::{DateTime, Utc}; use futures::Future; use hex::FromHex; - -use reqwest::{header, Client, StatusCode, Url}; +use reqwest::{Client, StatusCode, Url, header}; use serde::Deserialize; use tokio::sync::Semaphore; - use tokio_util::sync::CancellationToken; use utils::backoff; use utils::id::{TenantId, TimelineId}; diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 95d3af1453..efb05fb55e 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -5,10 +5,9 @@ use pageserver::tenant::storage_layer::LayerName; use remote_storage::ListingMode; use serde::{Deserialize, Serialize}; -use crate::{ - checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants, - stream_objects_with_retries, BucketConfig, NodeKind, -}; +use crate::checks::parse_layer_object_name; +use crate::metadata_stream::stream_tenants; +use crate::{BucketConfig, NodeKind, init_remote, stream_objects_with_retries}; #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] enum LargeObjectKind { diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index a4e5107e3d..e4f69a1669 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -3,11 +3,9 @@ //! 
Garbage means S3 objects which are either not referenced by any metadata, //! or are referenced by a control plane tenant/timeline in a deleted state. -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Duration; use anyhow::Context; use futures_util::TryStreamExt; @@ -16,13 +14,14 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::TenantId}; +use utils::backoff; +use utils::id::TenantId; +use crate::cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix}; use crate::{ - cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, + BucketConfig, ConsoleConfig, MAX_RETRIES, NodeKind, TenantShardTimelineId, TraversingDepth, init_remote, list_objects_with_retries, - metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix}, - BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES, }; #[derive(Serialize, Deserialize, Debug)] @@ -259,14 +258,21 @@ async fn find_garbage_inner( .await?; if let Some(object) = tenant_objects.keys.first() { if object.key.get_path().as_str().ends_with("heatmap-v1.json") { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)" + ); garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); continue; } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + tracing::info!( + "Tenant {tenant_shard_id} is missing in console and contains one object: {}", + object.key + ); } } else 
{ - tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran"); + tracing::info!( + "Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran" + ); } } else { // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial @@ -295,9 +301,13 @@ async fn find_garbage_inner( } if any_non_initdb { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb" + ); } else { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb" + ); garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); continue; } @@ -546,7 +556,9 @@ pub async fn purge_garbage( .any(|g| matches!(g.entity, GarbageEntity::Timeline(_))) && garbage_list.active_timeline_count == 0 { - anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"); + anyhow::bail!( + "Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines" + ); } let filtered_items = garbage_list diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 224235098c..34e43fcc0b 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -17,15 +17,14 @@ use std::time::{Duration, SystemTime}; use anyhow::Context; use aws_config::retry::{RetryConfigBuilder, RetryMode}; +use aws_sdk_s3::Client; use aws_sdk_s3::config::Region; use aws_sdk_s3::error::DisplayErrorContext; -use aws_sdk_s3::Client; - use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use futures::{Stream, StreamExt}; 
-use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver::tenant::TENANTS_SEGMENT_NAME; +use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver_api::shard::TenantShardId; use remote_storage::{ DownloadOpts, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, @@ -38,7 +37,8 @@ use tokio::io::AsyncReadExt; use tokio_util::sync::CancellationToken; use tracing::{error, warn}; use tracing_appender::non_blocking::WorkerGuard; -use tracing_subscriber::{fmt, prelude::*, EnvFilter}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::{EnvFilter, fmt}; use utils::fs_ext; use utils::id::{TenantId, TenantTimelineId, TimelineId}; @@ -411,10 +411,10 @@ async fn init_remote( let default_prefix = default_prefix_in_bucket(node_kind).to_string(); match &mut storage_config.0.storage { - RemoteStorageKind::AwsS3(ref mut config) => { + RemoteStorageKind::AwsS3(config) => { config.prefix_in_bucket.get_or_insert(default_prefix); } - RemoteStorageKind::AzureContainer(ref mut config) => { + RemoteStorageKind::AzureContainer(config) => { config.prefix_in_container.get_or_insert(default_prefix); } RemoteStorageKind::LocalFs { .. 
} => (), diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index fa6ee90b66..fb2ab02565 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,24 +1,20 @@ -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow, bail}; use camino::Utf8PathBuf; +use clap::{Parser, Subcommand}; use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; use reqwest::{Method, Url}; use storage_controller_client::control_api; -use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use storage_scrubber::pageserver_physical_gc::GcMode; +use storage_scrubber::garbage::{PurgeMode, find_garbage, purge_garbage}; +use storage_scrubber::pageserver_physical_gc::{GcMode, pageserver_physical_gc}; use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata; -use storage_scrubber::scan_safekeeper_metadata::DatabaseOrList; +use storage_scrubber::scan_safekeeper_metadata::{DatabaseOrList, scan_safekeeper_metadata}; use storage_scrubber::tenant_snapshot::SnapshotDownloader; -use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ - init_logging, pageserver_physical_gc::pageserver_physical_gc, - scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, - TraversingDepth, + BucketConfig, ConsoleConfig, ControllerClientConfig, NodeKind, TraversingDepth, + find_large_objects, init_logging, }; - -use clap::{Parser, Subcommand}; use utils::id::TenantId; - use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); @@ -173,15 +169,23 @@ async fn main() -> anyhow::Result<()> { if let NodeKind::Safekeeper = node_kind { let db_or_list = match (timeline_lsns, dump_db_connstr) { (Some(timeline_lsns), _) => { - let timeline_lsns = serde_json::from_str(&timeline_lsns).context("parsing timeline_lsns")?; + let timeline_lsns = 
serde_json::from_str(&timeline_lsns) + .context("parsing timeline_lsns")?; DatabaseOrList::List(timeline_lsns) } (None, Some(dump_db_connstr)) => { - let dump_db_table = dump_db_table.ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; + let dump_db_table = dump_db_table + .ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; let tenant_ids = tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(); - DatabaseOrList::Database { tenant_ids, connstr: dump_db_connstr, table: dump_db_table } + DatabaseOrList::Database { + tenant_ids, + connstr: dump_db_connstr, + table: dump_db_table, + } } - (None, None) => anyhow::bail!("neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`"), + (None, None) => anyhow::bail!( + "neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`" + ), }; let summary = scan_safekeeper_metadata(bucket_config.clone(), db_or_list).await?; if json { @@ -371,7 +375,9 @@ pub async fn scan_pageserver_metadata_cmd( exit_code: bool, ) -> anyhow::Result<()> { if controller_client.is_none() && post_to_storcon { - return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); + return Err(anyhow!( + "Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run" + )); } match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await { Err(e) => { diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index 47447d681c..af2407856d 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -1,17 +1,17 @@ use std::str::FromStr; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use async_stream::{stream, try_stream}; use futures::StreamExt; +use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, ListingMode, 
ListingObject, RemotePath}; use tokio_stream::Stream; +use utils::id::{TenantId, TimelineId}; use crate::{ - list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target, - TenantShardTimelineId, + RootTarget, S3Target, TenantShardTimelineId, list_objects_with_retries, + stream_objects_with_retries, }; -use pageserver_api::shard::TenantShardId; -use utils::id::{TenantId, TimelineId}; /// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes pub fn stream_tenants<'a>( diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 063c6bcfb9..f14341c7bc 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -2,22 +2,16 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::Duration; -use crate::checks::{ - list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult, - RemoteTenantManifestInfo, -}; -use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; use async_stream::try_stream; use futures::future::Either; use futures_util::{StreamExt, TryStreamExt}; +use pageserver::tenant::IndexPart; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest; use pageserver::tenant::remote_timeline_client::{ parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, }; use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; use pageserver_api::controller_api::TenantDescribeResponse; use pageserver_api::shard::{ShardIndex, TenantShardId}; use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; @@ -25,11 +19,18 @@ use reqwest::Method; use serde::Serialize; use 
storage_controller_client::control_api; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; +use tracing::{Instrument, info_span}; use utils::backoff; use utils::generation::Generation; use utils::id::{TenantId, TenantTimelineId}; +use crate::checks::{ + BlobDataParseResult, ListTenantManifestResult, RemoteTenantManifestInfo, list_tenant_manifests, + list_timeline_blobs, +}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{BucketConfig, MAX_RETRIES, NodeKind, RootTarget, TenantShardTimelineId, init_remote}; + #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, @@ -151,10 +152,8 @@ impl TenantRefAccumulator { } } - if !ancestor_refs.is_empty() { - tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len()); - self.ancestor_ref_shards.update(ttid, ancestor_refs); - } + tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len()); + self.ancestor_ref_shards.update(ttid, ancestor_refs); } /// Consume Self and return a vector of ancestor tenant shards that should be GC'd, and map of referenced ancestor layers to preserve @@ -778,7 +777,7 @@ pub async fn pageserver_physical_gc( let mut summary = GcSummary::default(); { - let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); let timelines = timelines.map_ok(|(ttid, tenant_manifest_arc)| { @@ -792,8 +791,8 @@ pub async fn pageserver_physical_gc( tenant_manifest_arc, ) }); - let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); - + let timelines = timelines.try_buffered(CONCURRENCY); + let mut timelines = std::pin::pin!(timelines); // Drain futures for per-shard GC, populating accumulator as a side effect while let Some(i) = timelines.next().await { summary.merge(i?); diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index 
a31fb5b242..ba75f25984 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -1,21 +1,22 @@ use std::collections::{HashMap, HashSet}; -use crate::checks::{ - branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, - RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis, -}; -use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver_api::controller_api::MetadataHealthUpdateRequest; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::Serialize; -use tracing::{info_span, Instrument}; +use tracing::{Instrument, info_span}; use utils::id::TenantId; use utils::shard::ShardCount; +use crate::checks::{ + BlobDataParseResult, RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis, + branch_cleanup_and_check_errors, list_timeline_blobs, +}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, init_remote}; + #[derive(Serialize, Default)] pub struct MetadataSummary { tenant_count: usize, diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 0a4d4266a0..f10d758097 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,23 +1,24 @@ -use std::{collections::HashSet, str::FromStr, sync::Arc}; +use std::collections::HashSet; +use std::str::FromStr; +use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; -use postgres_ffi::{XLogFileName, PG_TLI}; +use 
postgres_ffi::{PG_TLI, XLogFileName}; use remote_storage::GenericRemoteStorage; use rustls::crypto::ring; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{debug, error, info}; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use crate::cloud_admin_api::CloudAdminApiClient; +use crate::metadata_stream::stream_listing; use crate::{ - cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, init_remote, }; /// Generally we should ask safekeepers, but so far we use everywhere default 16MB. diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 60e79fb859..e17409c20e 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -1,25 +1,26 @@ use std::collections::HashMap; use std::sync::Arc; -use crate::checks::{list_timeline_blobs, BlobDataParseResult, RemoteTimelineBlobData}; -use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; -use crate::{ - download_object_to_file_s3, init_remote, init_remote_s3, BucketConfig, NodeKind, RootTarget, - TenantShardTimelineId, -}; use anyhow::Context; use async_stream::stream; use aws_sdk_s3::Client; use camino::Utf8PathBuf; use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::IndexPart; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, S3Config}; use utils::generation::Generation; use utils::id::TenantId; +use crate::checks::{BlobDataParseResult, RemoteTimelineBlobData, list_timeline_blobs}; +use 
crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; +use crate::{ + BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, download_object_to_file_s3, + init_remote, init_remote_s3, +}; + pub struct SnapshotDownloader { s3_client: Arc, s3_root: RootTarget, diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 425abef935..205b9141e0 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -19,7 +19,7 @@ if TYPE_CHECKING: class ComputeReconfigure: def __init__(self, server: HTTPServer): self.server = server - self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" + self.control_plane_hooks_api = f"http://{server.host}:{server.port}/" self.workloads: dict[TenantId, Any] = {} self.on_notify: Callable[[Any], None] | None = None diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index cdc162fca2..9b28246f58 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -53,6 +53,18 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.text + # Current compute status. + def status(self): + res = self.get(f"http://localhost:{self.external_port}/status") + res.raise_for_status() + return res.json() + + # Compute startup-related metrics. + def metrics_json(self): + res = self.get(f"http://localhost:{self.external_port}/metrics.json") + res.raise_for_status() + return res.json() + def configure_failpoints(self, *args: tuple[str, str]) -> None: body: list[dict[str, str]] = [] diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 83a1a87611..54e6458ac6 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -175,6 +175,9 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( counter("pageserver_tenant_throttling_count"), counter("pageserver_timeline_wal_records_received"), counter("pageserver_page_service_pagestream_flush_in_progress_micros"), + counter("pageserver_wait_lsn_in_progress_micros"), + counter("pageserver_wait_lsn_started_count"), + counter("pageserver_wait_lsn_finished_count"), *histogram("pageserver_page_service_batch_size"), *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 97a5a36814..6e53987e7c 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -525,12 +525,14 @@ class NeonLocalCli(AbstractNeonCli): def endpoint_start( self, endpoint_id: str, + safekeepers_generation: int | None = None, safekeepers: list[int] | None = None, remote_ext_config: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, create_test_user: bool = False, basebackup_request_tries: int | None = None, + timeout: str | None = None, env: dict[str, str] | None = None, ) -> subprocess.CompletedProcess[str]: args = [ @@ -543,6 +545,8 @@ class NeonLocalCli(AbstractNeonCli): if remote_ext_config is not None: args.extend(["--remote-ext-config", remote_ext_config]) + if safekeepers_generation is not None: + args.extend(["--safekeepers-generation", str(safekeepers_generation)]) if safekeepers is not None: args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) if endpoint_id is not None: @@ -553,6 +557,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--allow-multiple"]) if create_test_user: args.extend(["--create-test-user"]) + if timeout is not None: + args.extend(["--start-timeout", str(timeout)]) res = self.raw_cli(args, extra_env_vars) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1d282971b1..deff02f0f9 100644 --- 
a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -253,10 +253,15 @@ class PgProtocol: # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. + # pooler does not support statement_timeout + # Check if the hostname contains the string 'pooler' + hostname = result.get("host", "") + log.info(f"Hostname: {hostname}") options = result.get("options", "") - if "statement_timeout" not in options: + if "statement_timeout" not in options and "pooler" not in hostname: options = f"-cstatement_timeout=120s {options}" result["options"] = options + return result # autocommit=True here by default because that's what we need most of the time @@ -455,9 +460,16 @@ class NeonEnvBuilder: self.overlay_mounts_created_by_us: list[tuple[str, Path]] = [] self.config_init_force: str | None = None self.top_output_dir = top_output_dir - self.control_plane_compute_hook_api: str | None = None + self.control_plane_hooks_api: str | None = None self.storage_controller_config: dict[Any, Any] | None = None + # Flag to enable https listener in pageserver, generate local ssl certs, + # and force storage controller to use https for pageserver api. + self.use_https_pageserver_api: bool = False + # Flag to enable https listener in safekeeper, generate local ssl certs, + # and force storage controller to use https for safekeeper api. 
+ self.use_https_safekeeper_api: bool = False + self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine self.pageserver_get_vectored_concurrent_io: str | None = ( pageserver_get_vectored_concurrent_io @@ -1054,6 +1066,13 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline + self.generate_local_ssl_certs = ( + config.use_https_pageserver_api or config.use_https_safekeeper_api + ) + self.ssl_ca_file = ( + self.repo_dir.joinpath("rootCA.crt") if self.generate_local_ssl_certs else None + ) + neon_local_env_vars = {} if self.rust_log_override is not None: neon_local_env_vars["RUST_LOG"] = self.rust_log_override @@ -1102,7 +1121,7 @@ class NeonEnv: self.control_plane_api: str = self.storage_controller.upcall_api_endpoint() # For testing this with a fake HTTP server, enable passing through a URL from config - self.control_plane_compute_hook_api = config.control_plane_compute_hook_api + self.control_plane_hooks_api = config.control_plane_hooks_api self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode @@ -1117,16 +1136,27 @@ class NeonEnv: }, "safekeepers": [], "pageservers": [], + "generate_local_ssl_certs": self.generate_local_ssl_certs, } if self.control_plane_api is not None: cfg["control_plane_api"] = self.control_plane_api - if self.control_plane_compute_hook_api is not None: - cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + if self.control_plane_hooks_api is not None: + cfg["control_plane_hooks_api"] = self.control_plane_hooks_api - if self.storage_controller_config is not None: - cfg["storage_controller"] = self.storage_controller_config + storage_controller_config = self.storage_controller_config + + if config.use_https_pageserver_api: + storage_controller_config = storage_controller_config or {} + 
storage_controller_config["use_https_pageserver_api"] = True + + if config.use_https_safekeeper_api: + storage_controller_config = storage_controller_config or {} + storage_controller_config["use_https_safekeeper_api"] = True + + if storage_controller_config is not None: + cfg["storage_controller"] = storage_controller_config # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -1137,6 +1167,7 @@ class NeonEnv: pageserver_port = PageserverPort( pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), + https=self.port_distributor.get_port() if config.use_https_pageserver_api else None, ) # Availabilty zones may also be configured manually with `NeonEnvBuilder.pageserver_config_override` @@ -1151,12 +1182,17 @@ class NeonEnv: "id": ps_id, "listen_pg_addr": f"localhost:{pageserver_port.pg}", "listen_http_addr": f"localhost:{pageserver_port.http}", + "listen_https_addr": f"localhost:{pageserver_port.https}" + if config.use_https_pageserver_api + else None, "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, "availability_zone": availability_zone, # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, + # Look for gaps in WAL received from safekeepeers + "validate_wal_contiguity": True, } # Batching (https://github.com/neondatabase/neon/issues/9377): @@ -1167,14 +1203,6 @@ class NeonEnv: "max_batch_size": 32, } - if config.test_may_use_compatibility_snapshot_binaries: - log.info( - "Skipping WAL contiguity validation to avoid forward-compatibility related test failures" - ) - else: - # Look for gaps in WAL received from safekeepeers - ps_cfg["validate_wal_contiguity"] = True - get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if get_vectored_concurrent_io is not None: ps_cfg["get_vectored_concurrent_io"] = { @@ -1189,6 +1217,9 @@ class 
NeonEnv: config.pageserver_default_tenant_config_compaction_algorithm ) + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["rel_size_v2_enabled"] = True # Enable relsize_v2 by default in tests + if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( self.pageserver_remote_storage @@ -1226,6 +1257,7 @@ class NeonEnv: pg=self.port_distributor.get_port(), pg_tenant_only=self.port_distributor.get_port(), http=self.port_distributor.get_port(), + https=self.port_distributor.get_port() if config.use_https_safekeeper_api else None, ) id = config.safekeepers_id_start + i # assign ids sequentially sk_cfg: dict[str, Any] = { @@ -1233,6 +1265,7 @@ class NeonEnv: "pg_port": port.pg, "pg_tenant_only_port": port.pg_tenant_only, "http_port": port.http, + "https_port": port.https, "sync": config.safekeepers_enable_fsync, } if config.auth_enabled: @@ -1713,8 +1746,12 @@ class StorageControllerLeadershipStatus(StrEnum): @dataclass class StorageControllerMigrationConfig: - secondary_warmup_timeout: str | None - secondary_download_request_timeout: str | None + # Unlike the API itself, tests default to prewarm=False because it's a simpler API and doesn't + # require the test to go poll for the migration actually completing. 
+ prewarm: bool = False + override_scheduler: bool = False + secondary_warmup_timeout: str | None = None + secondary_download_request_timeout: str | None = None class NeonStorageController(MetricsGetter, LogUtils): @@ -2118,8 +2155,10 @@ class NeonStorageController(MetricsGetter, LogUtils): config: StorageControllerMigrationConfig | None = None, ): payload = {"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id} - if config is not None: - payload["migration_config"] = dataclasses.asdict(config) + if config is None: + config = StorageControllerMigrationConfig() + + payload["migration_config"] = dataclasses.asdict(config) self.request( "PUT", @@ -2127,8 +2166,13 @@ class NeonStorageController(MetricsGetter, LogUtils): json=payload, headers=self.headers(TokenScope.ADMIN), ) - log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") - assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id + if config.prewarm: + log.info( + f"Started prewarm migration of tenant {tenant_shard_id} to pageserver {dest_ps_id}" + ) + else: + log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") + assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id def tenant_policy_update(self, tenant_id: TenantId, body: dict[str, Any]): log.info(f"tenant_policy_update({tenant_id}, {body})") @@ -2469,12 +2513,21 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] - def download_heatmap_layers(self, tenant_shard_id: TenantShardId, timeline_id: TimelineId): + def download_heatmap_layers( + self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, recurse: bool | None = None + ): + url = ( + f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers" + ) + if recurse is not None: + url = url + f"?recurse={str(recurse).lower()}" + response = self.request( "POST", - 
f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + url, headers=self.headers(TokenScope.ADMIN), ) + response.raise_for_status() def __enter__(self) -> Self: @@ -3592,6 +3645,7 @@ class NeonProxy(PgProtocol): "project_id": "test_project_id", "endpoint_id": "test_endpoint_id", "branch_id": "test_branch_id", + "compute_id": "test_compute_id", }, } }, @@ -3817,6 +3871,7 @@ def static_auth_broker( { "address": local_proxy_addr, "aux": { + "compute_id": "compute-foo-bar-1234-5678", "endpoint_id": "ep-foo-bar-1234", "branch_id": "br-foo-bar", "project_id": "foo-bar", @@ -3987,10 +4042,12 @@ class Endpoint(PgProtocol, LogUtils): self, remote_ext_config: str | None = None, pageserver_id: int | None = None, + safekeeper_generation: int | None = None, safekeepers: list[int] | None = None, allow_multiple: bool = False, create_test_user: bool = False, basebackup_request_tries: int | None = None, + timeout: str | None = None, env: dict[str, str] | None = None, ) -> Self: """ @@ -4000,19 +4057,21 @@ class Endpoint(PgProtocol, LogUtils): assert self.endpoint_id is not None - # If `safekeepers` is not None, they are remember them as active and use - # in the following commands. + # If `safekeepers` is not None, remember them as active and use in the + # following commands. 
if safekeepers is not None: self.active_safekeepers = safekeepers self.env.neon_cli.endpoint_start( self.endpoint_id, + safekeepers_generation=safekeeper_generation, safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, create_test_user=create_test_user, basebackup_request_tries=basebackup_request_tries, + timeout=timeout, env=env, ) self._running.release(1) @@ -4427,6 +4486,7 @@ class SafekeeperPort: pg: int pg_tenant_only: int http: int + https: int | None @dataclass @@ -4523,33 +4583,6 @@ class Safekeeper(LogUtils): for na in not_allowed: assert not self.log_contains(na) - def append_logical_message( - self, tenant_id: TenantId, timeline_id: TimelineId, request: dict[str, Any] - ) -> dict[str, Any]: - """ - Send JSON_CTRL query to append LogicalMessage to WAL and modify - safekeeper state. It will construct LogicalMessage from provided - prefix and message, and then will write it to WAL. - """ - - # "replication=0" hacks psycopg not to send additional queries - # on startup, see https://github.com/psycopg/psycopg2/pull/482 - token = self.env.auth_keys.generate_tenant_token(tenant_id) - connstr = f"host=localhost port={self.port.pg} password={token} replication=0 options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" - - with closing(psycopg2.connect(connstr)) as conn: - # server doesn't support transactions - conn.autocommit = True - with conn.cursor() as cur: - request_json = json.dumps(request) - log.info(f"JSON_CTRL request on port {self.port.pg}: {request_json}") - cur.execute("JSON_CTRL " + request_json) - all = cur.fetchall() - log.info(f"JSON_CTRL response: {all[0][0]}") - res = json.loads(all[0][0]) - assert isinstance(res, dict) - return res - def http_client( self, auth_token: str | None = None, gen_sk_wide_token: bool = True ) -> SafekeeperHttpClient: diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py 
index 748ac0d569..abddfa2768 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -94,7 +94,9 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*Flushed oversized open layer with size.*", # During teardown, we stop the storage controller before the pageservers, so pageservers # can experience connection errors doing background deletion queue work. - ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*", + ".*WARN deletion backend:.* storage controller upcall failed, will retry.*error sending request.*", + # Can happen when the pageserver starts faster than the storage controller + ".*WARN init_tenant_mgr:.* storage controller upcall failed, will retry.*error sending request.*", # Can happen when the test shuts down the storage controller while it is calling the utilization API ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown @@ -122,6 +124,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # controller's attempts to notify the endpoint). ".*reconciler.*neon_local notification hook failed.*", ".*reconciler.*neon_local error.*", + # Tenant rate limits may fire in tests that submit lots of API requests. 
+ ".*tenant \\S+ is rate limited.*", ] diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 364aff325d..13cab448f3 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -375,6 +375,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params) self.verbose_error(res) + def timeline_patch_index_part( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + data: dict[str, Any], + ): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/patch_index_part", + json=data, + ) + self.verbose_error(res) + return res.json() + def tenant_location_conf( self, tenant_id: TenantId | TenantShardId, @@ -1057,11 +1070,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, batch_size: int | None = None, + detach_behavior: str | None = None, **kwargs, ) -> set[TimelineId]: - params = {} + params: dict[str, Any] = {} if batch_size is not None: params["batch_size"] = batch_size + if detach_behavior: + params["detach_behavior"] = detach_behavior res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", params=params, diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 4df2b2df2b..cac84c07e7 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -282,6 +282,17 @@ class S3Storage: def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + def safekeeper_tenants_path(self) -> str: + return f"{self.prefix_in_bucket}" + + def safekeeper_tenant_path(self, tenant_id: TenantShardId | TenantId) -> str: + return 
f"{self.safekeeper_tenants_path()}/{tenant_id}" + + def safekeeper_timeline_path( + self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId + ) -> str: + return f"{self.safekeeper_tenant_path(tenant_id)}/{timeline_id}" + def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str: """ Gets the latest generation key from a list of keys. diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 493ce7334e..e409151b76 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -229,13 +229,14 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): # only_local doesn't remove segments in the remote storage. def timeline_delete( - self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False + self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False, **kwargs ) -> dict[Any, Any]: res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", params={ "only_local": str(only_local).lower(), }, + **kwargs, ) res.raise_for_status() res_json = res.json() @@ -273,10 +274,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def timeline_exclude( + self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + ) -> dict[str, Any]: + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude", + data=to.to_json(), + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def membership_switch( self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration ) -> TimelineMembershipSwitchResponse: - res = self.post( + res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", data=to.to_json(), ) diff --git a/test_runner/fixtures/safekeeper_utils.py 
b/test_runner/fixtures/safekeeper_utils.py new file mode 100644 index 0000000000..158baf7bb6 --- /dev/null +++ b/test_runner/fixtures/safekeeper_utils.py @@ -0,0 +1,92 @@ +from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonPageserver, Safekeeper +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.utils import get_dir_size + + +def is_segment_offloaded( + sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn +): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.backup_lsn >= seg_end + + +def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.flush_lsn >= lsn + + +def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) + sk_wal_size_mb = sk_wal_size / 1024 / 1024 + log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") + return sk_wal_size_mb <= target_size_mb + + +def wait_lsn_force_checkpoint( + tenant_id: TenantId, + timeline_id: TimelineId, + endpoint: Endpoint, + ps: NeonPageserver, + pageserver_conn_options=None, +): + pageserver_conn_options = pageserver_conn_options or {} + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") + + wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at_sk( + safekeeper: 
Safekeeper, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) + wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at( + lsn: Lsn, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + """ + Wait until pageserver receives given lsn, force checkpoint and wait for + upload, i.e. remote_consistent_lsn advancement. + """ + pageserver_conn_options = pageserver_conn_options or {} + + auth_token = None + if "password" in pageserver_conn_options: + auth_token = pageserver_conn_options["password"] + + # wait for the pageserver to catch up + wait_for_last_record_lsn( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) + + # force checkpoint to advance remote_consistent_lsn + ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) + + # ensure that remote_consistent_lsn is advanced + wait_for_upload( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 84d62fb877..d1b2a5a400 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -337,6 +337,8 @@ def allure_add_grafana_link(host: str, timeline_id: TimelineId, start_ms: int, e """ # We expect host to be in format like ep-holy-mouse-w2u462gi.us-east-2.aws.neon.build endpoint_id, region_id, _ = host.split(".", 2) + # Remove "-pooler" suffix if present + endpoint_id = endpoint_id.removesuffix("-pooler") params = { "orgId": 1, diff --git a/test_runner/performance/large_synthetic_oltp/IUD_one_transaction.sql b/test_runner/performance/large_synthetic_oltp/IUD_one_transaction.sql new file mode 100644 index 0000000000..4c5b3fbd11 --- /dev/null +++ 
b/test_runner/performance/large_synthetic_oltp/IUD_one_transaction.sql @@ -0,0 +1,162 @@ +\set min_id 1 +\set max_id 1500000000 +\set range_size 100 + +-- Use uniform random instead of random_zipfian +\set random_id random(:min_id, :max_id) +\set random_mar_id random(1, 65536) +\set random_delete_id random(:min_id, :max_id) + +-- Update exactly one row (if it exists) using the uniformly chosen random_id +UPDATE transaction.transaction + SET state = 'COMPLETED', + settlement_date = CURRENT_DATE, + mar_identifier = (:random_mar_id)::int + WHERE id = (:random_id)::bigint; + +-- Insert exactly one row +INSERT INTO transaction.transaction ( + user_id, + card_id, + business_id, + preceding_transaction_id, + is_last, + is_mocked, + type, + state, + network, + subnetwork, + user_transaction_time, + settlement_date, + request_amount, + amount, + currency_code, + approval_code, + response, + gpa, + gpa_order_unload, + gpa_order, + program_transfer, + fee_transfer, + peer_transfer, + msa_orders, + risk_assessment, + auto_reload, + direct_deposit, + polarity, + real_time_fee_group, + fee, + chargeback, + standin_approved_by, + acquirer_fee_amount, + funded_account_holder, + digital_wallet_token, + network_fees, + card_security_code_verification, + fraud, + cardholder_authentication_data, + currency_conversion, + merchant, + store, + card_acceptor, + acquirer, + pos, + avs, + mar_token, + mar_preceding_related_transaction_token, + mar_business_token, + mar_acting_user_token, + mar_card_token, + mar_duration, + mar_created_time, + issuer_interchange_amount, + offer_orders, + transaction_canonical_id, + mar_identifier, + created_at, + card_acceptor_mid, + card_acceptor_name, + address_verification, + issuing_product, + mar_enhanced_data_token, + standin_reason +) +SELECT + (:random_id % 100000) + 1 AS user_id, + (:random_id % 500000) + 1 AS card_id, + (:random_id % 20000) + 1 AS business_id, + NULL AS preceding_transaction_id, + (:random_id % 2) = 0 AS is_last, + (:random_id % 5) 
= 0 AS is_mocked, + 'authorization' AS type, + 'PENDING' AS state, + 'VISA' AS network, + 'VISANET' AS subnetwork, + now() - ((:random_id % 100) || ' days')::interval AS user_transaction_time, + now() - ((:random_id % 100) || ' days')::interval AS settlement_date, + random() * 1000 AS request_amount, + random() * 1000 AS amount, + 'USD' AS currency_code, + md5((:random_id)::text) AS approval_code, + '{}'::jsonb AS response, + '{}'::jsonb AS gpa, + '{}'::jsonb AS gpa_order_unload, + '{}'::jsonb AS gpa_order, + '{}'::jsonb AS program_transfer, + '{}'::jsonb AS fee_transfer, + '{}'::jsonb AS peer_transfer, + '{}'::jsonb AS msa_orders, + '{}'::jsonb AS risk_assessment, + '{}'::jsonb AS auto_reload, + '{}'::jsonb AS direct_deposit, + '{}'::jsonb AS polarity, + '{}'::jsonb AS real_time_fee_group, + '{}'::jsonb AS fee, + '{}'::jsonb AS chargeback, + NULL AS standin_approved_by, + random() * 100 AS acquirer_fee_amount, + '{}'::jsonb AS funded_account_holder, + '{}'::jsonb AS digital_wallet_token, + '{}'::jsonb AS network_fees, + '{}'::jsonb AS card_security_code_verification, + '{}'::jsonb AS fraud, + '{}'::jsonb AS cardholder_authentication_data, + '{}'::jsonb AS currency_conversion, + '{}'::jsonb AS merchant, + '{}'::jsonb AS store, + '{}'::jsonb AS card_acceptor, + '{}'::jsonb AS acquirer, + '{}'::jsonb AS pos, + '{}'::jsonb AS avs, + md5((:random_id)::text || 'token') AS mar_token, + NULL AS mar_preceding_related_transaction_token, + NULL AS mar_business_token, + NULL AS mar_acting_user_token, + NULL AS mar_card_token, + random() * 1000 AS mar_duration, + now() AS mar_created_time, + random() * 100 AS issuer_interchange_amount, + '{}'::jsonb AS offer_orders, + (:random_id % 500) + 1 AS transaction_canonical_id, + :random_id::integer AS mar_identifier, + now() AS created_at, + NULL AS card_acceptor_mid, + NULL AS card_acceptor_name, + '{}'::jsonb AS address_verification, + 'DEFAULT_PRODUCT' AS issuing_product, + NULL AS mar_enhanced_data_token, + NULL AS standin_reason 
+FROM (SELECT 1) AS dummy; + +-- Delete exactly one row using the uniformly chosen random_delete_id +WITH to_delete AS ( + SELECT id + FROM transaction.transaction + WHERE id >= (:random_delete_id)::bigint + AND id < ((:random_delete_id)::bigint + :range_size) + ORDER BY id + LIMIT 1 +) +DELETE FROM transaction.transaction +USING to_delete +WHERE transaction.transaction.id = to_delete.id; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql new file mode 100644 index 0000000000..69e6366a53 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql @@ -0,0 +1,47 @@ +\set event_type random(1,10) +\set service_key random(1, 3) + +INSERT INTO webhook.incoming_webhooks ( + created_at, + delivery_id, + upstream_emitted_at, + service_key, + event_id, + source, + body, + json, + additional_data, + is_body_encrypted, + event_type +) VALUES ( + now(), + gen_random_uuid(), + now() - interval '10 minutes', + CASE :service_key::int + WHEN 1 THEN 'shopify' + WHEN 2 THEN 'stripe' + WHEN 3 THEN 'github' + END, + 'evt_' || gen_random_uuid(), -- Ensures uniqueness + CASE :service_key::int + WHEN 1 THEN 'Shopify' + WHEN 2 THEN 'Stripe' + WHEN 3 THEN 'GitHub' + END, + '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}', + '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}'::jsonb, + '{"metadata": {"user_agent": "Mozilla/5.0", "ip_address": "203.0.113.42"}}'::jsonb, + false, + CASE :event_type::int + WHEN 1 THEN 'ORDER_PLACED' + WHEN 2 THEN 'ORDER_CANCELLED' + WHEN 3 THEN 'PAYMENT_SUCCESSFUL' + WHEN 4 THEN 'PAYMENT_FAILED' + WHEN 5 THEN 'CUSTOMER_CREATED' + WHEN 6 
THEN 'CUSTOMER_UPDATED' + WHEN 7 THEN 'PRODUCT_UPDATED' + WHEN 8 THEN 'INVENTORY_LOW' + WHEN 9 THEN 'SHIPPING_DISPATCHED' + WHEN 10 THEN 'REFUND_ISSUED' + END +); \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql new file mode 100644 index 0000000000..b2f173f011 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql @@ -0,0 +1,15 @@ +-- Zipfian distributions model real-world access patterns where: +-- A few values (popular IDs) are accessed frequently. +-- Many values are accessed rarely. +-- This is useful for simulating realistic workloads, like webhook processing where recent events are more frequently accessed. + +\set alpha 1.2 +\set min_id 1 +\set max_id 135000000 + +\set zipf_random_id random_zipfian(:min_id, :max_id, :alpha) + +SELECT * +FROM webhook.incoming_webhooks +WHERE id = (:zipf_random_id)::bigint +LIMIT 1; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_prefetch_webhook.sql b/test_runner/performance/large_synthetic_oltp/select_prefetch_webhook.sql new file mode 100644 index 0000000000..e0b0e52276 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_prefetch_webhook.sql @@ -0,0 +1,25 @@ +-- enforce a controlled number of getpages prefetch requests from a range of +-- 40 million first pages (320 GB) of a 500 GiB table +-- the table has 55 million pages + + +-- Zipfian distributions model real-world access patterns where: +-- A few values (popular IDs) are accessed frequently. +-- Many values are accessed rarely. 
+-- This is useful for simulating realistic workloads + +\set alpha 1.2 +\set min_page 1 +\set max_page 40000000 + +\set zipf_random_page random_zipfian(:min_page, :max_page, :alpha) + +-- Read 500 consecutive pages from a Zipfian-distributed random start page +-- This enforces PostgreSQL prefetching +WITH random_page AS ( + SELECT :zipf_random_page::int AS start_page +) +SELECT MAX(created_at) +FROM webhook.incoming_webhooks +WHERE ctid >= (SELECT format('(%s,1)', start_page)::tid FROM random_page) +AND ctid < (SELECT format('(%s,1)', start_page + 500)::tid FROM random_page); \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql new file mode 100644 index 0000000000..78a843bf0f --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql @@ -0,0 +1,9 @@ +-- select one of the most recent webhook records (created in the branch timeline during the bench run) +SELECT * +FROM webhook.incoming_webhooks +WHERE id = ( + SELECT (floor(random() * ( + (SELECT last_value FROM webhook.incoming_webhooks_id_seq) - 1350000001 + 1 + ) + 1350000001))::bigint +) +LIMIT 1; \ No newline at end of file diff --git a/test_runner/performance/test_compute_ctl_api.py b/test_runner/performance/test_compute_ctl_api.py new file mode 100644 index 0000000000..d6d0a84e8e --- /dev/null +++ b/test_runner/performance/test_compute_ctl_api.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import datetime + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.neon_fixtures import NeonEnv + + +@pytest.mark.timeout(120) +def test_compute_ctl_api_latencies( + neon_simple_env: NeonEnv, + zenbenchmark: NeonBenchmarker, +): + """ + Test compute_ctl HTTP API performance. Do simple GET requests + to catch any pathological degradations in the HTTP server. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + client = endpoint.http_client() + + NUM_REQUESTS = 10000 + + status_response_latency_us = [] + metrics_response_latency_us = [] + + for _i in range(NUM_REQUESTS): + start_time = datetime.datetime.now() + _ = client.status() + status_response_latency_us.append((datetime.datetime.now() - start_time).microseconds) + + start_time = datetime.datetime.now() + _ = client.metrics_json() + metrics_response_latency_us.append((datetime.datetime.now() - start_time).microseconds) + + status_response_latency_us = sorted(status_response_latency_us) + metrics_response_latency_us = sorted(metrics_response_latency_us) + + zenbenchmark.record( + "status_response_latency_p50_us", + status_response_latency_us[len(status_response_latency_us) // 2], + "μs", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "metrics_response_latency_p50_us", + metrics_response_latency_us[len(metrics_response_latency_us) // 2], + "μs", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "status_response_latency_p99_us", + status_response_latency_us[len(status_response_latency_us) * 99 // 100], + "μs", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "metrics_response_latency_p99_us", + metrics_response_latency_us[len(metrics_response_latency_us) * 99 // 100], + "μs", + MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/test_cumulative_statistics_persistence.py b/test_runner/performance/test_cumulative_statistics_persistence.py new file mode 100644 index 0000000000..061467bbad --- /dev/null +++ b/test_runner/performance/test_cumulative_statistics_persistence.py @@ -0,0 +1,221 @@ +import math # Add this import +import time +import traceback +from pathlib import Path + +import psycopg2 +import psycopg2.extras +import pytest +from fixtures.benchmark_fixture import NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_api import NeonAPI, 
connection_parameters_to_env +from fixtures.neon_fixtures import PgBin +from fixtures.pg_version import PgVersion + +vacuum_times_sql = """ +SELECT + relname AS table_name, + last_autovacuum, + last_autoanalyze +FROM + pg_stat_user_tables where relname = 'pgbench_accounts' +ORDER BY + last_autovacuum DESC, last_autoanalyze DESC +""" + + +def insert_first_chunk_and_verify_autovacuum_is_not_running( + cur, rows_to_insert, autovacuum_naptime +): + cur.execute(f""" + INSERT INTO pgbench_accounts (aid, bid, abalance, filler) + SELECT + aid, + (random() * 10)::int + 1 AS bid, + (random() * 10000)::int AS abalance, + 'filler text' AS filler + FROM generate_series(6800001, {6800001 + rows_to_insert - 1}) AS aid; + """) + assert cur.rowcount == rows_to_insert + for _ in range(5): + time.sleep(0.5 * autovacuum_naptime) + cur.execute(vacuum_times_sql) + row = cur.fetchall()[0] + log.info(f"last_autovacuum: {row[1]}, last_autoanalyze: {row[2]}") + assert row[1] is None + + +def insert_second_chunk_and_verify_autovacuum_is_now_running( + cur, rows_to_insert, autovacuum_naptime +): + cur.execute(f""" + INSERT INTO pgbench_accounts (aid, bid, abalance, filler) + SELECT + aid, + (random() * 10)::int + 1 AS bid, + (random() * 10000)::int AS abalance, + 'filler text' AS filler + FROM generate_series({6800001 + rows_to_insert}, {6800001 + rows_to_insert * 2 - 1}) AS aid; + """) + assert cur.rowcount == rows_to_insert + for _ in range(5): + time.sleep(0.5 * autovacuum_naptime) + cur.execute(vacuum_times_sql) + row = cur.fetchall()[0] + log.info(f"last_autovacuum: {row[1]}, last_autoanalyze: {row[2]}") + assert row[1] is not None + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(60 * 60) +def test_cumulative_statistics_persistence( + pg_bin: PgBin, + test_output_dir: Path, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Verifies that the cumulative statistics are correctly persisted across restarts. 
+ Cumulative statistics are important to persist across restarts because they are used + when auto-vacuum and auto-analyze trigger conditions are met. + The test performs the following steps: + - Seed a new project using pgbench + - insert tuples that by themselves are not enough to trigger auto-vacuum + - suspend the endpoint + - resume the endpoint + - insert additional tuples that by themselves are not enough to trigger auto-vacuum but in combination with the previous tuples are + - verify that autovacuum is triggered by the combination of tuples inserted before and after endpoint suspension + """ + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + endpoint_id = project["endpoints"][0]["id"] + region_id = project["project"]["region_id"] + log.info(f"Created project {project_id} with endpoint {endpoint_id} in region {region_id}") + error_occurred = False + try: + connstr = project["connection_uris"][0]["connection_uri"] + env = connection_parameters_to_env(project["connection_uris"][0]["connection_parameters"]) + # seed about 1 GiB of data into pgbench_accounts + pg_bin.run_capture(["pgbench", "-i", "-s68"], env=env) + + # assert rows in pgbench_accounts is 6800000 rows + conn = psycopg2.connect(connstr) + conn.autocommit = True + with conn.cursor() as cur: + # assert rows in pgbench_accounts is 6800000 rows + cur.execute("select count(*) from pgbench_accounts") + row_count = cur.fetchall()[0][0] + assert row_count == 6800000 + + # verify n_tup_ins, n_live_tup, vacuum_count, analyze_count (manual vacuum and analyze) + cur.execute( + "select n_tup_ins, vacuum_count,analyze_count from pg_stat_user_tables where relname = 'pgbench_accounts'" + ) + row = cur.fetchall()[0] + assert row[0] == 6800000 # n_tup_ins + assert row[1] == 1 # vacuum_count + assert row[2] == 1 # analyze_count + + # retrieve some GUCs (postgres settings) relevant to autovacuum + cur.execute( + "SELECT setting::int AS
autovacuum_naptime FROM pg_settings WHERE name = 'autovacuum_naptime'" + ) + autovacuum_naptime = cur.fetchall()[0][0] + assert autovacuum_naptime < 300 and autovacuum_naptime > 0 + cur.execute( + "SELECT setting::float AS autovacuum_vacuum_insert_scale_factor FROM pg_settings WHERE name = 'autovacuum_vacuum_insert_scale_factor'" + ) + autovacuum_vacuum_insert_scale_factor = cur.fetchall()[0][0] + assert ( + autovacuum_vacuum_insert_scale_factor > 0.05 + and autovacuum_vacuum_insert_scale_factor < 1.0 + ) + cur.execute( + "SELECT setting::int AS autovacuum_vacuum_insert_threshold FROM pg_settings WHERE name = 'autovacuum_vacuum_insert_threshold'" + ) + autovacuum_vacuum_insert_threshold = cur.fetchall()[0][0] + cur.execute( + "SELECT setting::int AS pgstat_file_size_limit FROM pg_settings WHERE name = 'neon.pgstat_file_size_limit'" + ) + pgstat_file_size_limit = cur.fetchall()[0][0] + assert pgstat_file_size_limit > 10 * 1024 # at least 10 MB + + # insert rows that by itself are not enough to trigger auto-vacuum + # vacuum insert threshold = vacuum base insert threshold + vacuum insert scale factor * number of tuples + # https://www.postgresql.org/docs/17/routine-vacuuming.html + rows_to_insert = int( + math.ceil( + autovacuum_vacuum_insert_threshold / 2 + + row_count * autovacuum_vacuum_insert_scale_factor * 0.6 + ) + ) + + log.info( + f"autovacuum_vacuum_insert_scale_factor: {autovacuum_vacuum_insert_scale_factor}, autovacuum_vacuum_insert_threshold: {autovacuum_vacuum_insert_threshold}, row_count: {row_count}" + ) + log.info( + f"Inserting {rows_to_insert} rows, which is below the 'vacuum insert threshold'" + ) + + insert_first_chunk_and_verify_autovacuum_is_not_running( + cur, rows_to_insert, autovacuum_naptime + ) + + conn.close() + + # suspend the endpoint + log.info(f"Suspending endpoint {endpoint_id}") + neon_api.suspend_endpoint(project_id, endpoint_id) + neon_api.wait_for_operation_to_finish(project_id) + time.sleep(60) # give some time in between suspend 
and resume + + # resume the endpoint + log.info(f"Starting endpoint {endpoint_id}") + neon_api.start_endpoint(project_id, endpoint_id) + neon_api.wait_for_operation_to_finish(project_id) + + conn = psycopg2.connect(connstr) + conn.autocommit = True + with conn.cursor() as cur: + # insert additional rows that by themselves are not enough to trigger auto-vacuum, but in combination + # with the previous rows inserted before the suspension are + log.info( + f"Inserting another {rows_to_insert} rows, which is below the 'vacuum insert threshold'" + ) + insert_second_chunk_and_verify_autovacuum_is_now_running( + cur, rows_to_insert, autovacuum_naptime + ) + + # verify estimated number of tuples in pgbench_accounts is within 6800000 + inserted rows +- 2 % + cur.execute( + "select reltuples::bigint from pg_class where relkind = 'r' and relname = 'pgbench_accounts'" + ) + reltuples = cur.fetchall()[0][0] + assert reltuples > 6800000 + rows_to_insert * 2 * 0.98 + assert reltuples < 6800000 + rows_to_insert * 2 * 1.02 + + # verify exact number of pgbench_accounts rows (computed row_count) + cur.execute("select count(*) from pgbench_accounts") + row_count = cur.fetchall()[0][0] + assert row_count == 6800000 + rows_to_insert * 2 + + # verify n_tup_ins, n_live_tup, vacuum_count, analyze_count (manual vacuum and analyze) + cur.execute( + "select n_tup_ins, vacuum_count,analyze_count from pg_stat_user_tables where relname = 'pgbench_accounts'" + ) + row = cur.fetchall()[0] + assert row[0] == 6800000 + rows_to_insert * 2 + assert row[1] == 1 + assert row[2] == 1 + + conn.close() + + except Exception as e: + error_occurred = True + log.error(f"Caught exception: {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred # Fail the test if an error occurred + neon_api.delete_project(project_id) diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index acb7b56fd0..7c9e9f47c8 100644 ---
a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -69,6 +69,9 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma env.create_branch("child") branch_created += 1 + # Ensure L0 layers are compacted so that gc-compaction doesn't get preempted. + client.timeline_checkpoint(tenant_id, timeline_id, force_l0_compaction=True) + max_num_of_deltas_above_image = 0 max_total_num_of_deltas = 0 for key_range in client.perf_info(tenant_id, timeline_id): diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index fdc56cc496..807ed522e1 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -23,6 +23,25 @@ if TYPE_CHECKING: from psycopg2.extensions import connection, cursor +""" +These benchmarks stress test logical replication within Neon. In order to run +them locally, they require setting up some infrastructure. See +https://docs.neon.build/compute/logical_replication_benchmarks.html for how to +do that. After setting that up, run the following shell commands. 
+ +# These are the project IDs set up for the purposes of running these benchmarks +export BENCHMARK_PROJECT_ID_PUB= +export BENCHMARK_PROJECT_ID_SUB= + +# See https://neon.tech/docs/manage/api-keys +export NEON_API_KEY= + +# Fiddling with the --timeout parameter may be required depending on the +# performance of the benchmark +pytest -m remote_cluster 'test_runner/performance/test_logical_replication.py' +""" + + @pytest.mark.timeout(1000) def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg: VanillaPostgres): env = neon_simple_env diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py index 0ee0efe8b9..e2f0a79018 100644 --- a/test_runner/performance/test_perf_many_relations.py +++ b/test_runner/performance/test_perf_many_relations.py @@ -2,8 +2,10 @@ import os from pathlib import Path import pytest +from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.compare_fixtures import RemoteCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder def get_num_relations(default: int = 1000) -> list[int]: @@ -64,3 +66,59 @@ def test_perf_many_relations(remote_compare: RemoteCompare, num_relations: int): env.pg_bin.run_capture( ["psql", env.pg.connstr(options="-cstatement_timeout=1000s "), "-c", sql] ) + + +def test_perf_simple_many_relations_reldir_v2( + neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + """ + Test creating many relations in a single database.
+ """ + env = neon_env_builder.init_start(initial_tenant_conf={"rel_size_v2_enabled": "true"}) + ep = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers=1000MB", + "max_locks_per_transaction=16384", + ], + ) + + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + != "legacy" + ) + + n = 100000 + step = 5000 + # Create many relations + log.info(f"Creating {n} relations...") + begin = 0 + with zenbenchmark.record_duration("create_first_relation"): + ep.safe_psql("CREATE TABLE IF NOT EXISTS table_begin (id SERIAL PRIMARY KEY, data TEXT)") + with zenbenchmark.record_duration("create_many_relations"): + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; + END LOOP; + END $$; + """, + "COMMIT", + ] + ) + begin = end + if begin >= n: + break + with zenbenchmark.record_duration("create_last_relation"): + ep.safe_psql(f"CREATE TABLE IF NOT EXISTS table_{begin} (id SERIAL PRIMARY KEY, data TEXT)") diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py new file mode 100644 index 0000000000..842e6a904b --- /dev/null +++ b/test_runner/performance/test_perf_oltp_large_tenant.py @@ -0,0 +1,164 @@ +from __future__ import annotations + +import os +import timeit +from contextlib import closing +from pathlib import Path + +import pytest +from fixtures.benchmark_fixture import PgBenchRunResult +from fixtures.compare_fixtures import PgCompare +from fixtures.log_helper import log + +from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp + + +def get_custom_scripts( + default: str = "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 
select_recent_webhook.sql@4", +) -> list[str]: + # We parametrize each run with the custom scripts to run and their weights. + # The custom scripts and their weights are passed through TEST_PGBENCH_CUSTOM_SCRIPTS env variable. + # Delimit the custom scripts for one run by spaces and for different runs by commas, for example: + # "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4,insert_webhooks.sql@8 select_any_webhook_with_skew.sql@2" + # Databases/branches are pre-created and passed through BENCHMARK_CONNSTR env variable. + scripts = os.getenv("TEST_PGBENCH_CUSTOM_SCRIPTS", default=str(default)) + rv = [] + for s in scripts.split(","): + rv.append(s) + return rv + + +def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int): + password = env.pg.default_options.get("password", None) + options = env.pg.default_options.get("options", "") + # drop password from the connection string by passing password=None and set password separately + connstr = env.pg.connstr(password=None, options=options) + # if connstr does not contain pooler we can set statement_timeout to 0 + if "pooler" not in connstr: + options = "-cstatement_timeout=0 " + env.pg.default_options.get("options", "") + connstr = env.pg.connstr(password=None, options=options) + + script_args = [ + "pgbench", + "-n", # no explicit vacuum before the test - we want to rely on auto-vacuum + "-M", + "prepared", + "--client=500", + "--jobs=100", + f"-T{duration}", + "-P60", # progress every minute + "--progress-timestamp", + ] + for script in custom_scripts.split(): + script_args.extend(["-f", f"test_runner/performance/large_synthetic_oltp/{script}"]) + script_args.append(connstr) + + run_pgbench( + env, + "custom-scripts", + script_args, + password=password, + ) + + +def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): + environ: dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + + run_start_timestamp = utc_now_timestamp() + t0 = 
timeit.default_timer() + out = env.pg_bin.run_capture(cmdline, env=environ) + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + env.flush() + + stdout = Path(f"{out}.stdout").read_text() + + res = PgBenchRunResult.parse_from_stdout( + stdout=stdout, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + env.zenbenchmark.record_pg_bench_result(prefix, res) + + +def run_database_maintenance(env: PgCompare): + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + log.info("start vacuum analyze transaction.transaction") + with env.zenbenchmark.record_duration("vacuum_analyze"): + cur.execute("SET statement_timeout = 0;") + cur.execute("SET max_parallel_maintenance_workers = 7;") + cur.execute("SET maintenance_work_mem = '10GB';") + cur.execute("vacuum analyze transaction.transaction;") + log.info("finished vacuum analyze transaction.transaction") + + # recover previously failed or canceled re-indexing + cur.execute( + """ + DO $$ + DECLARE + invalid_index TEXT; + BEGIN + FOR invalid_index IN + SELECT c.relname + FROM pg_class c + JOIN pg_index i ON i.indexrelid = c.oid + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = 'transaction' + AND i.indisvalid = FALSE + AND c.relname LIKE '%_ccnew%' + LOOP + EXECUTE 'DROP INDEX IF EXISTS transaction.' 
|| invalid_index; + END LOOP; + END $$; + """ + ) + # also recover failed or canceled re-indexing on toast part of table + cur.execute( + """ + DO $$ + DECLARE + invalid_index TEXT; + BEGIN + FOR invalid_index IN + SELECT c.relname + FROM pg_class c + JOIN pg_index i ON i.indexrelid = c.oid + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = 'pg_toast' + AND i.indisvalid = FALSE + AND c.relname LIKE '%_ccnew%' + AND i.indrelid = ( + SELECT reltoastrelid FROM pg_class + WHERE relname = 'transaction' + AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'transaction') + ) + LOOP + EXECUTE 'DROP INDEX IF EXISTS pg_toast.' || invalid_index; + END LOOP; + END $$; + """ + ) + + log.info("start REINDEX TABLE CONCURRENTLY transaction.transaction") + with env.zenbenchmark.record_duration("reindex concurrently"): + cur.execute("REINDEX TABLE CONCURRENTLY transaction.transaction;") + log.info("finished REINDEX TABLE CONCURRENTLY transaction.transaction") + + +@pytest.mark.parametrize("custom_scripts", get_custom_scripts()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_perf_oltp_large_tenant_pgbench( + remote_compare: PgCompare, custom_scripts: str, duration: int +): + run_test_pgbench(remote_compare, custom_scripts, duration) + + +@pytest.mark.remote_cluster +def test_perf_oltp_large_tenant_maintenance(remote_compare: PgCompare): + # run analyze, vacuum, re-index after the test and measure and report its duration + run_database_maintenance(remote_compare) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index d45db28c78..e897d53cc8 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -16,6 +16,7 @@ from fixtures.neon_fixtures import ( NeonPageserver, PageserverAvailability, PageserverSchedulingPolicy, + 
StorageControllerMigrationConfig, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pg_version import PgVersion @@ -82,9 +83,7 @@ def test_storage_controller_many_tenants( "max_offline": "30s", "max_warming_up": "300s", } - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api AZS = ["alpha", "bravo", "charlie"] @@ -362,7 +361,10 @@ def test_storage_controller_many_tenants( dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] f = executor.submit( - env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id + env.storage_controller.tenant_shard_migrate, + tenant_shard_id, + dest_ps_id, + StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), ) elif op == Operation.TENANT_PASSTHROUGH: # A passthrough read to shard zero diff --git a/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json deleted file mode 100644 index af49dfa0c0..0000000000 --- a/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "public_extensions": [ - "anon", - "pg_buffercache" - ], - "library_index": { - "anon": "anon", - "pg_buffercache": "pg_buffercache" - }, - "extension_data": { - "pg_buffercache": { - "control_data": { - "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true" - }, - "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst" - }, - "anon": { - "control_data": { - "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = 
false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" - }, - "archive_path": "5670669815/v14/extensions/anon.tar.zst" - } - } -} \ No newline at end of file diff --git a/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst b/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst deleted file mode 100644 index 5c17630109..0000000000 Binary files a/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst and /dev/null differ diff --git a/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst b/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst deleted file mode 100644 index 69648a2f1a..0000000000 Binary files a/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst and /dev/null differ diff --git a/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json deleted file mode 100644 index fd0d1edc3c..0000000000 --- a/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "public_extensions": [ - "anon" - ], - "library_index": { - "anon": "anon" - }, - "extension_data": { - "anon": { - "control_data": { - "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" - }, - "archive_path": "5670669815/v15/extensions/anon.tar.zst" - } - } -} - diff --git a/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst b/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst deleted file mode 100644 index ea7034578f..0000000000 Binary files 
a/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst and /dev/null differ diff --git a/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json deleted file mode 100644 index 1157e0d032..0000000000 --- a/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "public_extensions": [], - "library_index": { - "TODO": "We still need PG16 extensions" - }, - "extension_data": {} -} \ No newline at end of file diff --git a/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json deleted file mode 100644 index 7990b2c3a2..0000000000 --- a/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "public_extensions": [], - "library_index": { - "TODO": "We still need PG17 extensions" - }, - "extension_data": {} -} \ No newline at end of file diff --git a/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0--1.1.sql b/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0--1.1.sql new file mode 100644 index 0000000000..1fb183dcae --- /dev/null +++ b/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0--1.1.sql @@ -0,0 +1,10 @@ +\echo Use "ALTER EXTENSION test_extension UPDATE TO '1.1'" to load this file. 
\quit + +CREATE FUNCTION test_extension.fun_fact() +RETURNS void +IMMUTABLE LEAKPROOF PARALLEL SAFE +AS $$ +BEGIN + RAISE NOTICE 'Neon has a melting point of -246.08 C'; +END; +$$ LANGUAGE 'plpgsql'; diff --git a/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0.sql b/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0.sql new file mode 100644 index 0000000000..b51e3ed19f --- /dev/null +++ b/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0.sql @@ -0,0 +1,12 @@ +\echo Use "CREATE EXTENSION test_extension" to load this file. \quit + +CREATE SCHEMA test_extension; + +CREATE FUNCTION test_extension.motd() +RETURNS void +IMMUTABLE LEAKPROOF PARALLEL SAFE +AS $$ +BEGIN + RAISE NOTICE 'Have a great day'; +END; +$$ LANGUAGE 'plpgsql'; diff --git a/test_runner/regress/data/test_remote_extensions/test_extension/test_extension.control b/test_runner/regress/data/test_remote_extensions/test_extension/test_extension.control new file mode 100644 index 0000000000..826f643daf --- /dev/null +++ b/test_runner/regress/data/test_remote_extensions/test_extension/test_extension.control @@ -0,0 +1 @@ +comment = 'Test extension' diff --git a/test_runner/regress/data/test_signed_char.out b/test_runner/regress/data/test_signed_char.out new file mode 100644 index 0000000000..a68876e383 --- /dev/null +++ b/test_runner/regress/data/test_signed_char.out @@ -0,0 +1 @@ 
+0000000094010815f81f042000000000b89f8000909f5000689f5000489f4000309f3000189f3000009f3000e89e3000d09e3000b89e3000a09e3000889e3000709e3000309e8000189e3000009e3000e89d3000d09d3000b89d3000a09d3000889d3000709d3000589d3000409d3000289d3000109d3000f89c3000e09c3000c89c3000b09c3000989c3000809c3000689c3000509c3000389c3000209c3000089c3000f09b3000d89b3000c09b3000a89b3000909b3000789b3000609b3000489b3000309b3000189b3000009b3000e89a3000d09a3000b89a3000a09a3000889a3000489a8000309a3000189a3000009a3000e8993000d0993000b8993000a09930008899300070993000589930004099300000998000e8983000d0983000b8983000a0983000889830007098300058983000409830002898300010983000f8973000b8978000a09730008897300070973000589730004097300028973000e8968000a89680006896800028968000e8958000a895800090953000509580003895300020953000089530000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000801000010018004c198900000000000000000029000000008010000100180049787f000000000000000000290000000080100001001800727c700000000000000000002900000000801000280040002076620000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010000100180020726200000000000000000029000000008010002800400076623900000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100028004000623938000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000280040003938370000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010002800400038373600000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100028004000373635000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000280040003635340000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010000100180020303400000000000000000028000000008010000100180020393300000000000000000027000000008010000100180020383300000000000000000026000000008010000100180
0203733000000000000000000250000000080100001001800203633000000000000000000240000000080100001001800203533000000000000000000230000000080100028004000353433000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002034330000000000000000002200000000801000010018002033330000000000000000002100000000801000010018002032330000000000000000002000000000801000010018002031330000000000000000001f00000000801000010018002030330000000000000000001e00000000801000010018002039320000000000000000001d00000000801000010018002038320000000000000000001c00000000801000010018002037320000000000000000001b00000000801000010018002036320000000000000000001a0000000080100001001800203532000000000000000000190000000080100001001800203432000000000000000000180000000080100028004000343332000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002033320000000000000000001700000000801000010018002032320000000000000000001600000000801000010018002031320000000000000000001500000000801000010018002030320000000000000000001400000000801000010018002039310000000000000000001300000000801000010018002038310000000000000000001200000000801000010018002037310000000000000000001100000000801000010018002036310000000000000000001000000000801000010018002035310000000000000000000f00000000801000010018002034310000000000000000000e00000000801000010018002033310000000000000000000d0000000080100028004000333231000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002032310000000000000000000c00000000801000010018002031310000000000000000000b00000000801000010018002030310000000000000000000a00000000801000010018002039200000000000000000000900000000801000010018002038200000000000000000000800000000801000010018002037200000000000000000000700000000801000010018002036200000000000000000000600000000801000010018002035200000000000000000000500000000801000010018003034200000000000000000002800000
000801000010018002034200000000000000000000400000000801000010018003933200000000000000000002700000000801000010018003833200000000000000000002600000000801000010018003733200000000000000000002500000000801000010018003633200000000000000000002400000000801000010018003533200000000000000000002300000000801000010018003433200000000000000000002200000000801000010018003333200000000000000000002100000000801000010018003233200000000000000000002000000000801000010018003133200000000000000000001f00000000801000010018003033200000000000000000001e00000000801000010018002033200000000000000000000300000000801000010018003932200000000000000000001d00000000801000010018003832200000000000000000001c00000000801000010018003732200000000000000000001b00000000801000010018003632200000000000000000001a00000000801000010018003532200000000000000000001900000000801000010018003432200000000000000000001800000000801000010018003332200000000000000000001700000000801000010018003232200000000000000000001600000000801000010018003132200000000000000000001500000000801000010018003032200000000000000000001400000000801000010018002032200000000000000000000200000000801000010018003931200000000000000000001300000000801000010018003831200000000000000000001200000000801000010018003731200000000000000000001100000000801000010018003631200000000000000000001000000000801000010018003531200000000000000000000f00000000801000010018003431200000000000000000000e00000000801000010018003331200000000000000000000d0000000080100028004000323120000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018003131200000000000000000000b00000000801000010018003031200000000000000000000a00000000801000010018002031200000000000000000000100000000801000010018006220200000000000000000002900000000801000010018003920200000000000000000000900000000801000010018003820200000000000000000000800000000801000010018003720200000000000000000000700000000801000010018003620200000000000000000000600000000801000010018003520200000000000000000000500000
00080100002002000342020000000000000000000040001002400000000000000008010000b00280033202000000000000000000003000a001b010101010101010101000000000000008010000b00280032202000000000000000000002000a001201010101010101010100000000000000801000280040003120200000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100ffffffff00000200 \ No newline at end of file diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 07600dd911..b56fcd3500 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -144,7 +144,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_l0_semaphore": False, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, - "l0_flush_wait_upload": False, + "l0_flush_wait_upload": True, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index c0c9537421..bfc5cb174e 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -7,6 +7,7 @@ import psycopg2.errors import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import USE_LFC @pytest.mark.timeout(600) @@ -80,3 +81,193 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): # do a graceful shutdown which would had caught the allowed_errors before # https://github.com/neondatabase/neon/pull/8632 env.pageserver.stop() + + +def test_compute_pageserver_hung_connections(neon_env_builder: NeonEnvBuilder): + """ + Test timeouts in waiting for response to pageserver request + """ + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*slow GetPage.*") + pageserver_http = env.pageserver.http_client() + endpoint = 
env.endpoints.create_start( + "main", + tenant_id=env.initial_tenant, + config_lines=["autovacuum = off"], + ) + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + # Print the backend PID so that it can be compared with the logs easily + cur.execute("SELECT pg_backend_pid()") + row = cur.fetchone() + assert row is not None + log.info(f"running test workload in backend PID {row[0]}") + + def run_workload(duration: float): + end_time = time.time() + duration + times_executed = 0 + while time.time() < end_time: + if random.random() < 0.5: + cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + else: + cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + cur.fetchall() + times_executed += 1 + log.info(f"Workload executed {times_executed} times") + assert times_executed > 0 + + ## Test short connection hiccups + ## + ## This is to exercise the logging timeout. 
+ log.info("running workload with log timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '500ms'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) + run_workload(20) + + # check that the message was logged + assert endpoint.log_contains("no response received from pageserver for .* s, still waiting") + assert endpoint.log_contains("received response from pageserver after .* s") + + ## Test connections that are hung for longer + ## + ## This exercises the disconnect timeout. We'll disconnect and + ## reconnect after 500 ms. + log.info("running workload with disconnect timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '250ms'") + cur.execute("SET neon.pageserver_response_disconnect_timeout = '500ms'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) + run_workload(15) + + assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() + + +def test_compute_pageserver_statement_timeout(neon_env_builder: NeonEnvBuilder): + """ + Test statement_timeout while waiting for response to pageserver request + """ + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*slow GetPage.*") + pageserver_http = env.pageserver.http_client() + + # Make sure the shared_buffers and LFC are tiny, to ensure the queries + # hit the storage. Disable autovacuum to make the test more deterministic. + config_lines = [ + "shared_buffers='512kB'", + "autovacuum = off", + ] + if USE_LFC: + config_lines = ["neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB"] + endpoint = env.endpoints.create_start( + "main", + tenant_id=env.initial_tenant, + config_lines=config_lines, + ) + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Disable parallel query. 
Parallel workers open their own pageserver connections, + # which messes up the test logic. + cur.execute("SET max_parallel_workers_per_gather=0") + cur.execute("SET effective_io_concurrency=0") + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + ## Run a query until the compute->pageserver connection hits the failpoint and + ## get stuck. This tests that the statement_timeout is obeyed while waiting on a + ## GetPage request. 
+ log.info("running workload with statement_timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '2000ms'") + cur.execute("SET neon.pageserver_response_disconnect_timeout = '30000ms'") + cur.execute("SET statement_timeout='10s'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%return(60000)")) + + start_time = time.time() + with pytest.raises(psycopg2.errors.QueryCanceled): + cur.execute("SELECT count(*) FROM foo") + cur.fetchall() + log.info("Statement timeout reached") + end_time = time.time() + # Verify that the statement_timeout canceled the query before + # neon.pageserver_response_disconnect_timeout expired + assert end_time - start_time < 40 + times_canceled = 1 + + # Should not have disconnected yet + assert not endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # Clear the failpoint. This doesn't affect the connection that already hit it. It + # will keep waiting. But subsequent connections will work normally. + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "off")) + + # If we keep retrying, we should eventually succeed. (This tests that the + # neon.pageserver_response_disconnect_timeout is not reset on query + # cancellation.) 
+ while times_canceled < 10: + try: + cur.execute("SELECT count(*) FROM foo") + cur.fetchall() + log.info("Statement succeeded") + break + except psycopg2.errors.QueryCanceled: + log.info("Statement timed out, retrying") + times_canceled += 1 + assert times_canceled > 1 and times_canceled < 10 + + assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 34e4e994cb..85d0cfbf1d 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -19,6 +19,7 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException +from requests.exceptions import RetryError # Test branch creation @@ -180,7 +181,6 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.endpoints.create_start( initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 ) - ps_http.configure_failpoints(("before-upload-index-pausable", "off")) finally: env.pageserver.stop(immediate=True) @@ -221,10 +221,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises( - PageserverApiException, - match="Cannot branch off the timeline that's not present in pageserver", - ): + with pytest.raises(RetryError, match="too many 503 error responses"): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 41aa5b47ca..5526b783d5 100644 --- a/test_runner/regress/test_change_pageserver.py +++ 
b/test_runner/regress/test_change_pageserver.py @@ -23,8 +23,8 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver): ) env = neon_env_builder.init_start() - neon_env_builder.control_plane_compute_hook_api = ( - f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + neon_env_builder.control_plane_hooks_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/" ) def ignore_notify(request: Request): diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index c091cd0869..c8cce7a4e7 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -319,8 +319,12 @@ def test_pageserver_gc_compaction_idempotent( }, ) wait_until(compaction_finished, timeout=60) + workload.validate(env.pageserver.id) + # Ensure all data are uploaded so that the duplicated layer gets into index_part.json + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_flushed=True) if compaction_mode == "after_restart": env.pageserver.restart(True) + workload.validate(env.pageserver.id) ps_http.timeline_gc( tenant_id, timeline_id, None ) # Force refresh gc info to have gc_cutoff generated @@ -335,6 +339,7 @@ def test_pageserver_gc_compaction_idempotent( "sub_compaction_max_job_size_mb": 16, }, ) + workload.validate(env.pageserver.id) wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) @@ -466,6 +471,95 @@ def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): ps_http.timeline_gc(tenant_id, timeline_id, None) +@skip_in_debug_build("only run with release build") +def test_pageserver_gc_compaction_trigger(neon_env_builder: NeonEnvBuilder): + SMOKE_CONF = { + # Run both gc and gc-compaction. 
+ "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": f"{1024 * 16}", + "lsn_lease_length": "0s", + "gc_compaction_enabled": "true", + "gc_compaction_initial_threshold_kb": "16", + "gc_compaction_ratio_percent": "50", + # Do not generate image layers with create_image_layers + "image_layer_creation_check_threshold": "100", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 20 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=True) + wait_until(compaction_finished, timeout=60) + workload.validate(env.pageserver.id) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + +def test_pageserver_small_tenant_compaction(neon_env_builder: NeonEnvBuilder): + """ + Create a small tenant that rarely needs compaction and ensure that everything works. + """ + SMOKE_CONF = { + # Run both gc and gc-compaction. 
+ "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": 1024, + "lsn_lease_length": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(10000, env.pageserver.id) + + for _ in range(100): + workload.churn_rows(10, env.pageserver.id, upload=False, ingest=False) + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_compact(tenant_id, timeline_id) + ps_http.timeline_gc(tenant_id, timeline_id, None) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + # Stripe sizes in number of pages. TINY_STRIPES = 16 LARGE_STRIPES = 32768 diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 3a08671bbf..0d3618d1b8 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -5,34 +5,74 @@ import logging import requests from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +TEST_ROLE_NAMES = [ + {"name": "neondb_owner"}, + {"name": "role with spaces"}, + {"name": "role with%20spaces "}, + {"name": "role with whitespaces "}, + {"name": "injective role with spaces'; SELECT pg_sleep(1000);"}, + {"name": "role with #pound-sign and &ersands=true"}, + {"name": "role with emoji 🌍"}, + {"name": "role \";with ';injections $$ $x$ $ %I !/\\&#@"}, + {"name": '"role in double quotes"'}, + {"name": "'role in single quotes'"}, + {"name": "role$"}, + {"name": "role$$"}, + {"name": "role$x$"}, +] + TEST_DB_NAMES = [ { "name": "neondb", - "owner": "cloud_admin", + "owner": "neondb_owner", }, { "name": "db with spaces", - "owner": 
"cloud_admin", + "owner": "role with spaces", }, { "name": "db with%20spaces ", - "owner": "cloud_admin", + "owner": "role with%20spaces ", }, { "name": "db with whitespaces ", - "owner": "cloud_admin", + "owner": "role with whitespaces ", }, { - "name": "injective db with spaces'; SELECT pg_sleep(10);", - "owner": "cloud_admin", + "name": "injective db with spaces'; SELECT pg_sleep(1000);", + "owner": "injective role with spaces'; SELECT pg_sleep(1000);", }, { "name": "db with #pound-sign and &ersands=true", - "owner": "cloud_admin", + "owner": "role with #pound-sign and &ersands=true", }, { "name": "db with emoji 🌍", - "owner": "cloud_admin", + "owner": "role with emoji 🌍", + }, + { + "name": "db \";with ';injections $$ $x$ $ %I !/\\&#@", + "owner": "role \";with ';injections $$ $x$ $ %I !/\\&#@", + }, + { + "name": '"db in double quotes"', + "owner": '"role in double quotes"', + }, + { + "name": "'db in single quotes'", + "owner": "'role in single quotes'", + }, + { + "name": "db name$", + "owner": "role$", + }, + { + "name": "db name$$", + "owner": "role$$", + }, + { + "name": "db name$x$", + "owner": "role$x$", }, ] @@ -52,6 +92,7 @@ def test_compute_catalog(neon_simple_env: NeonEnv): **{ "skip_pg_catalog_updates": False, "cluster": { + "roles": TEST_ROLE_NAMES, "databases": TEST_DB_NAMES, }, } @@ -99,10 +140,10 @@ def test_compute_catalog(neon_simple_env: NeonEnv): ), f"Expected 404 status code, but got {e.response.status_code}" -def test_compute_create_databases(neon_simple_env: NeonEnv): +def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): """ - Test that compute_ctl can create and work with databases with special - characters (whitespaces, %, tabs, etc.) in the name. + Test that compute_ctl can create and work with databases and roles + with special characters (whitespaces, %, tabs, etc.) in the name. 
""" env = neon_simple_env @@ -116,6 +157,7 @@ def test_compute_create_databases(neon_simple_env: NeonEnv): **{ "skip_pg_catalog_updates": False, "cluster": { + "roles": TEST_ROLE_NAMES, "databases": TEST_DB_NAMES, }, } @@ -139,6 +181,43 @@ def test_compute_create_databases(neon_simple_env: NeonEnv): assert len(curr_db) == 1 assert curr_db[0] == db["name"] + for role in TEST_ROLE_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = %s", (role["name"],)) + catalog_role = cursor.fetchone() + assert catalog_role is not None + assert catalog_role[0] == role["name"] + + delta_operations = [] + for db in TEST_DB_NAMES: + delta_operations.append({"action": "delete_db", "name": db["name"]}) + for role in TEST_ROLE_NAMES: + delta_operations.append({"action": "delete_role", "name": role["name"]}) + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [], + "databases": [], + }, + "delta_operations": delta_operations, + } + ) + endpoint.reconfigure() + + for db in TEST_DB_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (db["name"],)) + catalog_db = cursor.fetchone() + assert catalog_db is None + + for role in TEST_ROLE_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = %s", (role["name"],)) + catalog_role = cursor.fetchone() + assert catalog_role is None + def test_dropdb_with_subscription(neon_simple_env: NeonEnv): """ @@ -150,17 +229,19 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # stuff into the spec.json file. 
endpoint = env.endpoints.create_start("main") + SUB_DB_NAME = "';subscriber_db $$ $x$ $;" + PUB_DB_NAME = "publisher_db" TEST_DB_NAMES = [ { "name": "neondb", "owner": "cloud_admin", }, { - "name": "subscriber_db", + "name": SUB_DB_NAME, "owner": "cloud_admin", }, { - "name": "publisher_db", + "name": PUB_DB_NAME, "owner": "cloud_admin", }, ] @@ -177,47 +258,47 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): ) endpoint.reconfigure() - # connect to the publisher_db and create a publication - with endpoint.cursor(dbname="publisher_db") as cursor: + # Connect to the PUB_DB_NAME and create a publication + with endpoint.cursor(dbname=PUB_DB_NAME) as cursor: cursor.execute("CREATE PUBLICATION mypub FOR ALL TABLES") cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');") cursor.execute("CREATE TABLE t(a int)") cursor.execute("INSERT INTO t VALUES (1)") cursor.execute("CHECKPOINT") - # connect to the subscriber_db and create a subscription - # Note that we need to create subscription with - connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''") - with endpoint.cursor(dbname="subscriber_db") as cursor: + # Connect to the SUB_DB_NAME and create a subscription + # Note that we need to create subscription with the following connstr: + connstr = endpoint.connstr(dbname=PUB_DB_NAME).replace("'", "''") + with endpoint.cursor(dbname=SUB_DB_NAME) as cursor: cursor.execute("CREATE TABLE t(a int)") cursor.execute( - f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) " + f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) " ) - # wait for the subscription to be active + # Wait for the subscription to be active logical_replication_sync( endpoint, endpoint, "mysub", - sub_dbname="subscriber_db", - pub_dbname="publisher_db", + sub_dbname=SUB_DB_NAME, + pub_dbname=PUB_DB_NAME, ) # Check that replication is working - with 
endpoint.cursor(dbname="subscriber_db") as cursor: + with endpoint.cursor(dbname=SUB_DB_NAME) as cursor: cursor.execute("SELECT * FROM t") rows = cursor.fetchall() assert len(rows) == 1 assert rows[0][0] == 1 - # drop the subscriber_db from the list + # Drop the SUB_DB_NAME from the list TEST_DB_NAMES_NEW = [ { "name": "neondb", "owner": "cloud_admin", }, { - "name": "publisher_db", + "name": PUB_DB_NAME, "owner": "cloud_admin", }, ] @@ -230,7 +311,7 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): "databases": TEST_DB_NAMES_NEW, }, "delta_operations": [ - {"action": "delete_db", "name": "subscriber_db"}, + {"action": "delete_db", "name": SUB_DB_NAME}, # also test the case when we try to delete a non-existent database # shouldn't happen in normal operation, # but can occur when failed operations are retried @@ -239,32 +320,35 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): } ) - logging.info("Reconfiguring the endpoint to drop the subscriber_db") + logging.info(f"Reconfiguring the endpoint to drop the {SUB_DB_NAME} database") endpoint.reconfigure() - # Check that the subscriber_db is dropped + # Check that the SUB_DB_NAME is dropped with endpoint.cursor() as cursor: - cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", ("subscriber_db",)) + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (SUB_DB_NAME,)) catalog_db = cursor.fetchone() assert catalog_db is None - # Check that we can still connect to the publisher_db - with endpoint.cursor(dbname="publisher_db") as cursor: + # Check that we can still connect to the PUB_DB_NAME + with endpoint.cursor(dbname=PUB_DB_NAME) as cursor: cursor.execute("SELECT * FROM current_database()") curr_db = cursor.fetchone() assert curr_db is not None assert len(curr_db) == 1 - assert curr_db[0] == "publisher_db" + assert curr_db[0] == PUB_DB_NAME -def test_compute_drop_role(neon_simple_env: NeonEnv): +def 
test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: NeonEnv): """ Test that compute_ctl can drop a role even if it has some depending objects - like permissions in one of the databases. + like permissions in one of the databases that were granted by + neon_superuser. + Reproduction test for https://github.com/neondatabase/cloud/issues/13582 """ env = neon_simple_env TEST_DB_NAME = "db_with_permissions" + TEST_GRANTEE = "'); MALFORMED SQL $$ $x$ $/;5%$ %I" endpoint = env.endpoints.create_start("main") @@ -301,16 +385,18 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): cursor.execute("create view test_view as select * from test_table") with endpoint.cursor(dbname=TEST_DB_NAME, user="neon") as cursor: - cursor.execute("create role readonly") + cursor.execute(f'create role "{TEST_GRANTEE}"') # We (`compute_ctl`) make 'neon' the owner of schema 'public' in the owned database. # Postgres has all sorts of permissions and grants that we may not handle well, # but this is the shortest repro grant for the issue # https://github.com/neondatabase/cloud/issues/13582 - cursor.execute("grant select on all tables in schema public to readonly") + cursor.execute(f'grant select on all tables in schema public to "{TEST_GRANTEE}"') # Check that role was created with endpoint.cursor() as cursor: - cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + cursor.execute( + "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE} + ) role = cursor.fetchone() assert role is not None @@ -318,7 +404,8 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): # that may block our ability to drop the role. 
with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: cursor.execute( - "select grantor from information_schema.role_table_grants where grantee = 'readonly'" + "select grantor from information_schema.role_table_grants where grantee = %(grantee)s", + {"grantee": TEST_GRANTEE}, ) res = cursor.fetchall() assert len(res) == 2, f"Expected 2 table grants, got {len(res)}" @@ -332,7 +419,7 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): "delta_operations": [ { "action": "delete_role", - "name": "readonly", + "name": TEST_GRANTEE, }, ], } @@ -341,7 +428,9 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): # Check that role is dropped with endpoint.cursor() as cursor: - cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + cursor.execute( + "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE} + ) role = cursor.fetchone() assert role is None @@ -370,3 +459,68 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly2'") role = cursor.fetchone() assert role is None + + +def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can drop a role if the role has previously been + granted table privileges by a role other than neon_superuser. + """ + TEST_DB_NAME = "neondb" + TEST_GRANTOR = "; RAISE EXCEPTION 'SQL injection detected;" + TEST_GRANTEE = "'$$; RAISE EXCEPTION 'SQL injection detected;'" + + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [ + { + # We need to create role via compute_ctl, because in this case it will receive + # additional grants equivalent to our real environment, so we can repro some + # issues. + "name": TEST_GRANTOR, + # Some autocomplete-suggested hash, no specific meaning. 
+ "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", + "options": [], + }, + ], + "databases": [ + { + "name": TEST_DB_NAME, + "owner": TEST_GRANTOR, + }, + ], + }, + } + ) + + endpoint.reconfigure() + + with endpoint.cursor(dbname=TEST_DB_NAME, user=TEST_GRANTOR) as cursor: + cursor.execute(f'CREATE USER "{TEST_GRANTEE}"') + cursor.execute("CREATE TABLE test_table(id bigint)") + cursor.execute(f'GRANT ALL ON TABLE test_table TO "{TEST_GRANTEE}"') + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": TEST_GRANTEE, + }, + ], + } + ) + endpoint.reconfigure() + + with endpoint.cursor() as cursor: + cursor.execute( + "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE} + ) + role = cursor.fetchone() + assert role is None diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index b360162dc1..85cd065a2f 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -3,12 +3,13 @@ from __future__ import annotations import enum import os import shutil -import sys from enum import StrEnum from logging import debug from pathlib import Path from typing import TYPE_CHECKING, cast +# Docs are available at https://jsonnet.org/ref/bindings.html#python_api +import _jsonnet import pytest import requests import yaml @@ -92,10 +93,6 @@ def jsonnet_evaluate_file( ext_vars: str | dict[str, str] | None = None, tla_vars: str | dict[str, str] | None = None, ) -> str: - # Jsonnet doesn't support Python 3.13 yet - # Docs are available at https://jsonnet.org/ref/bindings.html#python_api - import _jsonnet - return cast( "str", _jsonnet.evaluate_file( @@ -130,7 +127,6 @@ class SqlExporterProcess(StrEnum): AUTOSCALING = "autoscaling" -@pytest.mark.xfail(sys.version_info >= (3, 13), 
reason="Jsonnet doesn't support Python 3.13 yet") @pytest.mark.parametrize( "collector_name", ["neon_collector", "neon_collector_autoscaling"], @@ -359,7 +355,6 @@ else: self.__proc.wait() -@pytest.mark.xfail(sys.version_info >= (3, 13), reason="Jsonnet doesn't support Python 3.13 yet") @pytest.mark.parametrize( "exporter", [SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING], diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 7f12c14073..30f8c65cbd 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -2,26 +2,26 @@ from __future__ import annotations import os import shutil -from contextlib import closing +import tarfile from pathlib import Path from typing import TYPE_CHECKING import pytest +import zstandard from fixtures.log_helper import log from fixtures.metrics import parse_metrics -from fixtures.neon_fixtures import ( - NeonEnvBuilder, -) -from fixtures.pg_version import PgVersion -from fixtures.utils import skip_on_postgres from pytest_httpserver import HTTPServer -from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: from typing import Any from fixtures.httpserver import ListenAddress + from fixtures.neon_fixtures import ( + NeonEnvBuilder, + ) + from fixtures.pg_version import PgVersion + from werkzeug.wrappers.request import Request # use neon_env_builder_local fixture to override the default neon_env_builder fixture @@ -31,13 +31,13 @@ def neon_env_builder_local( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_distrib_dir: Path, - pg_version: PgVersion, ) -> NeonEnvBuilder: test_local_pginstall = test_output_dir / "pg_install" log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}") - shutil.copytree( - pg_distrib_dir / pg_version.v_prefixed, test_local_pginstall / pg_version.v_prefixed - ) + + # We can't copy only the version that we are 
currently testing because other + # binaries like the storage controller need specific Postgres versions. + shutil.copytree(pg_distrib_dir, test_local_pginstall) neon_env_builder.pg_distrib_dir = test_local_pginstall log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}") @@ -45,89 +45,92 @@ def neon_env_builder_local( return neon_env_builder -@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") -@skip_on_postgres(PgVersion.V17, reason="TODO: PG17 extension building") def test_remote_extensions( httpserver: HTTPServer, neon_env_builder_local: NeonEnvBuilder, httpserver_listen_address: ListenAddress, + test_output_dir: Path, + base_dir: Path, pg_version: PgVersion, ): - # setup mock http server - # that expects request for anon.tar.zst - # and returns the requested file + # Setup a mock nginx S3 gateway which will return our test extension. (host, port) = httpserver_listen_address extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" build_tag = os.environ.get("BUILD_TAG", "latest") - archive_path = f"{build_tag}/v{pg_version}/extensions/anon.tar.zst" + archive_route = f"{build_tag}/v{pg_version}/extensions/test_extension.tar.zst" + tarball = test_output_dir / "test_extension.tar" + extension_dir = ( + base_dir / "test_runner" / "regress" / "data" / "test_remote_extensions" / "test_extension" + ) - def endpoint_handler_build_tag(request: Request) -> Response: + # Create tarball + with tarfile.open(tarball, "x") as tarf: + tarf.add( + extension_dir / "sql" / "test_extension--1.0.sql", + arcname="share/extension/test_extension--1.0.sql", + ) + tarf.add( + extension_dir / "sql" / "test_extension--1.0--1.1.sql", + arcname="share/extension/test_extension--1.0--1.1.sql", + ) + + def handler(request: Request) -> Response: log.info(f"request: {request}") - file_name = "anon.tar.zst" - file_path = f"test_runner/regress/data/extension_test/5670669815/v{pg_version}/extensions/anon.tar.zst" - file_size = 
os.path.getsize(file_path) - fh = open(file_path, "rb") + # Compress tarball + compressor = zstandard.ZstdCompressor() + with open(tarball, "rb") as f: + compressed_data = compressor.compress(f.read()) return Response( - fh, + compressed_data, mimetype="application/octet-stream", headers=[ - ("Content-Length", str(file_size)), - ("Content-Disposition", f'attachment; filename="{file_name}"'), + ("Content-Length", str(len(compressed_data))), ], direct_passthrough=True, ) httpserver.expect_request( - f"/pg-ext-s3-gateway/{archive_path}", method="GET" - ).respond_with_handler(endpoint_handler_build_tag) + f"/pg-ext-s3-gateway/{archive_route}", method="GET" + ).respond_with_handler(handler) # Start a compute node with remote_extension spec # and check that it can download the extensions and use them to CREATE EXTENSION. env = neon_env_builder_local.init_start() env.create_branch("test_remote_extensions") - endpoint = env.endpoints.create( - "test_remote_extensions", - config_lines=["log_min_messages=debug3"], - ) + endpoint = env.endpoints.create("test_remote_extensions") + + with open(extension_dir / "test_extension.control", encoding="utf-8") as f: + control_data = f.read() # mock remote_extensions spec spec: dict[str, Any] = { - "public_extensions": ["anon"], + "public_extensions": ["test_extension"], "custom_extensions": None, "library_index": { - "anon": "anon", + "test_extension": "test_extension", }, "extension_data": { - "anon": { + "test_extension": { "archive_path": "", "control_data": { - "anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = 'Data anonymization tools'\ndefault_version = '1.1.0'\ndirectory='extension/anon'\nrelocatable = false\nrequires = 'pgcrypto'\nsuperuser = false\nmodule_pathname = '$libdir/anon'\ntrusted = true\n" + "test_extension.control": control_data, }, }, }, } - spec["extension_data"]["anon"]["archive_path"] = archive_path endpoint.create_remote_extension_spec(spec) - endpoint.start( - 
remote_ext_config=extensions_endpoint, - ) + endpoint.start(remote_ext_config=extensions_endpoint) - # this is expected to fail if there's no pgcrypto extension, that's ok - # we just want to check that the extension was downloaded - try: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - # Check that appropriate files were downloaded - cur.execute("CREATE EXTENSION anon") - res = [x[0] for x in cur.fetchall()] - log.info(res) - except Exception as err: - assert "pgcrypto" in str(err), f"unexpected error creating anon extension {err}" + with endpoint.connect() as conn: + with conn.cursor() as cur: + # Check that appropriate files were downloaded + cur.execute("CREATE EXTENSION test_extension VERSION '1.0'") + cur.execute("SELECT test_extension.motd()") httpserver.check() @@ -137,6 +140,48 @@ def test_remote_extensions( metrics = parse_metrics(raw_metrics) remote_ext_requests = metrics.query_all( "compute_ctl_remote_ext_requests_total", + # Check that we properly report the filename in the metrics + {"filename": "test_extension.tar.zst"}, + ) + assert len(remote_ext_requests) == 1 + for sample in remote_ext_requests: + assert sample.value == 1 + + endpoint.stop() + + # Remove the extension files to force a redownload of the extension. + for file in ( + "test_extension.control", + "test_extension--1.0.sql", + "test_extension--1.0--1.1.sql", + ): + ( + test_output_dir + / "pg_install" + / f"v{pg_version}" + / "share" + / "postgresql" + / "extension" + / file + ).unlink() + + endpoint.start(remote_ext_config=extensions_endpoint) + + # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions. 
+ with endpoint.connect() as conn: + with conn.cursor() as cur: + # Check that appropriate files were downloaded + cur.execute("ALTER EXTENSION test_extension UPDATE TO '1.1'") + cur.execute("SELECT test_extension.fun_fact()") + + # Check that we properly recorded downloads in the metrics + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + remote_ext_requests = metrics.query_all( + "compute_ctl_remote_ext_requests_total", + # Check that we properly report the filename in the metrics + {"filename": "test_extension.tar.zst"}, ) assert len(remote_ext_requests) == 1 for sample in remote_ext_requests: diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index ae2d171058..c8458b963e 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -6,9 +6,14 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.pageserver.http import PageserverHttpClient -def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): +def check_tenant( + env: NeonEnv, pageserver_http: PageserverHttpClient, safekeeper_proto_version: int +): tenant_id, timeline_id = env.create_tenant() - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + config_lines = [ + f"neon.safekeeper_proto_version = {safekeeper_proto_version}", + ] + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines) # we rely upon autocommit after each statement res_1 = endpoint.safe_psql_many( queries=[ @@ -33,7 +38,14 @@ def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): @pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)]) -def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int): +# Test both proto versions until we fully migrate. 
+@pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) +def test_normal_work( + neon_env_builder: NeonEnvBuilder, + num_timelines: int, + num_safekeepers: int, + safekeeper_proto_version: int, +): """ Basic test: * create new tenant with a timeline @@ -52,4 +64,4 @@ def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_s pageserver_http = env.pageserver.http_client() for _ in range(num_timelines): - check_tenant(env, pageserver_http) + check_tenant(env, pageserver_http, safekeeper_proto_version) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 55fd7a8608..17ffeca23b 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -7,7 +7,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until def check_client(env: NeonEnv, client: PageserverHttpClient): @@ -138,3 +138,25 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde with env.pageserver.http_client(auth_token=pageserver_token) as client: check_client(env, client) + + +@run_only_on_default_postgres("it does not use any postgres functionality") +def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + with env.pageserver.http_client() as client: + client.timeline_patch_index_part( + tenant_id, + timeline_id, + {"rel_size_migration": "migrating"}, + ) + assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "migrating" + # This is invalid in practice: we should never rollback the migrating state to legacy. + # But we do it here to test the API. 
+ client.timeline_patch_index_part( + tenant_id, + timeline_id, + {"rel_size_migration": "legacy"}, + ) + assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy" diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index a9b897b741..9f2aa5df8c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -87,8 +87,8 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=s3_storage(), ) - neon_env_builder.control_plane_compute_hook_api = ( - f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + neon_env_builder.control_plane_hooks_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/" ) def ignore_notify(request: Request): @@ -938,9 +938,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # Expect lots of layers assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 - # Simulate large data by making layer downloads artifically slow for ps in env.pageservers: + # Simulate large data by making layer downloads artifically slow ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + # Make the initial logical size calculation lie. Otherwise it on demand downloads + # layers and makes accounting difficult. 
+ ps.http_client().configure_failpoints(("skip-logical-size-calculation", "return")) def timeline_heatmap(tlid): assert env.pageserver_remote_storage is not None @@ -952,20 +955,16 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): raise RuntimeError(f"No heatmap for timeline: {tlid}") - # Upload a heatmap, so that secondaries have something to download - ps_attached.http_client().tenant_heatmap_upload(tenant_id) - heatmap_before_migration = timeline_heatmap(timeline_id) + def count_timeline_heatmap_layers(tlid) -> tuple[int, int]: + cold, hot = 0, 0 + layers = timeline_heatmap(tlid)["layers"] + for layer in layers: + if layer["cold"]: + cold += 1 + else: + hot += 1 - # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms. - # However, it pulls the heatmap, which will be important later. - http_client = env.storage_controller.pageserver_api() - (status, progress) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000) - assert status == 202 - assert progress["heatmap_mtime"] is not None - assert progress["layers_downloaded"] > 0 - assert progress["bytes_downloaded"] > 0 - assert progress["layers_total"] > progress["layers_downloaded"] - assert progress["bytes_total"] > progress["bytes_downloaded"] + return cold, hot env.storage_controller.allowed_errors.extend( [ @@ -975,8 +974,9 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # Use a custom configuration that gives up earlier than usual. # We can't hydrate everything anyway because of the failpoints. + # Implicitly, this also uploads a heatmap from the current attached location. 
config = StorageControllerMigrationConfig( - secondary_warmup_timeout="5s", secondary_download_request_timeout="2s" + secondary_warmup_timeout="5s", secondary_download_request_timeout="2s", prewarm=False ) env.storage_controller.tenant_shard_migrate( TenantShardId(tenant_id, shard_number=0, shard_count=0), ps_secondary.id, config @@ -988,31 +988,33 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_heatmap_upload(tenant_id) heatmap_after_migration = timeline_heatmap(timeline_id) - assert len(heatmap_before_migration["layers"]) > 0 + local_layers = ps_secondary.list_layers(tenant_id, timeline_id) + # We download 1 layer per second and give up within 5 seconds. + assert len(local_layers) < 10 after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"]) - assert len(heatmap_before_migration["layers"]) <= after_migration_heatmap_layers_count - log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") env.storage_controller.download_heatmap_layers( TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id ) - # Now simulate the case where a child timeline is archived, parent layers - # are evicted and the child is unarchived. When the child is unarchived, - # itself and the parent update their heatmaps to contain layers needed by the - # child. One can warm up the timeline hierarchy since the heatmaps are ready. 
- - def all_layers_downloaded(expected_layer_count: int): - local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) + def all_layers_downloaded(node, expected_layer_count: int): + local_layers_count = len(node.list_layers(tenant_id, timeline_id)) log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") assert local_layers_count >= expected_layer_count - wait_until(lambda: all_layers_downloaded(after_migration_heatmap_layers_count)) - ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + def no_layers_downloaded(node): + local_layers_count = len(node.list_layers(tenant_id, timeline_id)) + log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") + assert local_layers_count == 0 + + wait_until(lambda: all_layers_downloaded(ps_secondary, after_migration_heatmap_layers_count)) + + # Read everything and make sure that we're not downloading anything extra. + # All hot layers should be available locally now. before = ( ps_secondary.http_client() .get_metrics() @@ -1030,6 +1032,11 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): workload.stop() assert before == after + # Now simulate the case where a child timeline is archived, parent layers + # are evicted and the child is unarchived. When the child is unarchived, + # itself and the parent update their heatmaps to contain layers needed by the + # child. One can warm up the timeline hierarchy since the heatmaps are ready. 
+ def check_archival_state(state: TimelineArchivalState, tline): timelines = ( timeline["timeline_id"] @@ -1057,13 +1064,35 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): wait_until(lambda: check_archival_state(TimelineArchivalState.UNARCHIVED, child_timeline_id)) ps_secondary.http_client().tenant_heatmap_upload(tenant_id) - log.info(f"Parent timeline heatmap size: {len(timeline_heatmap(timeline_id)['layers'])}") - log.info(f"Child timeline heatmap size: {len(timeline_heatmap(child_timeline_id)['layers'])}") - expected_locally = len(timeline_heatmap(timeline_id)["layers"]) - assert expected_locally > 0 + parent_cold, parent_hot = count_timeline_heatmap_layers(timeline_id) + child_cold, child_hot = count_timeline_heatmap_layers(child_timeline_id) + + log.info(f"Parent timeline heatmap size: cold={parent_cold}, hot={parent_hot}") + log.info(f"Child timeline heatmap size: cold={child_cold}, hot={child_hot}") + + # All layers in the heatmap should come from the generation on unarchival. + # Hence, they should be cold. + assert parent_cold > 0 + assert parent_hot == 0 + + expected_locally = parent_cold env.storage_controller.download_heatmap_layers( - TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id + TenantShardId(tenant_id, shard_number=0, shard_count=0), child_timeline_id, recurse=True ) - wait_until(lambda: all_layers_downloaded(expected_locally)) + wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally)) + + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")]) + + # The uploaded heatmap is still empty. Clean up all layers on the secondary. + ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100) + wait_until(lambda: no_layers_downloaded(ps_attached)) + + # Upload a new heatmap. The previously cold layers become hot since they're now resident. 
+ ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + + # Warm up the current secondary. + ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100) + wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally)) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 411888efbc..1d9f385358 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,7 +5,7 @@ from __future__ import annotations from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Any, cast import pytest from fixtures.log_helper import log @@ -118,10 +118,20 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End pageserver.http_client().timeline_gc(shard, env.initial_timeline, None) +def patch_tenant_conf(tenant_conf: dict[str, Any], reldir_type: str) -> dict[str, Any]: + tenant_conf = tenant_conf.copy() + if reldir_type == "v2": + tenant_conf["rel_size_v2_enabled"] = "true" + else: + tenant_conf["rel_size_v2_enabled"] = "false" + return tenant_conf + + # Run the main PostgreSQL regression tests, in src/test/regress. 
# @pytest.mark.timeout(3000) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -130,6 +140,7 @@ def test_pg_regress( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "regression" @@ -142,7 +153,7 @@ def test_pg_regress( neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), initial_tenant_shard_count=shard_count, ) @@ -196,6 +207,7 @@ def test_pg_regress( # @pytest.mark.timeout(1500) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_isolation( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -204,6 +216,7 @@ def test_isolation( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "isolation_regression" @@ -211,7 +224,8 @@ def test_isolation( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), + initial_tenant_shard_count=shard_count, ) # Connect to postgres and create a database called "regression". @@ -267,6 +281,7 @@ def test_isolation( # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. 
@pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_sql_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -275,6 +290,7 @@ def test_sql_regress( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "regression" @@ -282,7 +298,8 @@ def test_sql_regress( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), + initial_tenant_shard_count=shard_count, ) # Connect to postgres and create a database called "regression". @@ -332,8 +349,10 @@ def test_sql_regress( @skip_in_debug_build("only run with release build") +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_tx_abort_with_many_relations( neon_env_builder: NeonEnvBuilder, + reldir_type: str, ): """ This is not a pg_regress test as such, but perhaps it should be -- this test exercises postgres @@ -342,7 +361,9 @@ def test_tx_abort_with_many_relations( Reproducer for https://github.com/neondatabase/neon/issues/9505 """ - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf=patch_tenant_conf({}, reldir_type), + ) ep = env.endpoints.create_start( "main", tenant_id=env.initial_tenant, @@ -352,50 +373,78 @@ def test_tx_abort_with_many_relations( ], ) + if reldir_type == "v1": + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + else: + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + != "legacy" + ) + # How many relations: this number is tuned to be long enough to take tens of seconds # if the rollback code path is buggy, tripping the test's timeout. 
- n = 4000 + n = 5000 + step = 2500 def create(): # Create many relations log.info(f"Creating {n} relations...") - ep.safe_psql_many( - [ - "BEGIN", - f"""DO $$ - DECLARE - i INT; - table_name TEXT; - BEGIN - FOR i IN 1..{n} LOOP - table_name := 'table_' || i; - EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; - END LOOP; - END $$; - """, - "COMMIT", - ] - ) + begin = 0 + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; + END LOOP; + END $$; + """, + "COMMIT", + ] + ) + begin = end + if begin >= n: + break def truncate(): # Truncate relations, then roll back the transaction containing the truncations log.info(f"Truncating {n} relations...") - ep.safe_psql_many( - [ - "BEGIN", - f"""DO $$ - DECLARE - i INT; - table_name TEXT; - BEGIN - FOR i IN 1..{n} LOOP - table_name := 'table_' || i; - EXECUTE 'TRUNCATE ' || table_name ; - END LOOP; - END $$; - """, - ] - ) + begin = 0 + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'TRUNCATE ' || table_name ; + END LOOP; + END $$; + """, + ] + ) + begin = end + if begin >= n: + break def rollback_and_wait(): log.info(f"Rolling back after truncating {n} relations...") diff --git a/test_runner/regress/test_pgstat.py b/test_runner/regress/test_pgstat.py index c31e5ef7f8..bf9b982e14 100644 --- a/test_runner/regress/test_pgstat.py +++ b/test_runner/regress/test_pgstat.py @@ -13,7 +13,7 @@ def test_pgstat(neon_simple_env: NeonEnv): n = 10000 endpoint = env.endpoints.create_start( - "main", config_lines=["neon_pgstat_file_size_limit=100kB", "autovacuum=off"] + "main", 
config_lines=["neon.pgstat_file_size_limit=100kB", "autovacuum=off"] ) con = endpoint.connect() diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py index 3e29c92a96..07eacfc775 100644 --- a/test_runner/regress/test_relations.py +++ b/test_runner/regress/test_relations.py @@ -19,6 +19,17 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)") endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)") + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + + # Ensure the pageserver accepts the table creation SQLs before the migration. In theory, we can also do + # a "wait_flush_lsn" here, but it's easier to just do a restart. + env.pageserver.restart() + # Switch to v2 env.pageserver.http_client().update_tenant_config( env.initial_tenant, @@ -27,6 +38,13 @@ def test_pageserver_reldir_v2( }, ) + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + # Check if both relations are still accessible endpoint.safe_psql("SELECT * FROM foo1") endpoint.safe_psql("SELECT * FROM foo2") @@ -41,12 +59,14 @@ def test_pageserver_reldir_v2( # Create a relation in v2 endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo4 (id INTEGER PRIMARY KEY, val text)") # Delete a relation in v1 endpoint.safe_psql("DROP TABLE foo1") # Check if both relations are still accessible endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("SELECT * FROM foo3") + endpoint.safe_psql("SELECT * FROM foo4") # Restart the endpoint endpoint.stop() @@ -57,7 +77,7 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("DROP TABLE IF EXISTS foo1") endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("SELECT * FROM foo3") - + endpoint.safe_psql("SELECT * FROM 
foo4") endpoint.safe_psql("DROP TABLE foo3") endpoint.stop() endpoint.start() @@ -66,3 +86,25 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("DROP TABLE IF EXISTS foo1") endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("DROP TABLE IF EXISTS foo3") + endpoint.safe_psql("SELECT * FROM foo4") + + # Set the config to false to emulate the case where the config is not persisted when the tenant gets detached/attached. + env.pageserver.http_client().update_tenant_config( + env.initial_tenant, + { + "rel_size_v2_enabled": False, + }, + ) + + # Check if the relation is still accessible + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo4") + + env.pageserver.restart() + + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "migrating" + ) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index c39c74fa2a..e8721f1ea0 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -29,7 +29,6 @@ from fixtures.remote_storage import ( from fixtures.utils import ( assert_eq, assert_ge, - assert_gt, print_gc_result, query_scalar, wait_until, @@ -334,14 +333,12 @@ def test_remote_storage_upload_queue_retries( # Exponential back-off in upload queue, so, gracious timeouts. wait_until( - lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30 + lambda: assert_ge(get_queued_count(file_kind="layer", op_kind="upload"), 1), timeout=30 ) wait_until( lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1), timeout=30 ) - wait_until( - lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30 - ) + # There may or may not be deletes queued up behind conflicting uploads; don't check. 
# unblock churn operations configure_storage_sync_failpoints("off") @@ -786,54 +783,6 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() -def test_paused_upload_stalls_checkpoint( - neon_env_builder: NeonEnvBuilder, -): - """ - This test checks that checkpoints block on uploads to remote storage. - """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start( - initial_tenant_conf={ - # Set a small compaction threshold - "compaction_threshold": "3", - # Disable GC - "gc_period": "0s", - # disable PITR - "pitr_interval": "0s", - } - ) - - env.pageserver.allowed_errors.append( - f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - client = env.pageserver.http_client() - layers_at_creation = client.layer_map_info(tenant_id, timeline_id) - deltas_at_creation = len(layers_at_creation.delta_layers()) - assert ( - deltas_at_creation == 1 - ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle" - - # Make new layer uploads get stuck. - # Note that timeline creation waits for the initial layers to reach remote storage. - # So at this point, the `layers_at_creation` are in remote storage. 
- client.configure_failpoints(("before-upload-layer-pausable", "pause")) - - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - # Build two tables with some data inside - endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - with pytest.raises(ReadTimeout): - client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): diff --git a/test_runner/regress/test_safekeeper_deletion.py b/test_runner/regress/test_safekeeper_deletion.py new file mode 100644 index 0000000000..b46095d583 --- /dev/null +++ b/test_runner/regress/test_safekeeper_deletion.py @@ -0,0 +1,331 @@ +from __future__ import annotations + +import threading +import time +from contextlib import closing +from enum import StrEnum + +import pytest +import requests +from fixtures.common_types import Lsn, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnvBuilder, +) +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.safekeeper_utils import is_segment_offloaded +from fixtures.utils import wait_until + + +@pytest.mark.parametrize("auth_enabled", [False, True]) +def test_safekeeper_delete_timeline(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.auth_enabled = auth_enabled + env = neon_env_builder.init_start() + + # FIXME: are these expected? + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found in global map.*", + ".*Timeline .* was cancelled and cannot be used anymore.*", + ] + ) + + # Create two tenants: one will be deleted, other should be preserved. 
+ tenant_id = env.initial_tenant + timeline_id_1 = env.create_branch("br1") # Active, delete explicitly + timeline_id_2 = env.create_branch("br2") # Inactive, delete explicitly + timeline_id_3 = env.create_branch("br3") # Active, delete with the tenant + timeline_id_4 = env.create_branch("br4") # Inactive, delete with the tenant + + tenant_id_other, timeline_id_other = env.create_tenant() + + # Populate branches + endpoint_1 = env.endpoints.create_start("br1") + endpoint_2 = env.endpoints.create_start("br2") + endpoint_3 = env.endpoints.create_start("br3") + endpoint_4 = env.endpoints.create_start("br4") + endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other) + for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key)") + sk = env.safekeepers[0] + sk_data_dir = sk.data_dir + if not auth_enabled: + sk_http = sk.http_client() + sk_http_other = sk_http + else: + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + sk_http_other = sk.http_client( + auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) + ) + sk_http_noauth = sk.http_client(gen_sk_wide_token=False) + assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. 
+ endpoint_2.stop_and_destroy() + endpoint_4.stop_and_destroy() + sk.stop() + sk.start() + + # Ensure connections to Safekeeper are established + for endpoint in [endpoint_1, endpoint_3, endpoint_other]: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (1)") + + # Stop all computes gracefully before safekeepers stop responding to them + endpoint_1.stop_and_destroy() + endpoint_3.stop_and_destroy() + + # Remove initial tenant's br1 (active) + assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure repeated deletion succeeds + assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + if auth_enabled: + # Ensure we cannot delete the other tenant + for sk_h in [sk_http, sk_http_noauth]: + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.timeline_delete(tenant_id_other, timeline_id_other) + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.tenant_delete_force(tenant_id_other) + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant's br2 (inactive) + assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"] + 
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove non-existing branch, should succeed + assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant fully (two branches are active) + response = sk_http.tenant_delete_force(tenant_id) + assert response[str(timeline_id_3)]["dir_existed"] + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant again. + response = sk_http.tenant_delete_force(tenant_id) + # assert response == {} + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure the other tenant still works + sk_http_other.timeline_status(tenant_id_other, timeline_id_other) + with closing(endpoint_other.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (123)") + + +def test_safekeeper_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): + """ + Test deleting timelines on a safekeeper while they're under load. 
+ + This should not happen under normal operation, but it can happen if + there is some rogue compute/pageserver that is writing/reading to a + safekeeper that we're migrating a timeline away from, or if the timeline + is being deleted while such a rogue client is running. + """ + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + + # Create two endpoints that will generate load + timeline_id_a = env.create_branch("deleteme_a") + timeline_id_b = env.create_branch("deleteme_b") + + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + endpoint_b = env.endpoints.create("deleteme_b") + endpoint_b.start() + + # Get tenant and timeline IDs + tenant_id = env.initial_tenant + + # Start generating load on both timelines + def generate_load(endpoint: Endpoint): + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") + while True: + try: + cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") + except: # noqa + # Ignore errors since timeline may be deleted + break + + t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) + t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) + try: + t_a.start() + t_b.start() + + # Let the load run for a bit + log.info("Warming up...") + time.sleep(2) + + # Safekeeper errors will propagate to the pageserver: it is correct that these are + # logged at error severity because they indicate the pageserver is trying to read + # a timeline that it shouldn't. 
+ env.pageserver.allowed_errors.extend( + [ + ".*Timeline.*was cancelled.*", + ".*Timeline.*was not found.*", + ] + ) + + # Try deleting timelines while under load + sk = env.safekeepers[0] + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + + # Delete first timeline + log.info(f"Deleting {timeline_id_a}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] + + # Delete second timeline + log.info(f"Deleting {timeline_id_b}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] + + # Verify timelines are gone from disk + sk_data_dir = sk.data_dir + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() + # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() + + finally: + log.info("Stopping endpoints...") + # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang + endpoint_a.stop(mode="immediate") + endpoint_b.stop(mode="immediate") + log.info("Joining threads...") + t_a.join() + t_b.join() + + +class RemoteDeleteFailpoint(StrEnum): + PAUSE = "sk-delete-timeline-remote-pause" + FAIL = "sk-delete-timeline-remote" + + +@pytest.mark.parametrize("failpoint", [RemoteDeleteFailpoint.PAUSE, RemoteDeleteFailpoint.FAIL]) +def test_safekeeper_delete_remote_errors( + neon_env_builder: NeonEnvBuilder, failpoint: RemoteDeleteFailpoint +): + """ + Test that errors and delays during remote deletion are handled correctly. 
+ """ + + # Configure safekeepers with ultra-fast eviction policy + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--control-file-save-interval", + "1s", + ] + neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + env = neon_env_builder.init_start() + + # FIXME: pageserver is intermittently emitting this + env.pageserver.allowed_errors.extend( + [ + ".*unsupported command START_WAL_PUSH in START_WAL_PUSH.*", + ] + ) + + timeline_id_a = env.create_branch("deleteme_a") + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + with closing(endpoint_a.connect()) as conn: + with conn.cursor() as cur: + # roughly fills one segment + cur.execute("create table t(key int, value text)") + cur.execute("insert into t select generate_series(1,250000), 'payload'") + endpoint_a.stop() + + # Ensure something is uploaded to remote storage + def assert_is_uploaded(): + assert is_segment_offloaded( + env.safekeepers[0], env.initial_tenant, timeline_id_a, Lsn("0/2000000") + ) + + wait_until(assert_is_uploaded) + + def list_timeline_remote(): + assert isinstance(env.safekeepers_remote_storage, S3Storage) + prefix = f"{env.safekeepers_remote_storage.safekeeper_timeline_path(env.initial_tenant, timeline_id_a)}/" + + listing = env.safekeepers_remote_storage.client.list_objects_v2( + Bucket=env.safekeepers_remote_storage.bucket_name, + Prefix=prefix, + ) + return listing.get("Contents", []) + + assert list_timeline_remote() != [] + + sk_http = env.safekeepers[0].http_client() + env.pageserver.http_client().timeline_delete(env.initial_tenant, timeline_id_a) + + # Set up failpoint + if failpoint == RemoteDeleteFailpoint.PAUSE: + sk_http.configure_failpoints((failpoint, "pause")) + elif failpoint == RemoteDeleteFailpoint.FAIL: + sk_http.configure_failpoints((failpoint, "return")) + else: + raise NotImplementedError(f"Unknown failpoint: {failpoint}") + + # Delete the timeline - this should hit the configured failpoint 
+ if failpoint == RemoteDeleteFailpoint.PAUSE: + # Expect time out + with pytest.raises(requests.exceptions.ReadTimeout, match="timed out"): + sk_http.timeline_delete(env.initial_tenant, timeline_id_a, timeout=5) + + # Assert deletion didn't happen yet + assert list_timeline_remote() != [] + + # Unblock the background task that should still be running + sk_http.configure_failpoints((failpoint, "off")) + + # Expect that after unblocking, remote deletion proceeds + def assert_remote_deleted(): + assert list_timeline_remote() == [] + + wait_until(assert_remote_deleted) + + elif failpoint == RemoteDeleteFailpoint.FAIL: + # Expect immediate failure + with pytest.raises(sk_http.HTTPError, match="Internal Server Error"): + sk_http.timeline_delete(env.initial_tenant, timeline_id_a) + + sk_http.configure_failpoints((failpoint, "off")) + else: + raise NotImplementedError(f"Unknown failpoint: {failpoint}") + + # Retry should succeed + sk_http.timeline_delete(env.initial_tenant, timeline_id_a) + + # Remote storage should be empty + assert list_timeline_remote() == [] diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index f58bbcd3c0..b98ac8e50a 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -794,7 +794,7 @@ def test_sharding_split_stripe_size( Check that modifying stripe size inline with a shard split works as expected """ (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" neon_env_builder.num_pageservers = 1 # Set up fake HTTP notify endpoint: we will use this to validate that we receive @@ -806,7 +806,7 @@ def test_sharding_split_stripe_size( notifications.append(request.json) return Response(status=200) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", 
method="PUT").respond_with_handler(handler) env = neon_env_builder.init_start( initial_tenant_shard_count=1, initial_tenant_shard_stripe_size=initial_stripe_size @@ -1312,9 +1312,7 @@ def test_sharding_split_failures( failure: Failure, ): neon_env_builder.num_pageservers = 4 - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api initial_shard_count = 2 split_shard_count = 4 @@ -1814,14 +1812,3 @@ def test_sharding_gc( shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn - - for ps in env.pageservers: - # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by - # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. 
- # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.extend( - [ - ".*could not find data for key.*", - ".*could not ingest record.*", - ] - ) diff --git a/test_runner/regress/test_signed_char.py b/test_runner/regress/test_signed_char.py new file mode 100644 index 0000000000..8752a1ff3f --- /dev/null +++ b/test_runner/regress/test_signed_char.py @@ -0,0 +1,64 @@ +from pathlib import Path + +from fixtures.neon_fixtures import NeonEnv + +SIGNED_CHAR_EXTRACT = """ + WITH + -- Generates an intermediate table with block numbers of the index + pagenumbers AS ( + SELECT num FROM generate_series(0, (pg_relation_size('test_payload_idx') / 8192) - 1) it(num) + ) + SELECT num, + -- Gets the data of the page, skipping the first 8 bytes which is the LSN + substr(page, 9, 8192-8), + -- Returns information about the GIN index opaque area + (gin_page_opaque_info(page)).* + FROM pagenumbers, + -- Gets a page from the respective blocks of the table + LATERAL (SELECT get_raw_page('test_payload_idx', num)) AS p(page) + -- Filters to only return leaf pages from the GIN Index + WHERE ARRAY['leaf'] = ((gin_page_opaque_info(page)).flags); + """ + + +def test_signed_char(neon_simple_env: NeonEnv): + """ + Test that postgres was compiled with -fsigned-char. + --- + In multi-character keys, the GIN index creates a CRC Hash of the first 3 bytes of the key. + The hash can have the first bit to be set or unset, needing to have a consistent representation + of char across architectures for consistent results. GIN stores these keys by their hashes + which determines the order in which the keys are obtained from the GIN index. + Using -fsigned-char enforces this order across platforms making this consistent. + The following query gets all the data present in the leaf page of a GIN index, + which is ordered by the CRC hash and is consistent across platforms. 
+ """ + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + + with endpoint.connect().cursor() as ses1: + # Add the required extensions + ses1.execute("CREATE EXTENSION pg_trgm;") + ses1.execute("CREATE EXTENSION pageinspect;") + # Create a test table + ses1.execute("CREATE TABLE test (payload text);") + # Create a GIN based index + ses1.execute( + "CREATE INDEX test_payload_idx ON test USING gin (payload gin_trgm_ops) WITH (gin_pending_list_limit = 64);" + ) + # insert a multibyte character to trigger order-dependent hashing + ses1.execute( + "INSERT INTO test SELECT '123456789BV' || CHR(127153) /* ace of spades, a multibyte character */ || i::text from generate_series(1, 40) as i(i);" + ) + ses1.execute("INSERT INTO test SELECT 'Bóbr';") + # Clean pending list to flush data to pages + ses1.execute("select gin_clean_pending_list('test_payload_idx'::regclass);") + ses1.execute(SIGNED_CHAR_EXTRACT) + pages = ses1.fetchall() + # Compare expected output + page1 = pages[0] + data = bytes(page1[1]).hex() + with open(Path(__file__).parent / "data" / "test_signed_char.out", encoding="utf-8") as f: + expected = f.read().rstrip() + + assert data == expected diff --git a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py new file mode 100644 index 0000000000..7db4a16f49 --- /dev/null +++ b/test_runner/regress/test_ssl.py @@ -0,0 +1,68 @@ +import pytest +import requests +from fixtures.neon_fixtures import NeonEnvBuilder, StorageControllerApiException +from fixtures.utils import wait_until + + +def test_pageserver_https_api(neon_env_builder: NeonEnvBuilder): + """ + Test HTTPS pageserver management API. + If NeonEnv starts with use_https_pageserver_api with no errors, it's already a success. + Make /v1/status request to HTTPS API to ensure it's appropriately configured. 
+ """ + neon_env_builder.use_https_pageserver_api = True + env = neon_env_builder.init_start() + + addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status" + requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + +def test_safekeeper_https_api(neon_env_builder: NeonEnvBuilder): + """ + Test HTTPS safekeeper management API. + 1. Make /v1/status request to HTTPS API to ensure it's appropriately configured. + 2. Try to register safekeeper in storcon with https port missing. + 3. Register safekeeper with https port. + 4. Wait for a heartbeat round to complete. + """ + neon_env_builder.use_https_safekeeper_api = True + env = neon_env_builder.init_start() + + sk = env.safekeepers[0] + + # 1. Make simple https request. + addr = f"https://localhost:{sk.port.https}/v1/status" + requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + # Note: http_port is intentionally wrong. + # Storcon should not use it if use_https is on. + http_port = 0 + + body = { + "active": True, + "id": sk.id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "localhost", + "port": sk.port.pg, + "http_port": http_port, + "https_port": None, + "version": 5957, + "availability_zone_id": "us-east-2b", + } + # 2. Try register with https port missing. + with pytest.raises(StorageControllerApiException, match="https port is not specified"): + env.storage_controller.on_safekeeper_deploy(sk.id, body) + + # 3. Register with https port. + body["https_port"] = sk.port.https + env.storage_controller.on_safekeeper_deploy(sk.id, body) + + # 4. Wait for a heartbeat round to complete. 
+ def storcon_heartbeat(): + assert env.storage_controller.log_contains( + "Heartbeat round complete for 1 safekeepers, 0 offline" + ) + + wait_until(storcon_heartbeat) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index d5acc257b2..05eb4301b0 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -26,6 +26,7 @@ from fixtures.neon_fixtures import ( PgBin, StorageControllerApiException, StorageControllerLeadershipStatus, + StorageControllerMigrationConfig, last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient @@ -604,7 +605,7 @@ def test_storage_controller_compute_hook( # when migrating. neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" # Set up fake HTTP notify endpoint notifications = [] @@ -617,7 +618,7 @@ def test_storage_controller_compute_hook( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -723,7 +724,7 @@ def test_storage_controller_stuck_compute_hook( neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" handle_params = {"status": 200} @@ -735,7 +736,7 @@ def test_storage_controller_stuck_compute_hook( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", 
method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -765,7 +766,10 @@ def test_storage_controller_stuck_compute_hook( # status is cleared. handle_params["status"] = 423 migrate_fut = executor.submit( - env.storage_controller.tenant_shard_migrate, shard_0_id, dest_ps_id + env.storage_controller.tenant_shard_migrate, + shard_0_id, + dest_ps_id, + config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), ) def logged_stuck(): @@ -793,7 +797,10 @@ def test_storage_controller_stuck_compute_hook( # Now, do a migration in the opposite direction handle_params["status"] = 423 migrate_fut = executor.submit( - env.storage_controller.tenant_shard_migrate, shard_0_id, origin_pageserver.id + env.storage_controller.tenant_shard_migrate, + shard_0_id, + origin_pageserver.id, + config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), ) def logged_stuck_again(): @@ -864,7 +871,7 @@ def test_storage_controller_compute_hook_retry( neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" handle_params = {"status": 200} @@ -876,7 +883,7 @@ def test_storage_controller_compute_hook_retry( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_configs() @@ -986,7 +993,7 @@ def test_storage_controller_compute_hook_revert( # when migrating. 
neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" # Set up fake HTTP notify endpoint notifications = [] @@ -999,7 +1006,7 @@ def test_storage_controller_compute_hook_revert( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -1027,7 +1034,11 @@ def test_storage_controller_compute_hook_revert( with pytest.raises(StorageControllerApiException, match="Timeout waiting for shard"): # We expect the controller to give us an error because its reconciliation timed out # waiting for the compute hook. - env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_b.id) + env.storage_controller.tenant_shard_migrate( + tenant_shard_id, + pageserver_b.id, + config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), + ) # Although the migration API failed, the hook should still see pageserver B (it remembers what # was posted even when returning an error code) @@ -1068,7 +1079,11 @@ def test_storage_controller_compute_hook_revert( # Migrate B -> A, with a working compute hook: the controller should notify the hook because the # last update it made that was acked (423) by the compute was for node B. 
handle_params["status"] = 200 - env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id) + env.storage_controller.tenant_shard_migrate( + tenant_shard_id, + pageserver_a.id, + config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), + ) wait_until(lambda: notified_ps(pageserver_a.id)) @@ -1380,9 +1395,7 @@ def test_storage_controller_tenant_deletion( """ neon_env_builder.num_pageservers = 4 neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api env = neon_env_builder.init_configs() env.start() @@ -1734,18 +1747,23 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): # Restart the failed pageserver victim_ps.start() + env.storage_controller.reconcile_until_idle() + # We expect that the re-attach call correctly tipped off the pageserver that its locations # are all secondaries now. locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"] assert len(locations) == 2 assert all(loc[1]["mode"] == "Secondary" for loc in locations) - # We expect that this situation resulted from the re_attach call, and not any explicit - # Reconciler runs: assert that the reconciliation count has not gone up since we restarted. 
+ # We expect that this situation resulted from background reconciliations + # Reconciler runs: assert that the reconciliation count has gone up by exactly + # one for each shard reconciles_after_restart = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) - assert reconciles_after_restart == reconciles_before_restart + + assert reconciles_before_restart is not None + assert reconciles_after_restart == reconciles_before_restart + 2 def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder): @@ -1949,6 +1967,9 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_attached"] ) ), + # A simple migration where we will ignore scheduling (force=true) and do it immediately (prewarm=false) + "--prewarm=false", + "--override-scheduler=true", ] ) @@ -3208,6 +3229,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): "host": "localhost", "port": sk_0.port.pg, "http_port": sk_0.port.http, + "https_port": None, "version": 5957, "availability_zone_id": "us-east-2b", } @@ -3242,6 +3264,24 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert eq_safekeeper_records(body, inserted_now) + # https_port appears during migration + body["https_port"] = 123 + target.on_safekeeper_deploy(fake_id, body) + inserted_now = target.get_safekeeper(fake_id) + assert target.get_safekeepers() == [inserted_now] + assert inserted_now is not None + assert eq_safekeeper_records(body, inserted_now) + env.storage_controller.consistency_check() + + # https_port rollback + body["https_port"] = None + target.on_safekeeper_deploy(fake_id, body) + inserted_now = target.get_safekeeper(fake_id) + assert target.get_safekeepers() == [inserted_now] + assert inserted_now is not None + assert eq_safekeeper_records(body, inserted_now) + env.storage_controller.consistency_check() + # some small 
tests for the scheduling policy querying and returning APIs newest_info = target.get_safekeeper(inserted["id"]) assert newest_info @@ -3774,6 +3814,7 @@ def test_storage_controller_node_flap_detach_race( wait_until(validate_locations, timeout=10) +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder): """ Check that storage controller handles node_register requests with updated fields correctly. @@ -3865,3 +3906,108 @@ def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvB ) assert reconciles_after_restart == 0 + + +@pytest.mark.parametrize("wrong_az", [True, False]) +def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder, wrong_az: bool): + """ + Test that the graceful migration API goes through the process of + creating a secondary & waiting for it to warm up before cutting over, when + we use the prewarm=True flag to the API. + """ + + # 2 pageservers in 2 AZs, so that each AZ has a pageserver we can migrate to + neon_env_builder.num_pageservers = 4 + neon_env_builder.num_azs = 2 + + env = neon_env_builder.init_start() + + # Enable secondary location (neon_local disables by default) + env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}}) + env.storage_controller.reconcile_until_idle() + + initial_desc = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0] + initial_ps_id = initial_desc["node_attached"] + initial_secondary_id = initial_desc["node_secondary"][0] + initial_ps_az = initial_desc["preferred_az_id"] + initial_ps = [ps for ps in env.pageservers if ps.id == initial_ps_id][0] + + if wrong_az: + dest_ps = [ + ps + for ps in env.pageservers + if ps.id != initial_ps_id + and ps.az_id != initial_ps_az + and ps.id != initial_secondary_id + ][0] + else: + dest_ps = [ + ps + for ps in env.pageservers + if ps.id != initial_ps_id + and ps.az_id == 
initial_ps_az + and ps.id != initial_secondary_id + ][0] + + log.info( + f"Migrating to {dest_ps.id} in AZ {dest_ps.az_id} (from {initial_ps_id} in AZ {initial_ps_az})" + ) + dest_ps_id = dest_ps.id + + # Set a failpoint so that the migration will block at the point it has a secondary location + for ps in env.pageservers: + ps.http_client().configure_failpoints(("secondary-layer-download-pausable", "pause")) + + # Before migration, our destination has no locations. Guaranteed because any secondary for our + # tenant will be in another AZ. + assert dest_ps.http_client().tenant_list_locations()["tenant_shards"] == [] + + if wrong_az: + # If migrating to the wrong AZ, first check that omitting force flag results in rejection + with pytest.raises(StorageControllerApiException, match="worse-scoring node"): + env.storage_controller.tenant_shard_migrate( + TenantShardId(env.initial_tenant, 0, 0), + dest_ps_id, + config=StorageControllerMigrationConfig(prewarm=True, override_scheduler=False), + ) + + # Turn off ordinary optimisations so that our migration will stay put once complete + env.storage_controller.tenant_policy_update(env.initial_tenant, {"scheduling": "Essential"}) + + # We expect this API call to succeed, and result in a new secondary location on the destination + env.storage_controller.tenant_shard_migrate( + TenantShardId(env.initial_tenant, 0, 0), + dest_ps_id, + config=StorageControllerMigrationConfig(prewarm=True, override_scheduler=wrong_az), + ) + + def secondary_at_dest(): + locs = dest_ps.http_client().tenant_list_locations()["tenant_shards"] + assert len(locs) == 1 + assert locs[0][0] == str(env.initial_tenant) + assert locs[0][1]["mode"] == "Secondary" + + wait_until(secondary_at_dest) + + # Unblock secondary downloads + for ps in env.pageservers: + ps.http_client().configure_failpoints(("secondary-layer-download-pausable", "off")) + + # Pump the reconciler to avoid waiting for background reconciles + env.storage_controller.reconcile_until_idle() + + 
# We should be attached at the destination + locs = dest_ps.http_client().tenant_list_locations()["tenant_shards"] + assert len(locs) == 1 + assert locs[0][1]["mode"] == "AttachedSingle" + + # Nothing left behind at the origin + if wrong_az: + # We're in essential scheduling mode, so the end state should be attached in the migration + # destination and a secondary in the original location + assert ( + initial_ps.http_client().tenant_list_locations()["tenant_shards"][0][1]["mode"] + == "Secondary" + ) + else: + assert initial_ps.http_client().tenant_list_locations()["tenant_shards"] == [] diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index d44c176b35..0f4e5688a9 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -312,17 +312,6 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ drop_local_state(env, tenant_id) workload.validate() - for ps in env.pageservers: - # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by - # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. 
- # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.extend( - [ - ".*could not find data for key.*", - ".*could not ingest record.*", - ] - ) - def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 713f89c60f..81e727a3aa 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -436,7 +436,7 @@ def test_single_branch_get_tenant_size_grows( # when our tenant is configured with a tiny pitr interval, dropping a table should # cause synthetic size to go down immediately tenant_config["pitr_interval"] = "0s" - env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config) + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, tenant_config) (current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index c17840d31c..2bad0bb671 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -42,6 +42,17 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): # If we run the unsharded version, talk to the storage controller ps_http = env.storage_controller.pageserver_api() + for ps in env.pageservers: + # We make /archival_config requests that are intended to fail. 
+ # It's expected that storcon drops requests to other pageservers after + # it gets the first error (https://github.com/neondatabase/neon/issues/11177) + ps.allowed_errors.extend( + [ + ".*WARN.* path=/v1/tenant/.*/archival_config .*request was dropped before completing", + ".*ERROR.* path=/v1/tenant/.*/archival_config .*Cancelled request finished with an error.*", + ] + ) + # first try to archive a non existing timeline for an existing tenant: invalid_timeline_id = TimelineId.generate() with pytest.raises(PageserverApiException, match="timeline not found") as exc: diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 612a767480..685a32af90 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -319,8 +319,9 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): # this does not contain Z in the end, so fromisoformat accepts it # it is to be in line with the deletion timestamp.. well, almost. when = original_ancestor[2][:26] - when_ts = datetime.datetime.fromisoformat(when) - assert when_ts < datetime.datetime.now() + when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC) + now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC) + assert when_ts < now assert len(lineage.get("reparenting_history", [])) == 0 elif expected_ancestor == timeline_id: assert len(lineage.get("original_ancestor", [])) == 0 @@ -342,6 +343,140 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) +def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): + """ + Test the v2 behavior of ancestor detach. 
+ + old main -------|---------X---------> + | | | + | | +-> after + | +--X empty snapshot branch + | | + | +-> branch-to-detach + | + +-> earlier + + Ends up as: + + old main -------|---------X---------> + | | | + | | +-> after + | +--X empty snapshot branch + | + +-> earlier + + + new main -------|---------|----> branch-to-detach + """ + + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + + branchpoint_pipe = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + earlier = env.create_branch( + "earlier", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_pipe + ) + + snapshot_branchpoint = env.create_branch( + "snapshot_branchpoint", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_x + ) + + branch_to_detach = env.create_branch( + "branch_to_detach", + ancestor_branch_name="snapshot_branchpoint", + ancestor_start_lsn=branchpoint_x, + ) + + after = env.create_branch("after", ancestor_branch_name="main", ancestor_start_lsn=None) + + all_reparented = client.detach_ancestor( + env.initial_tenant, branch_to_detach, detach_behavior="v2" + ) + assert set(all_reparented) == set() + + env.pageserver.quiesce_tenants() + + # checking the ancestor after is much faster than waiting for the endpoint not start + expected_result = [ + 
("main", env.initial_timeline, None, 16384, 1), + ("after", after, env.initial_timeline, 16384, 1), + ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 8192, 1), + ("branch_to_detach", branch_to_detach, None, 8192, 1), + ("earlier", earlier, env.initial_timeline, 0, 1), + ] + + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + for branch_name, queried_timeline, expected_ancestor, _, _ in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) + ancestor_timeline_id = details["ancestor_timeline_id"] + if expected_ancestor is None: + assert ancestor_timeline_id is None + else: + assert ( + TimelineId(ancestor_timeline_id) == expected_ancestor + ), f"when checking branch {branch_name}, mapping={expected_result}" + + index_part = env.pageserver_remote_storage.index_content( + env.initial_tenant, queried_timeline + ) + lineage = index_part["lineage"] + assert lineage is not None + + assert lineage.get("reparenting_history_overflown", "false") == "false" + + if queried_timeline == branch_to_detach: + original_ancestor = lineage["original_ancestor"] + assert original_ancestor is not None + assert original_ancestor[0] == str(env.initial_timeline) + assert original_ancestor[1] == str(branchpoint_x) + + # this does not contain Z in the end, so fromisoformat accepts it + # it is to be in line with the deletion timestamp.. well, almost. 
+ when = original_ancestor[2][:26] + when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC) + now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC) + assert when_ts < now + assert len(lineage.get("reparenting_history", [])) == 0 + elif expected_ancestor == branch_to_detach: + assert len(lineage.get("original_ancestor", [])) == 0 + assert lineage["reparenting_history"] == [str(env.initial_timeline)] + else: + assert len(lineage.get("original_ancestor", [])) == 0 + assert len(lineage.get("reparenting_history", [])) == 0 + + for name, _, _, rows, starts in expected_result: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1 + + # delete the new timeline to confirm it doesn't carry over anything from the old timeline + client.timeline_delete(env.initial_tenant, branch_to_detach) + wait_timeline_detail_404(client, env.initial_tenant, branch_to_detach) + + # delete the after timeline + client.timeline_delete(env.initial_tenant, after) + wait_timeline_detail_404(client, env.initial_tenant, after) + + def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): """ Makes sure that the timeline is able to receive writes through-out the detach process. 
@@ -1217,8 +1352,10 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv ) +@pytest.mark.parametrize("detach_behavior", ["default", "v1", "v2"]) def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( neon_env_builder: NeonEnvBuilder, + detach_behavior: str, ): shard_count = 2 neon_env_builder.num_pageservers = shard_count @@ -1257,7 +1394,11 @@ def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( victim_http.configure_failpoints([(pausepoint, "pause"), (failpoint, "return")]) def detach_timeline(): - http.detach_ancestor(env.initial_tenant, detached_branch) + http.detach_ancestor( + env.initial_tenant, + detached_branch, + detach_behavior=detach_behavior if detach_behavior != "default" else None, + ) def paused_at_failpoint(): stuck.assert_log_contains(f"at failpoint {pausepoint}") diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 4865178ca8..b30c02e0e4 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -327,9 +327,9 @@ def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"pgbench run {i+1}/{PGBENCH_RUNS}") endpoint.safe_psql(f"create database {dbname}") connstr = endpoint.connstr(dbname=dbname) - # pgbench -i will automatically vacuum the tables. This creates the visibility map. - pg_bin.run(["pgbench", "-i", "-s", "10", connstr]) - # Freeze the tuples to set the initial frozen bit. + # Initialize the data set, but don't vacuum yet. + pg_bin.run(["pgbench", "-i", "-s", "8", "-n", connstr]) + # Vacuum to create the visibility map, and freeze the tuples to set the frozen bit. endpoint.safe_psql("vacuum freeze", dbname=dbname) # Run pgbench. 
pg_bin.run(["pgbench", "-c", "32", "-j", "8", "-T", "10", connstr]) @@ -354,19 +354,3 @@ def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): row = cur.fetchone() assert row is not None assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)" - - # Vacuum and freeze the tables, and check that the visibility map is still accurate. - for dbname in dbnames: - log.info(f"Vacuuming and checking visibility map for {dbname}") - with endpoint.cursor(dbname=dbname) as cur: - cur.execute("vacuum freeze") - - cur.execute("select count(*) from pg_check_visible('pgbench_accounts')") - row = cur.fetchone() - assert row is not None - assert row[0] == 0, f"{row[0]} inconsistent VM pages (visible)" - - cur.execute("select count(*) from pg_check_frozen('pgbench_accounts')") - row = cur.fetchone() - assert row is not None - assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)" diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index c5045fe4a4..55e38b29a2 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -27,7 +27,6 @@ from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( Endpoint, NeonEnvBuilder, - NeonPageserver, PgBin, PgProtocol, Safekeeper, @@ -38,8 +37,6 @@ from fixtures.pageserver.utils import ( assert_prefix_empty, assert_prefix_not_empty, timeline_delete_wait_completed, - wait_for_last_record_lsn, - wait_for_upload, ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor @@ -55,9 +52,16 @@ from fixtures.safekeeper.http import ( TimelineCreateRequest, ) from fixtures.safekeeper.utils import wait_walreceivers_absent +from fixtures.safekeeper_utils import ( + is_flush_lsn_caught_up, + is_segment_offloaded, + is_wal_trimmed, + wait_lsn_force_checkpoint, + wait_lsn_force_checkpoint_at, + wait_lsn_force_checkpoint_at_sk, +) from fixtures.utils import ( PropagatingThread, - 
get_dir_size, query_scalar, run_only_on_default_postgres, skip_in_debug_build, @@ -69,68 +73,6 @@ if TYPE_CHECKING: from typing import Any, Self -def wait_lsn_force_checkpoint( - tenant_id: TenantId, - timeline_id: TimelineId, - endpoint: Endpoint, - ps: NeonPageserver, - pageserver_conn_options=None, -): - pageserver_conn_options = pageserver_conn_options or {} - lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") - - wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) - - -def wait_lsn_force_checkpoint_at_sk( - safekeeper: Safekeeper, - tenant_id: TenantId, - timeline_id: TimelineId, - ps: NeonPageserver, - pageserver_conn_options=None, -): - sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) - wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) - - -def wait_lsn_force_checkpoint_at( - lsn: Lsn, - tenant_id: TenantId, - timeline_id: TimelineId, - ps: NeonPageserver, - pageserver_conn_options=None, -): - """ - Wait until pageserver receives given lsn, force checkpoint and wait for - upload, i.e. remote_consistent_lsn advancement. 
- """ - pageserver_conn_options = pageserver_conn_options or {} - - auth_token = None - if "password" in pageserver_conn_options: - auth_token = pageserver_conn_options["password"] - - # wait for the pageserver to catch up - wait_for_last_record_lsn( - ps.http_client(auth_token=auth_token), - tenant_id, - timeline_id, - lsn, - ) - - # force checkpoint to advance remote_consistent_lsn - ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) - - # ensure that remote_consistent_lsn is advanced - wait_for_upload( - ps.http_client(auth_token=auth_token), - tenant_id, - timeline_id, - lsn, - ) - - @dataclass class TimelineMetrics: timeline_id: TimelineId @@ -475,31 +417,6 @@ def wait(f, desc, timeout=30, wait_f=None): wait_f() -def is_segment_offloaded( - sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn -): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"sk status is {tli_status}") - return tli_status.backup_lsn >= seg_end - - -def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"sk status is {tli_status}") - return tli_status.flush_lsn >= lsn - - -def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) - sk_wal_size_mb = sk_wal_size / 1024 / 1024 - log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") - return sk_wal_size_mb <= target_size_mb - - def test_wal_backup(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 remote_storage_kind = s3_storage() @@ -811,60 +728,6 @@ class ProposerPostgres(PgProtocol): self.pg_bin.run(args) -# insert wal in all safekeepers and 
run sync on proposer -def test_sync_safekeepers( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - port_distributor: PortDistributor, -): - # We don't really need the full environment for this test, just the - # safekeepers would be enough. - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - tenant_id = TenantId.generate() - timeline_id = TimelineId.generate() - - # write config for proposer - pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") - pg = ProposerPostgres( - pgdata_dir, pg_bin, tenant_id, timeline_id, "127.0.0.1", port_distributor.get_port() - ) - pg.create_dir_config(env.get_safekeeper_connstrs()) - - # valid lsn, which is not in the segment start, nor in zero segment - epoch_start_lsn = Lsn("0/16B9188") - begin_lsn = epoch_start_lsn - - # append and commit WAL - lsn_after_append = [] - for i in range(3): - res = env.safekeepers[i].append_logical_message( - tenant_id, - timeline_id, - { - "lm_prefix": "prefix", - "lm_message": "message", - "set_commit_lsn": True, - "send_proposer_elected": True, - "term": 2, - "begin_lsn": int(begin_lsn), - "epoch_start_lsn": int(epoch_start_lsn), - "truncate_lsn": int(epoch_start_lsn), - "pg_version": int(env.pg_version) * 10000, - }, - ) - lsn = Lsn(res["inserted_wal"]["end_lsn"]) - lsn_after_append.append(lsn) - log.info(f"safekeeper[{i}] lsn after append: {lsn}") - - # run sync safekeepers - lsn_after_sync = pg.sync_safekeepers() - log.info(f"lsn after sync = {lsn_after_sync}") - - assert all(lsn_after_sync == lsn for lsn in lsn_after_append) - - @pytest.mark.parametrize("auth_enabled", [False, True]) def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled @@ -1564,6 +1427,7 @@ class SafekeeperEnv: pg=self.port_distributor.get_port(), pg_tenant_only=self.port_distributor.get_port(), http=self.port_distributor.get_port(), + https=None, ) safekeeper_dir = self.repo_dir / f"sk{i}" @@ -1739,214 +1603,6 @@ def 
test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) -@pytest.mark.parametrize("auth_enabled", [False, True]) -def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): - neon_env_builder.auth_enabled = auth_enabled - env = neon_env_builder.init_start() - - # FIXME: are these expected? - env.pageserver.allowed_errors.extend( - [ - ".*Timeline .* was not found in global map.*", - ".*Timeline .* was cancelled and cannot be used anymore.*", - ] - ) - - # Create two tenants: one will be deleted, other should be preserved. - tenant_id = env.initial_tenant - timeline_id_1 = env.create_branch("br1") # Active, delete explicitly - timeline_id_2 = env.create_branch("br2") # Inactive, delete explicitly - timeline_id_3 = env.create_branch("br3") # Active, delete with the tenant - timeline_id_4 = env.create_branch("br4") # Inactive, delete with the tenant - - tenant_id_other, timeline_id_other = env.create_tenant() - - # Populate branches - endpoint_1 = env.endpoints.create_start("br1") - endpoint_2 = env.endpoints.create_start("br2") - endpoint_3 = env.endpoints.create_start("br3") - endpoint_4 = env.endpoints.create_start("br4") - endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other) - for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE t(key int primary key)") - sk = env.safekeepers[0] - sk_data_dir = sk.data_dir - if not auth_enabled: - sk_http = sk.http_client() - sk_http_other = sk_http - else: - sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) - sk_http_other = sk.http_client( - auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) - ) - sk_http_noauth = sk.http_client(gen_sk_wide_token=False) - assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() - assert (sk_data_dir / 
str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. - endpoint_2.stop_and_destroy() - endpoint_4.stop_and_destroy() - sk.stop() - sk.start() - - # Ensure connections to Safekeeper are established - for endpoint in [endpoint_1, endpoint_3, endpoint_other]: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("INSERT INTO t (key) VALUES (1)") - - # Stop all computes gracefully before safekeepers stop responding to them - endpoint_1.stop_and_destroy() - endpoint_3.stop_and_destroy() - - # Remove initial tenant's br1 (active) - assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Ensure repeated deletion succeeds - assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - if auth_enabled: - # Ensure we cannot delete the other tenant - for sk_h in [sk_http, sk_http_noauth]: - with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): - assert 
sk_h.timeline_delete(tenant_id_other, timeline_id_other) - with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): - assert sk_h.tenant_delete_force(tenant_id_other) - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant's br2 (inactive) - assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove non-existing branch, should succeed - assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant fully (two branches are active) - response = sk_http.tenant_delete_force(tenant_id) - assert response[str(timeline_id_3)]["dir_existed"] - assert not (sk_data_dir / str(tenant_id)).exists() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant again. 
- response = sk_http.tenant_delete_force(tenant_id) - # assert response == {} - assert not (sk_data_dir / str(tenant_id)).exists() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Ensure the other tenant still works - sk_http_other.timeline_status(tenant_id_other, timeline_id_other) - with closing(endpoint_other.connect()) as conn: - with conn.cursor() as cur: - cur.execute("INSERT INTO t (key) VALUES (123)") - - -def test_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): - """ - Test deleting timelines on a safekeeper while they're under load. - - This should not happen under normal operation, but it can happen if - there is some rogue compute/pageserver that is writing/reading to a - safekeeper that we're migrating a timeline away from, or if the timeline - is being deleted while such a rogue client is running. - """ - neon_env_builder.auth_enabled = True - env = neon_env_builder.init_start() - - # Create two endpoints that will generate load - timeline_id_a = env.create_branch("deleteme_a") - timeline_id_b = env.create_branch("deleteme_b") - - endpoint_a = env.endpoints.create("deleteme_a") - endpoint_a.start() - endpoint_b = env.endpoints.create("deleteme_b") - endpoint_b.start() - - # Get tenant and timeline IDs - tenant_id = env.initial_tenant - - # Start generating load on both timelines - def generate_load(endpoint: Endpoint): - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") - while True: - try: - cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") - except: # noqa - # Ignore errors since timeline may be deleted - break - - t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) - t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) - try: - t_a.start() - t_b.start() - - # Let the load run for a bit - log.info("Warming up...") - time.sleep(2) - - # Safekeeper errors will propagate to 
the pageserver: it is correct that these are - # logged at error severity because they indicate the pageserver is trying to read - # a timeline that it shouldn't. - env.pageserver.allowed_errors.extend( - [ - ".*Timeline.*was cancelled.*", - ".*Timeline.*was not found.*", - ] - ) - - # Try deleting timelines while under load - sk = env.safekeepers[0] - sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) - - # Delete first timeline - log.info(f"Deleting {timeline_id_a}...") - assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] - - # Delete second timeline - log.info(f"Deleting {timeline_id_b}...") - assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] - - # Verify timelines are gone from disk - sk_data_dir = sk.data_dir - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() - # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() - - finally: - log.info("Stopping endpoints...") - # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang - endpoint_a.stop(mode="immediate") - endpoint_b.stop(mode="immediate") - log.info("Joining threads...") - t_a.join() - t_b.join() - - # Basic pull_timeline test. # When live_sk_change is False, compute is restarted to change set of # safekeepers; otherwise it is live reload. @@ -2269,13 +1925,21 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() + # These are expected after timeline deletion on safekeepers. 
+ env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found in global map.*", + ".*Timeline .* was cancelled and cannot be used anymore.*", + ] + ) + tenant_id = env.initial_tenant timeline_id = env.initial_timeline sk = env.safekeepers[0] http_cli = sk.http_client() - sk_id_1 = SafekeeperId(env.safekeepers[0].id, "localhost", sk.port.pg_tenant_only) + sk_id_1 = SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock # Request to switch before timeline creation should fail. @@ -2303,19 +1967,76 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): log.info(f"conf after restart: {after_restart}") assert after_restart.generation == 4 - # Switch into disjoint conf. - non_joint = Configuration(generation=5, members=[sk_id_2], new_members=None) + # Switch into non joint conf of which sk is not a member, must fail. + non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None) + with pytest.raises(requests.exceptions.HTTPError): + resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member) + + # Switch into good non joint conf. + non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None) resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint) log.info(f"non joint switch resp: {resp}") assert resp.previous_conf.generation == 4 - assert resp.current_conf.generation == 5 + assert resp.current_conf.generation == 6 - # Switch request to lower conf should be ignored. - lower_conf = Configuration(generation=3, members=[], new_members=None) - resp = http_cli.membership_switch(tenant_id, timeline_id, lower_conf) - log.info(f"lower switch resp: {resp}") - assert resp.previous_conf.generation == 5 - assert resp.current_conf.generation == 5 + # Switch request to lower conf should be rejected. 
+ lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None) + with pytest.raises(requests.exceptions.HTTPError): + http_cli.membership_switch(tenant_id, timeline_id, lower_conf) + + # Now, exclude sk from the membership, timeline should be deleted. + excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None) + http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf) + with pytest.raises(requests.exceptions.HTTPError): + http_cli.timeline_status(tenant_id, timeline_id) + + +def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): + """ + Test that having neon.safekeepers starting with g#n: with non zero n enables + generations, which as a side effect disables automatic timeline creation. + + This is kind of bootstrapping test: here membership conf & timeline is + created manually, later storcon will do that. + """ + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps = env.pageservers[0] + ps_http_cli = ps.http_client() + + http_clis = [sk.http_client() for sk in env.safekeepers] + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + ep = env.endpoints.create("main", config_lines=config_lines) + + # expected to fail because timeline is not created on safekeepers + with pytest.raises(Exception, match=r".*timed out.*"): + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s") + # figure out initial LSN. 
+ ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id) + init_lsn = ps_timeline_detail["last_record_lsn"] + log.info(f"initial LSN: {init_lsn}") + # sk timeline creation request expects minor version + pg_version = ps_timeline_detail["pg_version"] * 10000 + # create inital mconf + sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers] + mconf = Configuration(generation=1, members=sk_ids, new_members=None) + create_r = TimelineCreateRequest( + tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None + ) + log.info(f"sending timeline create: {create_r.to_json()}") + + for sk_http_cli in http_clis: + sk_http_cli.timeline_create(create_r) + # Once timeline created endpoint should start. + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 936c774657..56539a0a08 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -539,13 +539,16 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): asyncio.run(run_recovery_uncommitted(env)) -async def run_wal_truncation(env: NeonEnv): +async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int): tenant_id = env.initial_tenant timeline_id = env.initial_timeline (sk1, sk2, sk3) = env.safekeepers - ep = env.endpoints.create_start("main") + config_lines = [ + f"neon.safekeeper_proto_version = {safekeeper_proto_version}", + ] + ep = env.endpoints.create_start("main", config_lines=config_lines) ep.safe_psql("create table t (key int, value text)") ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") @@ -572,6 +575,7 @@ async def run_wal_truncation(env: NeonEnv): sk2.start() ep = 
env.endpoints.create_start( "main", + config_lines=config_lines, ) ep.safe_psql("insert into t select generate_series(1, 200), 'payload'") @@ -590,11 +594,13 @@ async def run_wal_truncation(env: NeonEnv): # Simple deterministic test creating tail of WAL on safekeeper which is # truncated when majority without this sk elects walproposer starting earlier. -def test_wal_truncation(neon_env_builder: NeonEnvBuilder): +# Test both proto versions until we fully migrate. +@pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) +def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_version: int): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - asyncio.run(run_wal_truncation(env)) + asyncio.run(run_wal_truncation(env, safekeeper_proto_version)) async def run_segment_init_failure(env: NeonEnv): diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 6254ab9b44..7b7592e740 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 6254ab9b4496c3e481bc037ae69d859bbc2bdd7d +Subproject commit 7b7592e74059f795b64f06860cea97673418f35e diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 6ff5044377..ee794ba767 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 6ff50443773b69749e16da6db9d4f4b19064b4b7 +Subproject commit ee794ba767eef9b10260ef67d3a58084f1dabd6f diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 261ed10e9b..512856aaa8 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 261ed10e9b8c8dda01ad7aefb18e944e30aa161d +Subproject commit 512856aaa8bedbaa8f06811449518dcb0c2e5d8f diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 59b2fe851f..e5e87b9f52 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 59b2fe851f8e0595f6c830b90ee766f4f1c17a0f +Subproject commit e5e87b9f52d0eaeb83f3e2517bb9727aac37729b diff --git a/vendor/revisions.json 
b/vendor/revisions.json index f85cec3a0b..1d76e1da01 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.4", - "59b2fe851f8e0595f6c830b90ee766f4f1c17a0f" + "e5e87b9f52d0eaeb83f3e2517bb9727aac37729b" ], "v16": [ "16.8", - "261ed10e9b8c8dda01ad7aefb18e944e30aa161d" + "512856aaa8bedbaa8f06811449518dcb0c2e5d8f" ], "v15": [ "15.12", - "6ff50443773b69749e16da6db9d4f4b19064b4b7" + "ee794ba767eef9b10260ef67d3a58084f1dabd6f" ], "v14": [ "14.17", - "6254ab9b4496c3e481bc037ae69d859bbc2bdd7d" + "7b7592e74059f795b64f06860cea97673418f35e" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 1b7c376560..6a726f0585 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -16,20 +16,26 @@ license.workspace = true ### BEGIN HAKARI SECTION [dependencies] ahash = { version = "0.8" } +anstream = { version = "0.6" } anyhow = { version = "1", features = ["backtrace"] } base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } -base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] } +base64-647d43efb71741da = { package = "base64", version = "0.21" } base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } camino = { version = "1", default-features = false, features = ["serde1"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "env", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] } +const-oid = { version = "0.9", default-features = false, features = ["db", "std"] } crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } -der = { version = "0.7", default-features = false, features = ["oid", "pem", "std"] } +der = { version = "0.7", default-features = false, features 
= ["derive", "flagset", "oid", "pem", "std"] } deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } digest = { version = "0.10", features = ["mac", "oid", "std"] } +ecdsa = { version = "0.16", features = ["pem", "signing", "std", "verifying"] } either = { version = "1" } +elliptic-curve = { version = "0.13", default-features = false, features = ["digest", "hazmat", "jwk", "pem", "std"] } +env_filter = { version = "0.1", default-features = false, features = ["regex"] } +env_logger = { version = "0.11" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } form_urlencoded = { version = "1" } futures-channel = { version = "0.3", features = ["sink"] } @@ -62,6 +68,7 @@ num-iter = { version = "0.1", default-features = false, features = ["i128", "std num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } +p256 = { version = "0.13", features = ["jwk"] } parquet = { version = "53", default-features = false, features = ["zstd"] } prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } @@ -71,6 +78,7 @@ regex-syntax = { version = "0.8" } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "rustls-tls-native-roots", "stream"] } rustls = { version = "0.23", default-features = false, features = ["logging", "ring", "std", "tls12"] } scopeguard = { version = "1" } +sec1 = { version = "0.7", features = ["pem", "serde", "std", "subtle"] } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["alloc", "raw_value"] } sha2 = { version = "0.10", features = ["asm", "oid"] }