diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md deleted file mode 100644 index 44b3094c24..0000000000 --- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md +++ /dev/null @@ -1,21 +0,0 @@ -## Release 202Y-MM-DD - -**NB: this PR must be merged only by 'Create a merge commit'!** - -### Checklist when preparing for release -- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b) -- [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers? -- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan? - - - -### Checklist after release -- [ ] Make sure instructions from PRs included in this release and labeled `manual_release_instructions` are executed (either by you or by people who wrote them). -- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files)) -- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel -- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true) -- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some) -- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1) -- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time) - - diff --git a/.github/scripts/generate_image_maps.py b/.github/scripts/generate_image_maps.py index 39ece5b38f..d8f910271b 100644 --- a/.github/scripts/generate_image_maps.py +++ 
b/.github/scripts/generate_image_maps.py @@ -1,14 +1,16 @@ import itertools import json import os +import sys -build_tag = os.environ["BUILD_TAG"] -branch = os.environ["BRANCH"] -dev_acr = os.environ["DEV_ACR"] -prod_acr = os.environ["PROD_ACR"] -dev_aws = os.environ["DEV_AWS"] -prod_aws = os.environ["PROD_AWS"] -aws_region = os.environ["AWS_REGION"] +source_tag = os.getenv("SOURCE_TAG") +target_tag = os.getenv("TARGET_TAG") +branch = os.getenv("BRANCH") +dev_acr = os.getenv("DEV_ACR") +prod_acr = os.getenv("PROD_ACR") +dev_aws = os.getenv("DEV_AWS") +prod_aws = os.getenv("PROD_AWS") +aws_region = os.getenv("AWS_REGION") components = { "neon": ["neon"], @@ -39,24 +41,23 @@ registries = { outputs: dict[str, dict[str, list[str]]] = {} -target_tags = [build_tag, "latest"] if branch == "main" else [build_tag] -target_stages = ["dev", "prod"] if branch.startswith("release") else ["dev"] +target_tags = [target_tag, "latest"] if branch == "main" else [target_tag] +target_stages = ( + ["dev", "prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"] +) for component_name, component_images in components.items(): for stage in target_stages: - outputs[f"{component_name}-{stage}"] = dict( - [ - ( - f"docker.io/neondatabase/{component_image}:{build_tag}", - [ - f"{combo[0]}/{component_image}:{combo[1]}" - for combo in itertools.product(registries[stage], target_tags) - ], - ) - for component_image in component_images + outputs[f"{component_name}-{stage}"] = { + f"ghcr.io/neondatabase/{component_image}:{source_tag}": [ + f"{registry}/{component_image}:{tag}" + for registry, tag in itertools.product(registries[stage], target_tags) + if not (registry == "ghcr.io/neondatabase" and tag == source_tag) ] - ) + for component_image in component_images + } -with open(os.environ["GITHUB_OUTPUT"], "a") as f: +with open(os.getenv("GITHUB_OUTPUT", "/dev/null"), "a") as f: for key, value in outputs.items(): f.write(f"{key}={json.dumps(value)}\n") + print(f"Image map 
for {key}:\n{json.dumps(value, indent=2)}\n\n", file=sys.stderr) diff --git a/.github/scripts/lint-release-pr.sh b/.github/scripts/lint-release-pr.sh new file mode 100755 index 0000000000..6dc5b99f0e --- /dev/null +++ b/.github/scripts/lint-release-pr.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash + +set -euo pipefail + +DOCS_URL="https://docs.neon.build/overview/repositories/neon.html" + +message() { + if [[ -n "${GITHUB_PR_NUMBER:-}" ]]; then + gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --edit-last --body "$1" \ + || gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --body "$1" + fi + echo "$1" +} + +report_error() { + message "❌ $1 + For more details, see the documentation: ${DOCS_URL}" + + exit 1 +} + +case "$RELEASE_BRANCH" in + "release") COMPONENT="Storage" ;; + "release-proxy") COMPONENT="Proxy" ;; + "release-compute") COMPONENT="Compute" ;; + *) + report_error "Unknown release branch: ${RELEASE_BRANCH}" + ;; +esac + + +# Identify main and release branches +MAIN_BRANCH="origin/main" +REMOTE_RELEASE_BRANCH="origin/${RELEASE_BRANCH}" + +# Find merge base +MERGE_BASE=$(git merge-base "${MAIN_BRANCH}" "${REMOTE_RELEASE_BRANCH}") +echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}" + +# Get the HEAD commit (last commit in PR, expected to be the merge commit) +LAST_COMMIT=$(git rev-parse HEAD) + +MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}") +EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$" + +if ! 
[[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then + report_error "Merge commit message does not match expected pattern: '<component> release YYYY-MM-DD' + Expected component: ${COMPONENT} + Found: '${MERGE_COMMIT_MESSAGE}'" +fi +echo "✅ Merge commit message is correctly formatted: '${MERGE_COMMIT_MESSAGE}'" + +LAST_COMMIT_PARENTS=$(git cat-file -p "${LAST_COMMIT}" | jq -sR '[capture("parent (?<parent>[0-9a-f]{40})"; "g") | .parent]') + +if [[ "$(echo "${LAST_COMMIT_PARENTS}" | jq 'length')" -ne 2 ]]; then + report_error "Last commit must be a merge commit with exactly two parents" +fi + +EXPECTED_RELEASE_HEAD=$(git rev-parse "${REMOTE_RELEASE_BRANCH}") +if echo "${LAST_COMMIT_PARENTS}" | jq -e --arg rel "${EXPECTED_RELEASE_HEAD}" 'index($rel) != null' > /dev/null; then + LINEAR_HEAD=$(echo "${LAST_COMMIT_PARENTS}" | jq -r '[.[] | select(. != $rel)][0]' --arg rel "${EXPECTED_RELEASE_HEAD}") +else + report_error "Last commit must merge the release branch (${RELEASE_BRANCH})" +fi +echo "✅ Last commit correctly merges the previous commit and the release branch" +echo "Top commit of linear history: ${LINEAR_HEAD}" + +MERGE_COMMIT_TREE=$(git rev-parse "${LAST_COMMIT}^{tree}") +LINEAR_HEAD_TREE=$(git rev-parse "${LINEAR_HEAD}^{tree}") + +if [[ "${MERGE_COMMIT_TREE}" != "${LINEAR_HEAD_TREE}" ]]; then + report_error "Tree of merge commit (${MERGE_COMMIT_TREE}) does not match tree of linear history head (${LINEAR_HEAD_TREE}) + This indicates that the merge of ${RELEASE_BRANCH} into this branch was not performed using the merge strategy 'ours'" +fi +echo "✅ Merge commit tree matches the linear history head" + +EXPECTED_PREVIOUS_COMMIT="${LINEAR_HEAD}" + +# Now traverse down the history, ensuring each commit has exactly one parent +CURRENT_COMMIT="${EXPECTED_PREVIOUS_COMMIT}" +while [[ "${CURRENT_COMMIT}" != "${MERGE_BASE}" && "${CURRENT_COMMIT}" != "${EXPECTED_RELEASE_HEAD}" ]]; do + CURRENT_COMMIT_PARENTS=$(git cat-file -p "${CURRENT_COMMIT}" | jq -sR '[capture("parent 
(?<parent>[0-9a-f]{40})"; "g") | .parent]') + + if [[ "$(echo "${CURRENT_COMMIT_PARENTS}" | jq 'length')" -ne 1 ]]; then + report_error "Commit ${CURRENT_COMMIT} must have exactly one parent" + fi + + NEXT_COMMIT=$(echo "${CURRENT_COMMIT_PARENTS}" | jq -r '.[0]') + + if [[ "${NEXT_COMMIT}" == "${MERGE_BASE}" ]]; then + echo "✅ Reached merge base (${MERGE_BASE})" + PR_BASE="${MERGE_BASE}" + elif [[ "${NEXT_COMMIT}" == "${EXPECTED_RELEASE_HEAD}" ]]; then + echo "✅ Reached release branch (${EXPECTED_RELEASE_HEAD})" + PR_BASE="${EXPECTED_RELEASE_HEAD}" + elif [[ -z "${NEXT_COMMIT}" ]]; then + report_error "Unexpected end of commit history before reaching merge base" + fi + + # Move to the next commit in the chain + CURRENT_COMMIT="${NEXT_COMMIT}" +done + +echo "✅ All commits are properly ordered and linear" +echo "✅ Release PR structure is valid" + +echo + +message "Commits that are part of this release: +$(git log --oneline "${PR_BASE}..${LINEAR_HEAD}")" diff --git a/.github/scripts/previous-releases.jq b/.github/scripts/previous-releases.jq index b0b00bce18..51204da099 100644 --- a/.github/scripts/previous-releases.jq +++ b/.github/scripts/previous-releases.jq @@ -17,6 +17,12 @@ ({}; .[$entry.component] |= (if . == null or $entry.version > .version then $entry else . end)) +# Ensure that each component exists, or fail +| (["storage", "compute", "proxy"] - (keys)) as $missing +| if ($missing | length) > 0 then + "Error: Found no release for \($missing | join(", "))!\n" | halt_error(1) + else . 
end + # Convert the resulting object into an array of formatted strings | to_entries | map("\(.key)=\(.value.full)") diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index 71aef1430e..0703e2c4d6 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -27,10 +27,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 6a2070424a..b950187fe1 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -39,15 +39,15 @@ env: jobs: build-neon: - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} permissions: id-token: write # aws-actions/configure-aws-credentials contents: read container: image: ${{ inputs.build-tools-image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} # Raise locked memory limit for tokio-epoll-uring. # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), # io_uring will account the memory of the CQ and SQ as locked. 
@@ -318,12 +318,12 @@ jobs: contents: read statuses: write needs: [ build-neon ] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} container: image: ${{ inputs.build-tools-image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: diff --git a/.github/workflows/_check-codestyle-python.yml b/.github/workflows/_check-codestyle-python.yml index 9ae28a1379..868ac15f3c 100644 --- a/.github/workflows/_check-codestyle-python.yml +++ b/.github/workflows/_check-codestyle-python.yml @@ -15,11 +15,15 @@ defaults: jobs: check-codestyle-python: runs-on: [ self-hosted, small ] + + permissions: + packages: read + container: image: ${{ inputs.build-tools-image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index c4c76914aa..83eeb83e45 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -23,14 +23,17 @@ jobs: check-codestyle-rust: strategy: matrix: - arch: ${{ fromJson(inputs.archs) }} - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + arch: ${{ fromJSON(inputs.archs) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + + permissions: + packages: read container: image: ${{ 
inputs.build-tools-image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml index 3c130c8229..9b1d1aa454 100644 --- a/.github/workflows/_create-release-pr.yml +++ b/.github/workflows/_create-release-pr.yml @@ -7,8 +7,8 @@ on: description: 'Component name' required: true type: string - release-branch: - description: 'Release branch' + source-branch: + description: 'Source branch' required: true type: string secrets: @@ -30,17 +30,25 @@ jobs: steps: - uses: actions/checkout@v4 with: - ref: main + ref: ${{ inputs.source-branch }} + fetch-depth: 0 - name: Set variables id: vars env: COMPONENT_NAME: ${{ inputs.component-name }} - RELEASE_BRANCH: ${{ inputs.release-branch }} + RELEASE_BRANCH: >- + ${{ + false + || inputs.component-name == 'Storage' && 'release' + || inputs.component-name == 'Proxy' && 'release-proxy' + || inputs.component-name == 'Compute' && 'release-compute' + }} run: | today=$(date +'%Y-%m-%d') echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT} echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT} + echo "release-branch=${RELEASE_BRANCH}" | tee -a ${GITHUB_OUTPUT} - name: Configure git run: | @@ -49,31 +57,36 @@ jobs: - name: Create RC branch env: + RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} TITLE: ${{ steps.vars.outputs.title }} run: | - git checkout -b "${RC_BRANCH}" + git switch -c "${RC_BRANCH}" - # create an empty commit to distinguish workflow runs - # from other possible releases from the same commit - git commit --allow-empty -m "${TITLE}" + # Manually create a merge commit on the current branch, keeping the + # tree and setting the parents to the current HEAD and the HEAD of the + # 
release branch. This commit is what we'll fast-forward the release + # branch to when merging the release branch. + # For details on why, look at + # https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs + current_tree=$(git rev-parse 'HEAD^{tree}') + release_head=$(git rev-parse "origin/${RELEASE_BRANCH}") + current_head=$(git rev-parse HEAD) + merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}") + + # Fast-forward the current branch to the newly created merge_commit + git merge --ff-only ${merge_commit} git push origin "${RC_BRANCH}" - - name: Create a PR into ${{ inputs.release-branch }} + - name: Create a PR into ${{ steps.vars.outputs.release-branch }} env: GH_TOKEN: ${{ secrets.ci-access-token }} RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} - RELEASE_BRANCH: ${{ inputs.release-branch }} + RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} TITLE: ${{ steps.vars.outputs.title }} run: | - cat << EOF > body.md - ## ${TITLE} - - **Please merge this Pull Request using 'Create a merge commit' button** - EOF - gh pr create --title "${TITLE}" \ - --body-file "body.md" \ + --body "" \ --head "${RC_BRANCH}" \ --base "${RELEASE_BRANCH}" diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml index a3fc125648..44802f0525 100644 --- a/.github/workflows/_meta.yml +++ b/.github/workflows/_meta.yml @@ -21,9 +21,16 @@ on: run-kind: description: "The kind of run we're currently in. Will be one of `push-main`, `storage-release`, `compute-release`, `proxy-release`, `storage-rc-pr`, `compute-rc-pr`, `proxy-rc-pr`, `pr`, or `workflow-dispatch`" value: ${{ jobs.tags.outputs.run-kind }} + release-pr-run-id: + description: "Only available if `run-kind in [storage-release, proxy-release, compute-release]`. Contains the run ID of the `Build and Test` workflow, assuming one with the current commit can be found." 
+ value: ${{ jobs.tags.outputs.release-pr-run-id }} permissions: {} +defaults: + run: + shell: bash -euo pipefail {0} + jobs: tags: runs-on: ubuntu-22.04 @@ -33,6 +40,7 @@ jobs: proxy: ${{ steps.previous-releases.outputs.proxy }} storage: ${{ steps.previous-releases.outputs.storage }} run-kind: ${{ steps.run-kind.outputs.run-kind }} + release-pr-run-id: ${{ steps.release-pr-run-id.outputs.release-pr-run-id }} permissions: contents: read steps: @@ -83,7 +91,11 @@ jobs: echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr) - BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') + BUILD_AND_TEST_RUN_ID=$(gh api --paginate \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=${CURRENT_SHA}&branch=${CURRENT_BRANCH}" \ + | jq '[.workflow_runs[] | select(.name == "Build and Test")][0].id // ("Error: No matching workflow run found." | halt_error(1))') echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT ;; workflow-dispatch) @@ -105,3 +117,13 @@ jobs: "/repos/${GITHUB_REPOSITORY}/releases" \ | jq -f .github/scripts/previous-releases.jq -r \ | tee -a "${GITHUB_OUTPUT}" + + - name: Get the release PR run ID + id: release-pr-run-id + if: ${{ contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + run: | + RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" 
| halt_error(1))') + echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index 2dab665f40..949eeca4b1 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -89,7 +89,7 @@ jobs: uses: docker/login-action@v3 with: registry: ghcr.io - username: ${{ github.repository_owner }} + username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Log in to Docker Hub diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index ff7db02e42..0cffb3787b 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -87,10 +87,10 @@ jobs: runs-on: ${{ matrix.RUNNER }} container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -190,10 +190,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -245,10 +245,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -352,7 +352,7 @@ jobs: 
region_id_default=${{ env.DEFAULT_REGION_ID }} runner_default='["self-hosted", "us-east-2", "x64"]' runner_azure='["self-hosted", "eastus2", "x64"]' - image_default="neondatabase/build-tools:pinned-bookworm" + image_default="ghcr.io/neondatabase/build-tools:pinned-bookworm" matrix='{ "pg_version" : [ 16 @@ -368,18 +368,18 @@ jobs: "db_size": [ "10gb" ], "runner": ['"$runner_default"'], "image": [ "'"$image_default"'" ], - "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", 
"platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "ghcr.io/neondatabase/build-tools:pinned-bookworm" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "ghcr.io/neondatabase/build-tools:pinned-bookworm" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": 
"ghcr.io/neondatabase/build-tools:pinned-bookworm" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then @@ -441,7 +441,7 @@ jobs: strategy: fail-fast: false - matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-matrix)}} + matrix: ${{fromJSON(needs.generate-matrices.outputs.pgbench-compare-matrix)}} env: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" @@ -457,8 +457,8 @@ jobs: container: image: ${{ matrix.image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init # Increase timeout to 8h, default timeout is 6h @@ -483,7 +483,7 @@ jobs: aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project - if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-new-many-tables", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) + if: contains(fromJSON('["neonvm-captest-new", 
"neonvm-captest-new-many-tables", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -523,7 +523,7 @@ jobs: # without (neonvm-captest-new) # and with (neonvm-captest-new-many-tables) many relations in the database - name: Create many relations before the run - if: contains(fromJson('["neonvm-captest-new-many-tables"]'), matrix.platform) + if: contains(fromJSON('["neonvm-captest-new-many-tables"]'), matrix.platform) uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} @@ -642,10 +642,10 @@ jobs: runs-on: ${{ matrix.RUNNER }} container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -753,7 +753,7 @@ jobs: strategy: fail-fast: false - matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} + matrix: ${{ fromJSON(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -767,10 +767,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init # Increase timeout to 12h, default timeout is 6h @@ -880,7 +880,7 @@ jobs: strategy: fail-fast: false - matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }} + matrix: ${{ fromJSON(needs.generate-matrices.outputs.tpch-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -892,10 
+892,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -999,7 +999,7 @@ jobs: strategy: fail-fast: false - matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} + matrix: ${{ fromJSON(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -1011,10 +1011,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 0a7f0cd7a0..f7c91e7412 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -19,7 +19,7 @@ on: value: ${{ jobs.check-image.outputs.tag }} image: description: "build-tools image" - value: neondatabase/build-tools:${{ jobs.check-image.outputs.tag }} + value: ghcr.io/neondatabase/build-tools:${{ jobs.check-image.outputs.tag }} defaults: run: @@ -49,9 +49,18 @@ jobs: everything: ${{ steps.set-more-variables.outputs.everything }} found: ${{ steps.set-more-variables.outputs.found }} + permissions: + packages: read + steps: - uses: actions/checkout@v4 + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Set variables id: set-variables env: @@ -70,12 +79,12 @@ jobs: env: IMAGE_TAG: ${{ 
steps.set-variables.outputs.image-tag }} EVERYTHING: | - ${{ contains(fromJson(steps.set-variables.outputs.archs), 'x64') && - contains(fromJson(steps.set-variables.outputs.archs), 'arm64') && - contains(fromJson(steps.set-variables.outputs.debians), 'bullseye') && - contains(fromJson(steps.set-variables.outputs.debians), 'bookworm') }} + ${{ contains(fromJSON(steps.set-variables.outputs.archs), 'x64') && + contains(fromJSON(steps.set-variables.outputs.archs), 'arm64') && + contains(fromJSON(steps.set-variables.outputs.debians), 'bullseye') && + contains(fromJSON(steps.set-variables.outputs.debians), 'bookworm') }} run: | - if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then + if docker manifest inspect ghcr.io/neondatabase/build-tools:${IMAGE_TAG}; then found=true else found=false @@ -90,10 +99,13 @@ jobs: strategy: matrix: - arch: ${{ fromJson(needs.check-image.outputs.archs) }} - debian: ${{ fromJson(needs.check-image.outputs.debians) }} + arch: ${{ fromJSON(needs.check-image.outputs.archs) }} + debian: ${{ fromJSON(needs.check-image.outputs.debians) }} - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + permissions: + packages: write + + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - uses: actions/checkout@v4 @@ -108,6 +120,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - uses: docker/login-action@v3 with: registry: cache.neon.build @@ -126,35 +144,44 @@ jobs: cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian, matrix.arch) || 
'' }} tags: | - neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian }}-${{ matrix.arch }} + ghcr.io/neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian }}-${{ matrix.arch }} merge-images: needs: [ check-image, build-image ] runs-on: ubuntu-22.04 + permissions: + packages: write + steps: - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Create multi-arch image env: DEFAULT_DEBIAN_VERSION: bookworm - ARCHS: ${{ join(fromJson(needs.check-image.outputs.archs), ' ') }} - DEBIANS: ${{ join(fromJson(needs.check-image.outputs.debians), ' ') }} + ARCHS: ${{ join(fromJSON(needs.check-image.outputs.archs), ' ') }} + DEBIANS: ${{ join(fromJSON(needs.check-image.outputs.debians), ' ') }} EVERYTHING: ${{ needs.check-image.outputs.everything }} IMAGE_TAG: ${{ needs.check-image.outputs.tag }} run: | for debian in ${DEBIANS}; do - tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian}") + tags=("-t" "ghcr.io/neondatabase/build-tools:${IMAGE_TAG}-${debian}") if [ "${EVERYTHING}" == "true" ] && [ "${debian}" == "${DEFAULT_DEBIAN_VERSION}" ]; then - tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}") + tags+=("-t" "ghcr.io/neondatabase/build-tools:${IMAGE_TAG}") fi for arch in ${ARCHS}; do - tags+=("neondatabase/build-tools:${IMAGE_TAG}-${debian}-${arch}") + tags+=("ghcr.io/neondatabase/build-tools:${IMAGE_TAG}-${debian}-${arch}") done docker buildx imagetools create "${tags[@]}" diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 347a511e98..b24a872152 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -40,7 +40,7 @@ jobs: runs-on: macos-15 strategy: matrix: - postgres-version: ${{ inputs.rebuild_everything && 
fromJson('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }} + postgres-version: ${{ inputs.rebuild_everything && fromJSON('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }} env: # Use release build only, to have less debug info around # Hence keeping target/ (and general cache size) smaller diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1c0971a49d..bc88da316a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -77,20 +77,23 @@ jobs: secrets: inherit check-codestyle-python: - needs: [ check-permissions, build-build-tools-image ] + needs: [ meta, check-permissions, build-build-tools-image ] + # No need to run on `main` because we do this in the merge queue + if: ${{ needs.meta.outputs.run-kind == 'pr' }} uses: ./.github/workflows/_check-codestyle-python.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm secrets: inherit check-codestyle-jsonnet: - needs: [ check-permissions, build-build-tools-image ] + needs: [ meta, check-permissions, build-build-tools-image ] + if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }} runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -156,7 +159,9 @@ jobs: pass_if_unchanged: true check-codestyle-rust: - needs: [ check-permissions, build-build-tools-image ] + needs: [ meta, check-permissions, build-build-tools-image ] + # No need to run on `main` because we do this in the merge queue + if: ${{ needs.meta.outputs.run-kind == 'pr' }} uses: ./.github/workflows/_check-codestyle-rust.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -164,8 +169,8 @@ jobs: 
secrets: inherit check-dependencies-rust: - needs: [ files-changed, build-build-tools-image ] - if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' }} + needs: [ meta, files-changed, build-build-tools-image ] + if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr' }} uses: ./.github/workflows/cargo-deny.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -173,12 +178,13 @@ jobs: build-and-test-locally: needs: [ meta, build-build-tools-image ] + if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }} strategy: fail-fast: false matrix: arch: [ x64, arm64 ] # Do not build or run tests in debug for release branches - build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }} + build-type: ${{ fromJSON((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }} include: - build-type: release arch: arm64 @@ -209,8 +215,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Checkout @@ -248,8 +254,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: @@ -314,8 +320,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ 
secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -367,8 +373,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init strategy: fail-fast: false @@ -470,14 +476,20 @@ jobs: }) trigger-e2e-tests: - # Depends on jobs that can get skipped + # !failure() && !cancelled() because it depends on jobs that can get skipped if: >- ${{ ( - !github.event.pull_request.draft - || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') - || contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) - ) && !failure() && !cancelled() + ( + needs.meta.outputs.run-kind == 'pr' + && ( + !github.event.pull_request.draft + || contains(github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') + ) + ) + || contains(fromJSON('["push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + ) + && !failure() && !cancelled() }} needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ] uses: ./.github/workflows/trigger-e2e-tests.yml @@ -487,12 +499,15 @@ jobs: neon-image-arch: needs: [ check-permissions, build-build-tools-image, meta ] - if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} strategy: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ 
fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + + permissions: + packages: write steps: - uses: actions/checkout@v4 @@ -509,6 +524,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - uses: docker/login-action@v3 with: registry: cache.neon.build @@ -533,37 +554,40 @@ jobs: cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | - neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-${{ matrix.arch }} + ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-${{ matrix.arch }} neon-image: needs: [ neon-image-arch, meta ] - if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read + packages: write steps: - uses: docker/login-action@v3 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} - name: Create multi-arch image run: | - docker buildx imagetools create -t neondatabase/neon:${{ needs.meta.outputs.build-tag }} \ - -t neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm \ - neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-x64 \ - neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64 + docker buildx 
imagetools create -t ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }} \ + -t ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm \ + ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-x64 \ + ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64 compute-node-image-arch: needs: [ check-permissions, build-build-tools-image, meta ] - if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read + packages: write strategy: fail-fast: false matrix: @@ -582,7 +606,7 @@ jobs: debian: bookworm arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - uses: actions/checkout@v4 @@ -604,6 +628,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - uses: docker/login-action@v3 with: registry: cache.neon.build @@ -627,7 +657,7 @@ jobs: cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} + ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ 
needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg >= 'v16' @@ -647,15 +677,16 @@ jobs: target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.meta.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} + ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.meta.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: needs: [ compute-node-image-arch, meta ] - if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read + packages: write runs-on: ubuntu-22.04 strategy: @@ -674,28 +705,32 @@ jobs: steps: - uses: docker/login-action@v3 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ 
needs.meta.outputs.build-tag }} \ + -t ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image if: matrix.version.pg >= 'v16' run: | - docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -t ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 vm-compute-node-image-arch: needs: [ check-permissions, meta, compute-node-image ] - if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), 
needs.meta.outputs.run-kind) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + permissions: + contents: read + packages: write strategy: fail-fast: false matrix: @@ -723,31 +758,34 @@ jobs: - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/login-action@v3 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} + docker pull ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ - -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \ + -src=ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -dst=ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \ -target-arch=linux/${{ matrix.arch }} - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} + docker push ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} vm-compute-node-image: needs: [ vm-compute-node-image-arch, meta ] - if: ${{ 
contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + permissions: + packages: write runs-on: ubuntu-22.04 strategy: matrix: @@ -760,38 +798,54 @@ jobs: steps: - uses: docker/login-action@v3 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \ - neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64 + docker buildx imagetools create -t ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \ + ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64 test-images: needs: [ check-permissions, meta, neon-image, compute-node-image ] # Depends on jobs that can get skipped - if: "!failure() && !cancelled()" + if: >- + ${{ + !failure() + && !cancelled() + && contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + }} strategy: fail-fast: false matrix: arch: [ x64, arm64 ] pg_version: [v16, v17] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + permissions: + packages: read + + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - uses: actions/checkout@v4 - uses: 
neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 + - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # `ghcr.io/neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like # Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: [] @@ -800,9 +854,9 @@ jobs: # Ensure that we don't have bad versions. - name: Verify image versions shell: bash # ensure no set -e for better error messages - if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} run: | - pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + pageserver_version=$(docker run --rm ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" @@ -821,19 +875,19 @@ jobs: env: TAG: >- ${{ - contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release || needs.meta.outputs.build-tag }} COMPUTE_TAG: >- ${{ - 
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) && needs.meta.outputs.previous-compute-release || needs.meta.outputs.build-tag }} TEST_EXTENSIONS_TAG: >- ${{ - contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) && 'latest' || needs.meta.outputs.build-tag }} @@ -885,7 +939,13 @@ jobs: id: generate run: python3 .github/scripts/generate_image_maps.py env: - BUILD_TAG: "${{ needs.meta.outputs.build-tag }}" + SOURCE_TAG: >- + ${{ + contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.release-pr-run-id + || needs.meta.outputs.build-tag + }} + TARGET_TAG: ${{ needs.meta.outputs.build-tag }} BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" @@ -895,7 +955,7 @@ jobs: push-neon-image-dev: needs: [ meta, generate-image-maps, neon-image ] - if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login @@ -913,7 +973,7 @@ jobs: push-compute-image-dev: needs: [ meta, generate-image-maps, vm-compute-node-image ] - if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "compute-release", 
"compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login @@ -967,16 +1027,64 @@ jobs: acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} secrets: inherit - # This is a bit of a special case so we're not using a generated image map. - add-latest-tag-to-neon-extensions-test-image: - if: github.ref_name == 'main' + push-neon-test-extensions-image-dockerhub: + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} needs: [ meta, compute-node-image ] uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + packages: write + id-token: write with: image-map: | { - "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], - "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] + "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}" + ], + "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [ + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}" + ] + } + secrets: inherit + + add-latest-tag-to-neon-test-extensions-image: + if: ${{ needs.meta.outputs.run-kind == 'push-main' }} + needs: [ meta, compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + packages: write + id-token: write + with: + image-map: | + { + "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ + "docker.io/neondatabase/neon-test-extensions-v16:latest", + "ghcr.io/neondatabase/neon-test-extensions-v16:latest" + ], + "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag 
}}": [ + "docker.io/neondatabase/neon-test-extensions-v17:latest", + "ghcr.io/neondatabase/neon-test-extensions-v17:latest" + ] + } + secrets: inherit + + add-release-tag-to-neon-test-extensions-image: + if: ${{ needs.meta.outputs.run-kind == 'compute-release' }} + needs: [ meta ] + uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + packages: write + id-token: write + with: + image-map: | + { + "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.release-pr-run-id }}": [ + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}", + "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}" + ], + "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.release-pr-run-id }}": [ + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}", + "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}" + ] } secrets: inherit @@ -1235,7 +1343,7 @@ jobs: # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: - needs: [ deploy ] + needs: [ meta, deploy ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -1245,37 +1353,6 @@ jobs: runs-on: ubuntu-22.04 steps: - - name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR - id: fetch-last-release-pr-info - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - branch_name_and_pr_number=$(gh pr list \ - --repo "${GITHUB_REPOSITORY}" \ - --base release \ - --state merged \ - --limit 10 \ - --json mergeCommit,headRefName,number \ - --jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }") - branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name') - pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number') - - run_id=$(gh run list \ - 
--repo "${GITHUB_REPOSITORY}" \ - --workflow build_and_test.yml \ - --branch "${branch_name}" \ - --json databaseId \ - --limit 1 \ - --jq '.[].databaseId') - - last_commit_sha=$(gh pr view "${pr_number}" \ - --repo "${GITHUB_REPOSITORY}" \ - --json commits \ - --jq '.commits[-1].oid') - - echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT} - echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT} - - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 @@ -1286,8 +1363,8 @@ jobs: env: BUCKET: neon-github-public-dev AWS_REGION: eu-central-1 - COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }} - RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }} + COMMIT_SHA: ${{ github.sha }} + RUN_ID: ${{ needs.meta.outputs.release-pr-run-id }} run: | old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}" new_prefix="artifacts/latest" @@ -1369,12 +1446,12 @@ jobs: if: | contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') - || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true') - || needs.build-and-test-locally.result == 'skipped' - || needs.check-codestyle-python.result == 'skipped' - || needs.check-codestyle-rust.result == 'skipped' + || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr') + || (needs.build-and-test-locally.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') + || (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') + || (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') || needs.files-changed.result == 'skipped' || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.push-neon-image-dev.result == 'skipped' && 
contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)) - || needs.test-images.result == 'skipped' + || (needs.test-images.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index e40b02b5d2..389b59c1a5 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -94,8 +94,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/cargo-deny.yml b/.github/workflows/cargo-deny.yml index 222f7e9787..a4f476c99a 100644 --- a/.github/workflows/cargo-deny.yml +++ b/.github/workflows/cargo-deny.yml @@ -24,11 +24,14 @@ jobs: runs-on: [self-hosted, small] + permissions: + packages: read + container: - image: ${{ inputs.build-tools-image || 'neondatabase/build-tools:pinned' }} + image: ${{ inputs.build-tools-image || 'ghcr.io/neondatabase/build-tools:pinned' }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 606e1c0862..566629e15c 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ 
-37,10 +37,10 @@ jobs: runs-on: us-east-2 container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/fast-forward.yml b/.github/workflows/fast-forward.yml new file mode 100644 index 0000000000..bc63ff120d --- /dev/null +++ b/.github/workflows/fast-forward.yml @@ -0,0 +1,36 @@ +name: Fast forward merge +on: + pull_request: + types: [labeled] + branches: + - release + - release-proxy + - release-compute + +jobs: + fast-forward: + if: ${{ github.event.label.name == 'fast-forward' }} + runs-on: ubuntu-22.04 + + steps: + - name: Remove fast-forward label to PR + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + gh pr edit ${{ github.event.pull_request.number }} --repo "${GITHUB_REPOSITORY}" --remove-label "fast-forward" + + - name: Fast forwarding + uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979 + # See https://docs.github.com/en/graphql/reference/enums#mergestatestatus + if: ${{ github.event.pull_request.mergeable_state == 'clean' }} + with: + merge: true + comment: on-error + github_token: ${{ secrets.CI_ACCESS_TOKEN }} + + - name: Comment if mergeable_state is not clean + if: ${{ github.event.pull_request.mergeable_state != 'clean' }} + run: | + gh pr comment ${{ github.event.pull_request.number }} \ + --repo "${GITHUB_REPOSITORY}" \ + --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`." 
diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index c20c5890f9..37ee371311 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -67,10 +67,10 @@ jobs: PGCOPYDB_LIB_PATH: /pgcopydb/lib runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init timeout-minutes: 1440 diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml index f33e11cd08..fea21877f8 100644 --- a/.github/workflows/large_oltp_benchmark.yml +++ b/.github/workflows/large_oltp_benchmark.yml @@ -2,8 +2,8 @@ name: large oltp benchmark on: # uncomment to run on push for debugging your PR - push: - branches: [ bodobolero/synthetic_oltp_workload ] + #push: + # branches: [ bodobolero/synthetic_oltp_workload ] schedule: # * is a special character in YAML so you have to quote this string @@ -12,7 +12,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks + - cron: '0 15 * * 0,2,4' # run on Sunday, Tuesday, Thursday at 3 PM UTC workflow_dispatch: # adds ability to run this manually defaults: @@ -22,7 +22,7 @@ defaults: concurrency: # Allow only one workflow globally because we need dedicated resources which only exist once group: large-oltp-bench-workflow - cancel-in-progress: true + cancel-in-progress: false jobs: oltp: @@ -31,9 +31,9 @@ jobs: matrix: include: - target: new_branch - custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 
select_recent_webhook.sql@4 + custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 - target: reuse_branch - custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 + custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: contents: write @@ -46,19 +46,20 @@ jobs: PG_VERSION: 16 # pre-determined by pre-determined project TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }} PLATFORM: ${{ matrix.target }} runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init - # Increase timeout to 8h, default timeout is 6h - timeout-minutes: 480 + # Increase timeout to 2 days, default timeout is 6h - database maintenance can take a long time + # (normally 1h pgbench, 3h vacuum analyze 3.5h re-index) x 2 = 15h, leave some buffer for regressions + # in one run vacuum didn't finish within 12 hours + timeout-minutes: 2880 steps: - uses: actions/checkout@v4 @@ -89,29 +90,45 @@ jobs: - name: Set up Connection String id: set-up-connstr run: | - case "${{ matrix.target }}" in - new_branch) - CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }} - ;; - reuse_branch) - CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }} - ;; - *) - echo >&2 "Unknown target=${{ matrix.target }}" - exit 1 - ;; - esac + case "${{ matrix.target }}" in + new_branch) + CONNSTR=${{ 
steps.create-neon-branch-oltp-target.outputs.dsn }} + ;; + reuse_branch) + CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }} + ;; + *) + echo >&2 "Unknown target=${{ matrix.target }}" + exit 1 + ;; + esac - echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}" - - name: Benchmark pgbench with custom-scripts + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT + + - name: Delete rows from prior runs in reuse branch + if: ${{ matrix.target == 'reuse_branch' }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }} + PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config + PSQL: /tmp/neon/pg_install/v16/bin/psql + PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib + run: | + echo "$(date '+%Y-%m-%d %H:%M:%S') - Deleting rows in table webhook.incoming_webhooks from prior runs" + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + ${PSQL} "${BENCHMARK_CONNSTR}" -c "SET statement_timeout = 0; DELETE FROM webhook.incoming_webhooks WHERE created_at > '2025-02-27 23:59:59+00';" + echo "$(date '+%Y-%m-%d %H:%M:%S') - Finished deleting rows in table webhook.incoming_webhooks from prior runs" + + - name: Benchmark pgbench with custom-scripts uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false - save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant + save_perf_report: true + extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_pgbench pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: @@ -119,6 +136,21 @@ jobs: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + - name: Benchmark database maintenance + uses: ./.github/actions/run-python-test-set + with: + 
build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + extra_params: -m remote_cluster --timeout 172800 -k test_perf_oltp_large_tenant_maintenance + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + - name: Delete Neon Branch for large tenant if: ${{ always() && matrix.target == 'new_branch' }} uses: ./.github/actions/neon-branch-delete @@ -127,6 +159,13 @@ jobs: branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} + - name: Configure AWS credentials # again because prior steps could have exceeded 5 hours + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} diff --git a/.github/workflows/lint-release-pr.yml b/.github/workflows/lint-release-pr.yml new file mode 100644 index 0000000000..b7d010f66d --- /dev/null +++ b/.github/workflows/lint-release-pr.yml @@ -0,0 +1,24 @@ +name: Lint Release PR + +on: + pull_request: + branches: + - release + - release-proxy + - release-compute + +jobs: + lint-release-pr: + runs-on: ubuntu-22.04 + steps: + - name: Checkout PR branch + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch full history for git operations + ref: ${{ github.event.pull_request.head.ref }} + + - name: Run lint script + env: + RELEASE_BRANCH: ${{ github.base_ref }} + run: | + ./.github/scripts/lint-release-pr.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 90318747b3..558aba1e2e 100644 --- 
a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -71,8 +71,8 @@ jobs: uses: ./.github/workflows/build-macos.yml with: pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} - rebuild_rust_code: ${{ fromJson(needs.files-changed.outputs.rebuild_rust_code) }} - rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }} + rebuild_rust_code: ${{ fromJSON(needs.files-changed.outputs.rebuild_rust_code) }} + rebuild_everything: ${{ fromJSON(needs.files-changed.outputs.rebuild_everything) }} gather-rust-build-stats: needs: [ check-permissions, build-build-tools-image, files-changed ] @@ -90,8 +90,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init env: diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index f854bf3212..433b969b0c 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -34,10 +34,10 @@ jobs: pull-requests: write runs-on: [ self-hosted, small ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init timeout-minutes: 360 # Set the timeout to 6 hours env: diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index abc90c7fe1..cb5ae556d8 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -53,8 +53,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ 
secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init --user root services: clickhouse: @@ -153,8 +153,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init --user root steps: diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index d2588ba0bf..ddeefe0128 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -46,8 +46,8 @@ jobs: FROM_TAG: ${{ inputs.from-tag }} TO_TAG: pinned run: | - docker manifest inspect "docker.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json" - docker manifest inspect "docker.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json" + docker manifest inspect "ghcr.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json" + docker manifest inspect "ghcr.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json" if diff "${FROM_TAG}.json" "${TO_TAG}.json"; then skip=true @@ -71,13 +71,13 @@ jobs: with: image-map: | { - "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ + "ghcr.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ "docker.io/neondatabase/build-tools:pinned-bullseye", "ghcr.io/neondatabase/build-tools:pinned-bullseye", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" ], - "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ + "ghcr.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ "docker.io/neondatabase/build-tools:pinned-bookworm", 
"docker.io/neondatabase/build-tools:pinned", "ghcr.io/neondatabase/build-tools:pinned-bookworm", diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index c47b3fe0de..bbe4638235 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -8,8 +8,6 @@ on: - .github/workflows/build-build-tools-image.yml - .github/workflows/pre-merge-checks.yml merge_group: - branches: - - main defaults: run: @@ -19,15 +17,19 @@ defaults: permissions: {} jobs: - get-changed-files: + meta: runs-on: ubuntu-22.04 + permissions: + contents: read outputs: python-changed: ${{ steps.python-src.outputs.any_changed }} rust-changed: ${{ steps.rust-src.outputs.any_changed }} + branch: ${{ steps.group-metadata.outputs.branch }} + pr-number: ${{ steps.group-metadata.outputs.pr-number }} steps: - uses: actions/checkout@v4 - - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 id: python-src with: files: | @@ -38,7 +40,7 @@ jobs: poetry.lock pyproject.toml - - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 id: rust-src with: files: | @@ -58,12 +60,23 @@ jobs: echo "${PYTHON_CHANGED_FILES}" echo "${RUST_CHANGED_FILES}" + - name: Merge group metadata + if: ${{ github.event_name == 'merge_group' }} + id: group-metadata + env: + MERGE_QUEUE_REF: ${{ github.event.merge_group.head_ref }} + run: | + echo $MERGE_QUEUE_REF | jq -Rr 'capture("refs/heads/gh-readonly-queue/(?<branch>.*)/pr-(?<pr_number>[0-9]+)-[0-9a-f]{40}") | ["branch=" + .branch, "pr-number=" + .pr_number] | .[]' | tee -a "${GITHUB_OUTPUT}" + build-build-tools-image: if: | false - || needs.get-changed-files.outputs.python-changed == 'true' - || needs.get-changed-files.outputs.rust-changed == 'true' - needs: [ get-changed-files ] + ||
needs.meta.outputs.python-changed == 'true' + || needs.meta.outputs.rust-changed == 'true' + needs: [ meta ] + permissions: + contents: read + packages: write uses: ./.github/workflows/build-build-tools-image.yml with: # Build only one combination to save time @@ -72,8 +85,11 @@ jobs: secrets: inherit check-codestyle-python: - if: needs.get-changed-files.outputs.python-changed == 'true' - needs: [ get-changed-files, build-build-tools-image ] + if: needs.meta.outputs.python-changed == 'true' + needs: [ meta, build-build-tools-image ] + permissions: + contents: read + packages: read uses: ./.github/workflows/_check-codestyle-python.yml with: # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` @@ -81,8 +97,11 @@ jobs: secrets: inherit check-codestyle-rust: - if: needs.get-changed-files.outputs.rust-changed == 'true' - needs: [ get-changed-files, build-build-tools-image ] + if: needs.meta.outputs.rust-changed == 'true' + needs: [ meta, build-build-tools-image ] + permissions: + contents: read + packages: read uses: ./.github/workflows/_check-codestyle-rust.yml with: # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` @@ -101,7 +120,7 @@ jobs: statuses: write # for `github.repos.createCommitStatus(...)` contents: write needs: - - get-changed-files + - meta - check-codestyle-python - check-codestyle-rust runs-on: ubuntu-22.04 @@ -129,7 +148,20 @@ jobs: run: exit 1 if: | false - || (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') - || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') + || (github.event_name == 'merge_group' && needs.meta.outputs.branch != 'main') + || (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.python-changed == 'true') + || (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.rust-changed == 'true') || contains(needs.*.result, 
'failure') || contains(needs.*.result, 'cancelled') + + - name: Add fast-forward label to PR to trigger fast-forward merge + if: >- + ${{ + always() + && github.event_name == 'merge_group' + && contains(fromJSON('["release", "release-proxy", "release-compute"]'), needs.meta.outputs.branch) + }} + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: >- + gh pr edit ${{ needs.meta.outputs.pr-number }} --repo "${GITHUB_REPOSITORY}" --add-label "fast-forward" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 919846ce44..a88ddecd0a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -38,7 +38,7 @@ jobs: uses: ./.github/workflows/_create-release-pr.yml with: component-name: 'Storage' - release-branch: 'release' + source-branch: ${{ github.ref_name }} secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} @@ -51,7 +51,7 @@ jobs: uses: ./.github/workflows/_create-release-pr.yml with: component-name: 'Proxy' - release-branch: 'release-proxy' + source-branch: ${{ github.ref_name }} secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} @@ -64,6 +64,6 @@ jobs: uses: ./.github/workflows/_create-release-pr.yml with: component-name: 'Compute' - release-branch: 'release-compute' + source-branch: ${{ github.ref_name }} secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/Cargo.lock b/Cargo.lock index 778ff19fec..a8e400524e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -167,45 +167,6 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" -[[package]] -name = "asn1-rs" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048" -dependencies = [ - "asn1-rs-derive", - "asn1-rs-impl", - "displaydoc", - "nom", - "num-traits", - "rusticata-macros", - "thiserror 1.0.69", - "time", -] - -[[package]] 
-name = "asn1-rs-derive" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.90", - "synstructure", -] - -[[package]] -name = "asn1-rs-impl" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.90", -] - [[package]] name = "assert-json-diff" version = "2.0.2" @@ -272,7 +233,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -283,7 +244,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1021,7 +982,7 @@ dependencies = [ "regex", "rustc-hash 2.1.1", "shlex", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1248,7 +1209,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1309,6 +1270,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", + "indexmap 2.0.1", "jsonwebtoken", "regex", "remote_storage", @@ -1339,6 +1301,7 @@ dependencies = [ "flate2", "futures", "http 1.1.0", + "indexmap 2.0.1", "jsonwebtoken", "metrics", "nix 0.27.1", @@ -1347,17 +1310,20 @@ dependencies = [ "once_cell", "opentelemetry", "opentelemetry_sdk", + "p256 0.13.2", "postgres", "postgres_initdb", "regex", "remote_storage", "reqwest", + "ring", "rlimit", "rust-ini", "serde", "serde_json", "serde_with", "signal-hook", + "spki 0.7.3", "tar", "thiserror 1.0.69", "tokio", @@ -1377,6 +1343,7 @@ dependencies = [ "vm_monitor", "walkdir", "workspace_hack", + "x509-cert", "zstd", ] @@ -1703,7 +1670,7 @@ checksum = 
"f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1727,7 +1694,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.10.0", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1738,7 +1705,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1801,22 +1768,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" dependencies = [ "const-oid", + "der_derive", + "flagset", "pem-rfc7468", "zeroize", ] [[package]] -name = "der-parser" -version = "9.0.0" +name = "der_derive" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553" +checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18" dependencies = [ - "asn1-rs", - "displaydoc", - "nom", - "num-bigint", - "num-traits", - "rusticata-macros", + "proc-macro2", + "quote", + "syn 2.0.100", ] [[package]] @@ -1888,7 +1854,7 @@ dependencies = [ "dsl_auto_type", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1908,7 +1874,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1937,7 +1903,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1960,7 +1926,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2105,7 +2071,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] 
[[package]] @@ -2115,28 +2081,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" dependencies = [ "log", -] - -[[package]] -name = "env_logger" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" -dependencies = [ - "humantime", - "is-terminal", - "log", "regex", - "termcolor", ] [[package]] name = "env_logger" -version = "0.11.2" +version = "0.11.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" +checksum = "c3716d7a920fb4fac5d84e9d4bce8ceb321e9414b4409da61b07b75c1e3d0697" dependencies = [ + "anstream", + "anstyle", "env_filter", + "jiff", "log", ] @@ -2157,7 +2114,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2291,6 +2248,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "flagset" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ea1ec5f8307826a5b71094dd91fc04d4ae75d5709b20ad351c7fb4815c86ec" + [[package]] name = "flate2" version = "1.0.26" @@ -2417,7 +2380,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2530,7 +2493,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2847,6 +2810,7 @@ version = "0.1.0" dependencies = [ "anyhow", "bytes", + "camino", "fail", "futures", "hyper 0.14.30", @@ -2857,6 +2821,7 
@@ dependencies = [ "pprof", "regex", "routerify", + "rustls-pemfile 2.1.1", "serde", "serde_json", "serde_path_to_error", @@ -2886,9 +2851,9 @@ checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" [[package]] name = "humantime" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" [[package]] name = "humantime-serde" @@ -3148,7 +3113,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -3241,7 +3206,7 @@ dependencies = [ "crossbeam-channel", "crossbeam-utils", "dashmap 6.1.0", - "env_logger 0.11.2", + "env_logger", "indexmap 2.0.1", "itoa", "log", @@ -3254,11 +3219,11 @@ dependencies = [ [[package]] name = "inotify" -version = "0.11.0" +version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" dependencies = [ - "bitflags 2.8.0", + "bitflags 1.3.2", "inotify-sys", "libc", ] @@ -3364,6 +3329,30 @@ dependencies = [ "tracing", ] +[[package]] +name = "jiff" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d699bc6dfc879fb1bf9bdff0d4c56f0884fc6f0d0eb0fba397a6d00cd9a6b85e" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde", +] + +[[package]] +name = "jiff-static" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d16e75759ee0aa64c57a56acbf43916987b20c77373cb7e808979e02b93c9f9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "jobserver" version = "0.1.32" @@ 
-3535,9 +3524,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.20" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" [[package]] name = "lru" @@ -3618,7 +3607,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -3732,6 +3721,18 @@ dependencies = [ "adler2", ] +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "log", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.48.0", +] + [[package]] name = "mio" version = "1.0.3" @@ -3739,7 +3740,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", - "log", "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] @@ -3817,29 +3817,23 @@ checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" [[package]] name = "notify" -version = "8.0.0" +version = "6.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fee8403b3d66ac7b26aee6e40a897d85dc5ce26f44da36b8b73e987cc52e943" +checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" dependencies = [ "bitflags 2.8.0", + "crossbeam-channel", "filetime", "fsevent-sys", "inotify", "kqueue", "libc", "log", - "mio", - "notify-types", + "mio 0.8.11", "walkdir", - "windows-sys 0.59.0", + "windows-sys 0.48.0", ] -[[package]] -name = "notify-types" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d" - [[package]] name = "ntapi" version = "0.4.1" @@ -3997,15 
+3991,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "oid-registry" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9" -dependencies = [ - "asn1-rs", -] - [[package]] name = "once_cell" version = "1.20.2" @@ -4282,8 +4267,6 @@ dependencies = [ "reqwest", "rpds", "rustls 0.23.18", - "rustls-pemfile 2.1.1", - "rustls-pki-types", "scopeguard", "send-future", "serde", @@ -4308,6 +4291,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "tracing-utils", "url", "utils", "uuid", @@ -4485,7 +4469,7 @@ dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -4587,7 +4571,7 @@ checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -4683,6 +4667,15 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "postgres" version = "0.19.7" @@ -4790,7 +4783,7 @@ dependencies = [ "bytes", "crc32c", "criterion", - "env_logger 0.10.2", + "env_logger", "log", "memoffset 0.9.0", "once_cell", @@ -4889,7 +4882,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -4903,9 +4896,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.92" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" dependencies = [ "unicode-ident", ] @@ -4980,7 +4973,7 @@ checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", "heck", - "itertools 0.10.5", + "itertools 0.12.1", "log", "multimap", "once_cell", @@ -4989,7 +4982,7 @@ dependencies = [ "prost 0.12.6", "prost-types 0.12.6", "regex", - "syn 2.0.90", + "syn 2.0.100", "tempfile", ] @@ -5010,7 +5003,7 @@ dependencies = [ "prost 0.13.3", "prost-types 0.13.3", "regex", - "syn 2.0.90", + "syn 2.0.100", "tempfile", ] @@ -5021,10 +5014,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -5037,7 +5030,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -5084,7 +5077,7 @@ dependencies = [ "consumption_metrics", "ecdsa 0.16.9", "ed25519-dalek", - "env_logger 0.10.2", + "env_logger", "fallible-iterator", "flate2", "framed-websockets", @@ -5172,7 +5165,7 @@ dependencies = [ "uuid", "walkdir", "workspace_hack", - "x509-parser", + "x509-cert", "zerocopy", ] @@ -5221,9 +5214,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.37" +version = "1.0.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" dependencies = [ "proc-macro2", ] @@ -5752,7 +5745,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.90", + "syn 2.0.100", "unicode-ident", ] @@ -5793,15 +5786,6 @@ dependencies = [ "semver", ] -[[package]] -name = "rusticata-macros" 
-version = "4.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632" -dependencies = [ - "nom", -] - [[package]] name = "rustix" version = "0.38.41" @@ -5967,7 +5951,7 @@ dependencies = [ "crc32c", "criterion", "desim", - "env_logger 0.10.2", + "env_logger", "fail", "futures", "hex", @@ -5989,6 +5973,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", + "rustls 0.23.18", "safekeeper_api", "safekeeper_client", "scopeguard", @@ -6005,6 +5990,7 @@ dependencies = [ "tokio", "tokio-io-timeout", "tokio-postgres", + "tokio-rustls 0.26.0", "tokio-stream", "tokio-tar", "tokio-util", @@ -6298,7 +6284,7 @@ checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6380,7 +6366,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6395,9 +6381,9 @@ dependencies = [ [[package]] name = "sha1" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", "cpufeatures", @@ -6782,7 +6768,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6833,9 +6819,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.90" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", @@ -6865,7 +6851,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - 
"syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6916,15 +6902,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "termcolor" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" -dependencies = [ - "winapi-util", -] - [[package]] name = "test-context" version = "0.3.0" @@ -6943,7 +6920,7 @@ checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6972,7 +6949,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6983,7 +6960,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -7114,6 +7091,27 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tls_codec" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de2e01245e2bb89d6f05801c564fa27624dbd7b1846859876c7dad82e90bf6b" +dependencies = [ + "tls_codec_derive", + "zeroize", +] + +[[package]] +name = "tls_codec_derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2e76690929402faae40aebdda620a2c0e25dd6d3b9afe48867dfd95991f4bd" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "tokio" version = "1.43.0" @@ -7123,7 +7121,7 @@ dependencies = [ "backtrace", "bytes", "libc", - "mio", + "mio 1.0.3", "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", @@ -7166,7 +7164,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -7399,7 +7397,7 @@ dependencies = [ "prost-build 0.13.3", "prost-types 0.13.3", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -7514,7 +7512,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -7845,6 +7843,7 @@ dependencies = [ "tracing", "tracing-error", "tracing-subscriber", + "tracing-utils", "walkdir", ] @@ -7908,7 +7907,7 @@ dependencies = [ "anyhow", "camino-tempfile", "clap", - "env_logger 0.10.2", + "env_logger", "log", "postgres", "postgres_ffi", @@ -8013,7 +8012,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "wasm-bindgen-shared", ] @@ -8047,7 +8046,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -8354,6 +8353,7 @@ name = "workspace_hack" version = "0.1.0" dependencies = [ "ahash", + "anstream", "anyhow", "base64 0.13.1", "base64 0.21.7", @@ -8364,12 +8364,16 @@ dependencies = [ "chrono", "clap", "clap_builder", + "const-oid", "crypto-bigint 0.5.5", "der 0.7.8", "deranged", "digest", - "displaydoc", + "ecdsa 0.16.9", "either", + "elliptic-curve 0.13.8", + "env_filter", + "env_logger", "fail", "form_urlencoded", "futures-channel", @@ -8387,7 +8391,6 @@ dependencies = [ "hyper-util", "indexmap 1.9.3", "indexmap 2.0.1", - "itertools 0.10.5", "itertools 0.12.1", "lazy_static", "libc", @@ -8403,6 +8406,7 @@ dependencies = [ "num-rational", "num-traits", "once_cell", + "p256 0.13.2", "parquet", "prettyplease", "proc-macro2", @@ -8415,6 +8419,7 @@ dependencies = [ "reqwest", "rustls 0.23.18", "scopeguard", + "sec1 0.7.3", "serde", "serde_json", "sha2", @@ -8423,7 +8428,7 @@ dependencies = [ "spki 0.7.3", "stable_deref_trait", "subtle", - "syn 2.0.90", + 
"syn 2.0.100", "sync_wrapper 0.1.2", "tikv-jemalloc-ctl", "tikv-jemalloc-sys", @@ -8460,6 +8465,18 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" +[[package]] +name = "x509-cert" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1301e935010a701ae5f8655edc0ad17c44bad3ac5ce8c39185f75453b720ae94" +dependencies = [ + "const-oid", + "der 0.7.8", + "spki 0.7.3", + "tls_codec", +] + [[package]] name = "x509-certificate" version = "0.23.1" @@ -8479,23 +8496,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "x509-parser" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69" -dependencies = [ - "asn1-rs", - "data-encoding", - "der-parser", - "lazy_static", - "nom", - "oid-registry", - "rusticata-macros", - "thiserror 1.0.69", - "time", -] - [[package]] name = "xattr" version = "1.0.0" @@ -8540,7 +8540,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "synstructure", ] @@ -8562,7 +8562,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -8582,15 +8582,15 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "synstructure", ] [[package]] name = "zeroize" -version = "1.7.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" dependencies = [ "serde", "zeroize_derive", @@ -8604,7 +8604,7 @@ 
checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -8626,7 +8626,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index c59c4c5435..9bbc5a1a38 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -106,13 +106,13 @@ hostname = "0.4" http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } http-body-util = "0.1.2" -humantime = "2.1" +humantime = "2.2" humantime-serde = "1.1.1" hyper0 = { package = "hyper", version = "0.14" } hyper = "1.4" hyper-util = "0.1" tokio-tungstenite = "0.21.0" -indexmap = "2" +indexmap = { version = "2", features = ["serde"] } indoc = "2" ipnet = "2.10.0" itertools = "0.10" @@ -126,7 +126,9 @@ measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } -notify = "8.0.0" +# Do not update to >= 7.0.0, at least. The update will have a significant impact +# on compute startup metrics (start_postgres_ms), >= 25% degradation. 
+notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" @@ -213,13 +215,13 @@ urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" rustls-native-certs = "0.8" -x509-parser = "0.16" whoami = "1.5.1" zerocopy = { version = "0.7", features = ["derive"] } json-structural-diff = { version = "0.2.0" } +x509-cert = { version = "0.2.5" } ## TODO replace this with tracing -env_logger = "0.10" +env_logger = "0.11" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed diff --git a/Dockerfile b/Dockerfile index 83ad86badb..01540e1925 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ ### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters. ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in the real deployments. -ARG REPOSITORY=neondatabase +ARG REPOSITORY=ghcr.io/neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG DEFAULT_PG_VERSION=17 diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6e46185e36..bdc73ab174 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -77,7 +77,7 @@ # build_and_test.yml github workflow for how that's done. 
ARG PG_VERSION -ARG REPOSITORY=neondatabase +ARG REPOSITORY=ghcr.io/neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG @@ -1735,6 +1735,8 @@ RUN set -e \ libevent-dev \ libtool \ pkg-config \ + libcurl4-openssl-dev \ + libssl-dev \ && apt clean && rm -rf /var/lib/apt/lists/* # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) @@ -1743,7 +1745,7 @@ RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ && ./autogen.sh \ - && ./configure --prefix=/usr/local/pgbouncer --without-openssl \ + && ./configure --prefix=/usr/local/pgbouncer \ && make -j $(nproc) dist_man_MANS= \ && make install dist_man_MANS= diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index e6707381ac..f63aa88da2 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -39,6 +39,13 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + # Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also. + # We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to + # use a different path for the socket. The symlink actually points to our custom path. + - name: rsyslogd-socket-symlink + user: root + sysvInitAction: sysinit + shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log" - name: rsyslogd user: postgres sysvInitAction: respawn @@ -77,6 +84,9 @@ files: # compute_ctl will rewrite this file with the actual configuration, if needed. - filename: compute_rsyslog.conf content: | + # SysSock.Name specifies a non-default pipe location that is writeable for the postgres user. 
+ module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging + *.* /dev/null $IncludeConfig /etc/rsyslog.d/*.conf build: | @@ -145,7 +155,7 @@ merge: | COPY compute_rsyslog.conf /etc/compute_rsyslog.conf RUN chmod 0666 /etc/compute_rsyslog.conf - RUN chmod 0666 /var/log/ + RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index c89ee112dc..8b3c681228 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -39,6 +39,13 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + # Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also. + # We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to + # use a different path for the socket. The symlink actually points to our custom path. + - name: rsyslogd-socket-symlink + user: root + sysvInitAction: sysinit + shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log" - name: rsyslogd user: postgres sysvInitAction: respawn @@ -77,6 +84,9 @@ files: # compute_ctl will rewrite this file with the actual configuration, if needed. - filename: compute_rsyslog.conf content: | + # SysSock.Name specifies a non-default pipe location that is writeable for the postgres user. 
+ module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging + *.* /dev/null $IncludeConfig /etc/rsyslog.d/*.conf build: | @@ -140,7 +150,7 @@ merge: | COPY compute_rsyslog.conf /etc/compute_rsyslog.conf RUN chmod 0666 /etc/compute_rsyslog.conf - RUN chmod 0666 /var/log/ + RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index dd2896714d..d80ec41d34 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -26,6 +26,7 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +indexmap.workspace = true jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true @@ -34,16 +35,19 @@ num_cpus.workspace = true once_cell.workspace = true opentelemetry.workspace = true opentelemetry_sdk.workspace = true +p256 = { version = "0.13", features = ["pem"] } postgres.workspace = true regex.workspace = true +reqwest = { workspace = true, features = ["json"] } +ring = "0.17" serde.workspace = true serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true +spki = { version = "0.7.3", features = ["std"] } tar.workspace = true tower.workspace = true tower-http.workspace = true -reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true @@ -57,6 +61,7 @@ thiserror.workspace = true url.workspace = true uuid.workspace = true walkdir.workspace = true +x509-cert.workspace = true postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 354528e2cd..d31472b0c1 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -37,10 +37,14 @@ use crate::logger::startup_context_from_env; use 
crate::lsn_lease::launch_lsn_lease_bg_task_for_static; use crate::monitor::launch_monitor; use crate::pg_helpers::*; -use crate::rsyslog::configure_audit_rsyslog; +use crate::rsyslog::{ + PostgresLogsRsyslogConfig, configure_audit_rsyslog, configure_postgres_logs_export, + launch_pgaudit_gc, +}; use crate::spec::*; use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; +use crate::tls::watch_cert_for_changes; use crate::{config, extension_server, local_proxy}; pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); @@ -112,6 +116,7 @@ pub struct ComputeNode { // key: ext_archive_name, value: started download time, download_completed? pub ext_download_progress: RwLock, bool)>>, + pub compute_ctl_config: ComputeCtlConfig, } // store some metrics about download size that might impact startup time @@ -135,8 +140,6 @@ pub struct ComputeState { /// passed by the control plane with a /configure HTTP request. pub pspec: Option, - pub compute_ctl_config: ComputeCtlConfig, - /// If the spec is passed by a /configure request, 'startup_span' is the /// /configure request's tracing span. The main thread enters it when it /// processes the compute startup, so that the compute startup is considered @@ -160,7 +163,6 @@ impl ComputeState { last_active: None, error: None, pspec: None, - compute_ctl_config: ComputeCtlConfig::default(), startup_span: None, metrics: ComputeMetrics::default(), } @@ -314,7 +316,6 @@ impl ComputeNode { let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?; new_state.pspec = Some(pspec); } - new_state.compute_ctl_config = compute_ctl_config; Ok(ComputeNode { params, @@ -323,6 +324,7 @@ impl ComputeNode { state: Mutex::new(new_state), state_changed: Condvar::new(), ext_download_progress: RwLock::new(HashMap::new()), + compute_ctl_config, }) } @@ -345,7 +347,7 @@ impl ComputeNode { // requests while configuration is still in progress. 
crate::http::server::Server::External { port: this.params.external_http_port, - jwks: this.state.lock().unwrap().compute_ctl_config.jwks.clone(), + config: this.compute_ctl_config.clone(), compute_id: this.params.compute_id.clone(), } .launch(&this); @@ -524,6 +526,16 @@ impl ComputeNode { // Collect all the tasks that must finish here let mut pre_tasks = tokio::task::JoinSet::new(); + // Make sure TLS certificates are properly loaded and in the right place. + if self.compute_ctl_config.tls.is_some() { + let this = self.clone(); + pre_tasks.spawn(async move { + this.watch_cert_for_changes().await; + + Ok::<(), anyhow::Error>(()) + }); + } + // If there are any remote extensions in shared_preload_libraries, start downloading them if pspec.spec.remote_extensions.is_some() { let (this, spec) = (self.clone(), pspec.spec.clone()); @@ -579,11 +591,13 @@ impl ComputeNode { if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { info!("tuning pgbouncer"); + let pgbouncer_settings = pgbouncer_settings.clone(); + let tls_config = self.compute_ctl_config.tls.clone(); + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. 
- let pgbouncer_settings = pgbouncer_settings.clone(); let _handle = tokio::spawn(async move { - let res = tune_pgbouncer(pgbouncer_settings).await; + let res = tune_pgbouncer(pgbouncer_settings, tls_config).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); // Continue with the startup anyway @@ -606,7 +620,7 @@ impl ComputeNode { }); } - // Configure and start rsyslog if necessary + // Configure and start rsyslog for HIPAA if necessary if let ComputeAudit::Hipaa = pspec.spec.audit_log_level { let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); if remote_endpoint.is_empty() { @@ -614,13 +628,22 @@ impl ComputeNode { } let log_directory_path = Path::new(&self.params.pgdata).join("log"); - // TODO: make this more robust - // now rsyslog starts once and there is no monitoring or restart if it fails - configure_audit_rsyslog( - log_directory_path.to_str().unwrap(), - "hipaa", - &remote_endpoint, - )?; + let log_directory_path = log_directory_path.to_string_lossy().to_string(); + configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; + + // Launch a background task to clean up the audit logs + launch_pgaudit_gc(log_directory_path); + } + + // Configure and start rsyslog for Postgres logs export + if self.has_feature(ComputeFeature::PostgresLogsExport) { + if let Some(ref project_id) = pspec.spec.cluster.cluster_id { + let host = PostgresLogsRsyslogConfig::default_host(project_id); + let conf = PostgresLogsRsyslogConfig::new(Some(&host)); + configure_postgres_logs_export(conf)?; + } else { + warn!("not configuring rsyslog for Postgres logs export: project ID is missing") + } } // Launch remaining service threads @@ -645,9 +668,9 @@ impl ComputeNode { if pspec.spec.mode == ComputeMode::Primary { self.configure_as_primary(&compute_state)?; - let conf = self.get_conn_conf(None); - tokio::task::spawn_blocking(|| { - let res = get_installed_extensions(conf); + let conf = 
self.get_tokio_conn_conf(None); + tokio::task::spawn(async { + let res = get_installed_extensions(conf).await; match res { Ok(extensions) => { info!( @@ -1105,9 +1128,10 @@ impl ComputeNode { // Remove/create an empty pgdata directory and put configuration there. self.create_pgdata()?; config::write_postgres_conf( - &pgdata_path.join("postgresql.conf"), + pgdata_path, &pspec.spec, self.params.internal_http_port, + &self.compute_ctl_config.tls, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -1489,11 +1513,13 @@ impl ComputeNode { if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); + let pgbouncer_settings = pgbouncer_settings.clone(); + let tls_config = self.compute_ctl_config.tls.clone(); + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. - let pgbouncer_settings = pgbouncer_settings.clone(); tokio::spawn(async move { - let res = tune_pgbouncer(pgbouncer_settings).await; + let res = tune_pgbouncer(pgbouncer_settings, tls_config).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -1505,7 +1531,8 @@ impl ComputeNode { // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. 
- let local_proxy = local_proxy.clone(); + let mut local_proxy = local_proxy.clone(); + local_proxy.tls = self.compute_ctl_config.tls.clone(); tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); @@ -1515,8 +1542,12 @@ impl ComputeNode { // Write new config let pgdata_path = Path::new(&self.params.pgdata); - let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, self.params.internal_http_port)?; + config::write_postgres_conf( + pgdata_path, + &spec, + self.params.internal_http_port, + &self.compute_ctl_config.tls, + )?; if !spec.skip_pg_catalog_updates { let max_concurrent_connections = spec.reconfigure_concurrency; @@ -1587,6 +1618,56 @@ impl ComputeNode { Ok(()) } + pub async fn watch_cert_for_changes(self: Arc) { + // update status on cert renewal + if let Some(tls_config) = &self.compute_ctl_config.tls { + let tls_config = tls_config.clone(); + + // wait until the cert exists. + let mut cert_watch = watch_cert_for_changes(tls_config.cert_path.clone()).await; + + tokio::task::spawn_blocking(move || { + let handle = tokio::runtime::Handle::current(); + 'cert_update: loop { + // let postgres/pgbouncer/local_proxy know the new cert/key exists. + // we need to wait until it's configurable first. 
+ + let mut state = self.state.lock().unwrap(); + 'status_update: loop { + match state.status { + // let's update the state to config pending + ComputeStatus::ConfigurationPending | ComputeStatus::Running => { + state.set_status( + ComputeStatus::ConfigurationPending, + &self.state_changed, + ); + break 'status_update; + } + + // exit loop + ComputeStatus::Failed + | ComputeStatus::TerminationPending + | ComputeStatus::Terminated => break 'cert_update, + + // wait + ComputeStatus::Init + | ComputeStatus::Configuration + | ComputeStatus::Empty => { + state = self.state_changed.wait(state).unwrap(); + } + } + } + drop(state); + + // wait for a new certificate update + if handle.block_on(cert_watch.changed()).is_err() { + break; + } + } + }); + } + } + /// Update the `last_active` in the shared state, but ensure that it's a more recent one. pub fn update_last_active(&self, last_active: Option>) { let mut state = self.state.lock().unwrap(); diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 0760568ff8..290632e4cd 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,11 +6,13 @@ use std::io::Write; use std::io::prelude::*; use std::path::Path; -use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption}; +use compute_api::responses::TlsConfig; +use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption}; use crate::pg_helpers::{ GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value, }; +use crate::tls::{self, SERVER_CRT, SERVER_KEY}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. 
@@ -38,10 +40,12 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { /// Create or completely rewrite configuration file specified by `path` pub fn write_postgres_conf( - path: &Path, + pgdata_path: &Path, spec: &ComputeSpec, extension_server_port: u16, + tls_config: &Option, ) -> Result<()> { + let path = pgdata_path.join("postgresql.conf"); // File::create() destroys the file content if it exists. let mut file = File::create(path)?; @@ -86,6 +90,20 @@ pub fn write_postgres_conf( )?; } + // tls + if let Some(tls_config) = tls_config { + writeln!(file, "ssl = on")?; + + // postgres requires the keyfile to be in a secure file, + // currently too complicated to ensure that at the VM level, + // so we just copy them to another file instead. :shrug: + tls::update_key_path_blocking(pgdata_path, tls_config); + + // these are the default, but good to be explicit. + writeln!(file, "ssl_cert_file = '{}'", SERVER_CRT)?; + writeln!(file, "ssl_key_file = '{}'", SERVER_KEY)?; + } + // Locales if cfg!(target_os = "macos") { writeln!(file, "lc_messages='C'")?; @@ -149,7 +167,8 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl audit settings: begin")?; // This log level is very verbose // but this is necessary for HIPAA compliance. - writeln!(file, "pgaudit.log='all'")?; + // Exclude 'misc' category, because it doesn't contain anything relevant. + writeln!(file, "pgaudit.log='all, -misc'")?; writeln!(file, "pgaudit.log_parameter=on")?; // Disable logging of catalog queries // The catalog doesn't contain sensitive data, so we don't need to audit it. @@ -197,6 +216,12 @@ pub fn write_postgres_conf( writeln!(file, "neon.disable_logical_replication_subscribers=false")?; } + // We need Postgres to send logs to rsyslog so that we can forward them + // further to customers' log aggregation systems. 
+ if spec.features.contains(&ComputeFeature::PostgresLogsExport) { + writeln!(file, "log_destination='stderr,syslog'")?; + } + // This is essential to keep this line at the end of the file, // because it is intended to override any settings above. writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?; diff --git a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf index bef3c36446..9ca7e36738 100644 --- a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf +++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf @@ -4,7 +4,8 @@ module(load="imfile") # Input configuration for log files in the specified directory # Replace {log_directory} with the directory containing the log files input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0") -global(workDirectory="/var/log") +# the directory to store rsyslog state files +global(workDirectory="/var/log/rsyslog") # Forward logs to remote syslog server -*.* @@{remote_endpoint} \ No newline at end of file +*.* @@{remote_endpoint} diff --git a/compute_tools/src/config_template/compute_rsyslog_postgres_export_template.conf b/compute_tools/src/config_template/compute_rsyslog_postgres_export_template.conf new file mode 100644 index 0000000000..2580b61fea --- /dev/null +++ b/compute_tools/src/config_template/compute_rsyslog_postgres_export_template.conf @@ -0,0 +1,10 @@ +# Program name comes from postgres' syslog_ident configuration: https://www.postgresql.org/docs/current/runtime-config-logging.html#GUC-SYSLOG-IDENT +# Default value is 'postgres'. 
+if $programname == 'postgres' then {{ + # Forward Postgres logs to telemetry otel collector + action(type="omfwd" target="{logs_export_target}" port="{logs_export_port}" protocol="tcp" + template="RSYSLOG_SyslogProtocol23Format" + action.resumeRetryCount="3" + queue.type="linkedList" queue.size="1000") + stop +}} diff --git a/compute_tools/src/http/middleware/mod.rs b/compute_tools/src/http/middleware/mod.rs index caeeeedfe5..147d6d2c7d 100644 --- a/compute_tools/src/http/middleware/mod.rs +++ b/compute_tools/src/http/middleware/mod.rs @@ -1 +1,2 @@ pub(in crate::http) mod authorize; +pub(in crate::http) mod request_id; diff --git a/compute_tools/src/http/middleware/request_id.rs b/compute_tools/src/http/middleware/request_id.rs new file mode 100644 index 0000000000..e685b27d91 --- /dev/null +++ b/compute_tools/src/http/middleware/request_id.rs @@ -0,0 +1,16 @@ +use axum::{extract::Request, middleware::Next, response::Response}; +use uuid::Uuid; + +use crate::http::headers::X_REQUEST_ID; + +/// This middleware function allows compute_ctl to generate its own request ID +/// if one isn't supplied. The control plane will always send one as a UUID. The +/// neon Postgres extension on the other hand does not send one. 
+pub async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { + let headers = request.headers_mut(); + if !headers.contains_key(X_REQUEST_ID) { + headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); + } + + next.run(request).await +} diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index bbdb7d0917..7c8f72440f 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -306,6 +306,36 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /configure_telemetry: + post: + tags: + - Configure + summary: Configure rsyslog + description: | + This API endpoint configures rsyslog to forward Postgres logs + to a specified otel collector. + operationId: configureTelemetry + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + logs_export_host: + type: string + description: | + Hostname and the port of the otel collector. Leave empty to disable logs forwarding. 
+ Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526 + responses: + 204: + description: "Telemetry configured successfully" + 500: + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + components: securitySchemes: JWT: diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index 3c5a6a6d41..5c9dd22c3d 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -1,9 +1,11 @@ use std::sync::Arc; +use axum::body::Body; use axum::extract::State; use axum::response::Response; -use compute_api::requests::ConfigurationRequest; +use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest}; use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; +use compute_api::spec::ComputeFeature; use http::StatusCode; use tokio::task; use tracing::info; @@ -11,6 +13,7 @@ use tracing::info; use crate::compute::{ComputeNode, ParsedSpec}; use crate::http::JsonResponse; use crate::http::extract::Json; +use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export}; // Accept spec in JSON format and request compute configuration. 
If anything // goes wrong after we set the compute status to `ConfigurationPending` and @@ -92,3 +95,25 @@ pub(in crate::http) async fn configure( JsonResponse::success(StatusCode::OK, body) } + +pub(in crate::http) async fn configure_telemetry( + State(compute): State>, + request: Json, +) -> Response { + if !compute.has_feature(ComputeFeature::PostgresLogsExport) { + return JsonResponse::error( + StatusCode::PRECONDITION_FAILED, + "Postgres logs export feature is not enabled".to_string(), + ); + } + + let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref()); + if let Err(err) = configure_postgres_logs_export(conf) { + return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string()); + } + + Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::from("")) + .unwrap() +} diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 126fa86d1c..179369e3ef 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -5,20 +5,19 @@ use std::time::Duration; use anyhow::Result; use axum::Router; -use axum::extract::Request; -use axum::middleware::{self, Next}; -use axum::response::{IntoResponse, Response}; +use axum::middleware::{self}; +use axum::response::IntoResponse; use axum::routing::{get, post}; +use compute_api::responses::ComputeCtlConfig; use http::StatusCode; -use jsonwebtoken::jwk::JwkSet; use tokio::net::TcpListener; use tower::ServiceBuilder; use tower_http::{ auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer, }; use tracing::{Span, error, info}; -use uuid::Uuid; +use super::middleware::request_id::maybe_add_request_id_header; use super::{ headers::X_REQUEST_ID, middleware::authorize::Authorize, @@ -42,7 +41,7 @@ pub enum Server { }, External { port: u16, - jwks: JwkSet, + config: ComputeCtlConfig, compute_id: String, }, } @@ -80,7 +79,7 @@ impl From<&Server> for Router> { router } Server::External { - jwks, 
compute_id, .. + config, compute_id, .. } => { let unauthenticated_router = Router::>::new().route("/metrics", get(metrics::get_metrics)); @@ -88,6 +87,7 @@ impl From<&Server> for Router> { let authenticated_router = Router::>::new() .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) + .route("/configure_telemetry", post(configure::configure_telemetry)) .route("/database_schema", get(database_schema::get_schema_dump)) .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) .route("/insights", get(insights::get_insights)) @@ -96,7 +96,7 @@ impl From<&Server> for Router> { .route("/terminate", post(terminate::terminate)) .layer(AsyncRequireAuthorizationLayer::new(Authorize::new( compute_id.clone(), - jwks.clone(), + config.jwks.clone(), ))); router @@ -219,15 +219,3 @@ impl Server { tokio::spawn(self.serve(state)); } } - -/// This middleware function allows compute_ctl to generate its own request ID -/// if one isn't supplied. The control plane will always send one as a UUID. The -/// neon Postgres extension on the other hand does not send one. 
-async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { - let headers = request.headers_mut(); - if headers.get(X_REQUEST_ID).is_none() { - headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); - } - - next.run(request).await -} diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 6921505466..d95c168a99 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use anyhow::Result; use compute_api::responses::{InstalledExtension, InstalledExtensions}; -use postgres::{Client, NoTls}; +use tokio_postgres::{Client, Config, NoTls}; use crate::metrics::INSTALLED_EXTENSIONS; @@ -10,7 +10,7 @@ use crate::metrics::INSTALLED_EXTENSIONS; /// and to make database listing query here more explicit. /// /// Limit the number of databases to 500 to avoid excessive load. -fn list_dbs(client: &mut Client) -> Result> { +async fn list_dbs(client: &mut Client) -> Result> { // `pg_database.datconnlimit = -2` means that the database is in the // invalid state let databases = client @@ -20,7 +20,8 @@ fn list_dbs(client: &mut Client) -> Result> { AND datconnlimit <> - 2 LIMIT 500", &[], - )? + ) + .await? .iter() .map(|row| { let db: String = row.get("datname"); @@ -36,20 +37,36 @@ fn list_dbs(client: &mut Client) -> Result> { /// Same extension can be installed in multiple databases with different versions, /// so we report a separate metric (number of databases where it is installed) /// for each extension version. 
-pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result { +pub async fn get_installed_extensions(mut conf: Config) -> Result { conf.application_name("compute_ctl:get_installed_extensions"); - let mut client = conf.connect(NoTls)?; - let databases: Vec = list_dbs(&mut client)?; + let databases: Vec = { + let (mut client, connection) = conf.connect(NoTls).await?; + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + list_dbs(&mut client).await? + }; let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new(); for db in databases.iter() { conf.dbname(db); - let mut db_client = conf.connect(NoTls)?; - let extensions: Vec<(String, String, i32)> = db_client + + let (client, connection) = conf.connect(NoTls).await?; + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let extensions: Vec<(String, String, i32)> = client .query( "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension", &[], - )? + ) + .await? 
.iter() .map(|row| { ( diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 5c78bbcd02..a681fad0b0 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -26,3 +26,4 @@ pub mod spec; mod spec_apply; pub mod swap; pub mod sync_sk; +pub mod tls; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index a65614e94e..c36f302f99 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -24,7 +24,8 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result .with_writer(std::io::stderr); // Initialize OpenTelemetry - let otlp_layer = tracing_utils::init_tracing("compute_ctl").await; + let otlp_layer = + tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default()).await; // Put it all together tracing_subscriber::registry() diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index dab32d5dc1..4caa48307e 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,6 +1,8 @@ -use metrics::core::Collector; +use metrics::core::{AtomicF64, Collector, GenericGauge}; use metrics::proto::MetricFamily; -use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec}; +use metrics::{ + IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec, +}; use once_cell::sync::Lazy; pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { @@ -59,10 +61,20 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| .expect("failed to define a metric") }); +// Size of audit log directory in bytes +pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy> = Lazy::new(|| { + register_gauge!( + "compute_audit_log_dir_size", + "Size of audit log directory in bytes", + ) + .expect("failed to define a metric") +}); + pub fn collect() -> Vec { let mut metrics = INSTALLED_EXTENSIONS.collect(); metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); 
metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); metrics.extend(DB_MIGRATION_FAILED.collect()); + metrics.extend(AUDIT_LOG_DIR_SIZE.collect()); metrics } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index dd8d8e9b8b..10d8f2c878 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -10,8 +10,10 @@ use std::str::FromStr; use std::time::{Duration, Instant}; use anyhow::{Result, bail}; +use compute_api::responses::TlsConfig; use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; use futures::StreamExt; +use indexmap::IndexMap; use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::config::Config; @@ -206,8 +208,8 @@ impl Escaping for PgIdent { /// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`, /// fn pg_quote_dollar(&self) -> (String, String) { - let mut tag: String = "".to_string(); - let mut outer_tag = "x".to_string(); + let mut tag: String = "x".to_string(); + let mut outer_tag = "xx".to_string(); // Find the first suitable tag that is not present in the string. // Postgres' max role/DB name length is 63 bytes, so even in the @@ -406,7 +408,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> { /// Update pgbouncer.ini with provided options fn update_pgbouncer_ini( - pgbouncer_config: HashMap, + pgbouncer_config: IndexMap, pgbouncer_ini_path: &str, ) -> Result<()> { let mut conf = Ini::load_from_file(pgbouncer_ini_path)?; @@ -427,7 +429,10 @@ fn update_pgbouncer_ini( /// Tune pgbouncer. /// 1. Apply new config using pgbouncer admin console /// 2. 
Add new values to pgbouncer.ini to preserve them after restart -pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result<()> { +pub async fn tune_pgbouncer( + mut pgbouncer_config: IndexMap, + tls_config: Option, +) -> Result<()> { let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() { // for VMs use pgbouncer specific way to connect to // pgbouncer admin console without password @@ -473,19 +478,21 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result } }; - // Apply new config - for (option_name, value) in pgbouncer_config.iter() { - let query = format!("SET {}={}", option_name, value); - // keep this log line for debugging purposes - info!("Applying pgbouncer setting change: {}", query); + if let Some(tls_config) = tls_config { + // pgbouncer starts in a half-ok state if it cannot find these files. + // It will default to client_tls_sslmode=deny, which causes proxy to error. + // There is a small window at startup where these files don't yet exist in the VM. + // Best to wait until it exists. 
+ loop { + if let Ok(true) = tokio::fs::try_exists(&tls_config.key_path).await { + break; + } + tokio::time::sleep(Duration::from_millis(500)).await + } - if let Err(err) = client.simple_query(&query).await { - // Don't fail on error, just print it into log - error!( - "Failed to apply pgbouncer setting change: {}, {}", - query, err - ); - }; + pgbouncer_config.insert("client_tls_cert_file".to_string(), tls_config.cert_path); + pgbouncer_config.insert("client_tls_key_file".to_string(), tls_config.key_path); + pgbouncer_config.insert("client_tls_sslmode".to_string(), "allow".to_string()); } // save values to pgbouncer.ini @@ -501,6 +508,13 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result }; update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?; + info!("Applying pgbouncer setting change"); + + if let Err(err) = client.simple_query("RELOAD").await { + // Don't fail on error, just print it into log + error!("Failed to apply pgbouncer setting change, {err}",); + }; + Ok(()) } diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index c8fba4fdcd..80594db3f1 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -1,8 +1,14 @@ +use std::fs; +use std::io::ErrorKind; +use std::path::Path; use std::process::Command; +use std::time::Duration; use std::{fs::OpenOptions, io::Write}; -use anyhow::{Context, Result}; -use tracing::info; +use anyhow::{Context, Result, anyhow}; +use tracing::{error, info, instrument, warn}; + +const POSTGRES_LOGS_CONF_PATH: &str = "/etc/rsyslog.d/postgres_logs.conf"; fn get_rsyslog_pid() -> Option { let output = Command::new("pgrep") @@ -43,7 +49,7 @@ fn restart_rsyslog() -> Result<()> { } pub fn configure_audit_rsyslog( - log_directory: &str, + log_directory: String, tag: &str, remote_endpoint: &str, ) -> Result<()> { @@ -75,3 +81,196 @@ pub fn configure_audit_rsyslog( Ok(()) } + +/// Configuration for enabling Postgres logs forwarding from rsyslogd +pub struct 
PostgresLogsRsyslogConfig<'a> { + pub host: Option<&'a str>, +} + +impl<'a> PostgresLogsRsyslogConfig<'a> { + pub fn new(host: Option<&'a str>) -> Self { + Self { host } + } + + pub fn build(&self) -> Result { + match self.host { + Some(host) => { + if let Some((target, port)) = host.split_once(":") { + Ok(format!( + include_str!( + "config_template/compute_rsyslog_postgres_export_template.conf" + ), + logs_export_target = target, + logs_export_port = port, + )) + } else { + Err(anyhow!("Invalid host format for Postgres logs export")) + } + } + None => Ok("".to_string()), + } + } + + fn current_config() -> Result { + let config_content = match std::fs::read_to_string(POSTGRES_LOGS_CONF_PATH) { + Ok(c) => c, + Err(err) if err.kind() == ErrorKind::NotFound => String::new(), + Err(err) => return Err(err.into()), + }; + Ok(config_content) + } + + /// Returns the default host for otel collector that receives Postgres logs + pub fn default_host(project_id: &str) -> String { + format!( + "config-{}-collector.neon-telemetry.svc.cluster.local:10514", + project_id + ) + } +} + +pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> { + let new_config = conf.build()?; + let current_config = PostgresLogsRsyslogConfig::current_config()?; + + if new_config == current_config { + info!("postgres logs rsyslog configuration is up-to-date"); + return Ok(()); + } + + // When new config is empty we can simply remove the configuration file. 
+ if new_config.is_empty() { + info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH); + match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) { + Ok(_) => {} + Err(err) if err.kind() == ErrorKind::NotFound => {} + Err(err) => return Err(err.into()), + } + restart_rsyslog()?; + return Ok(()); + } + + info!( + "configuring rsyslog for postgres logs export to: {:?}", + conf.host + ); + + let mut file = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(POSTGRES_LOGS_CONF_PATH)?; + file.write_all(new_config.as_bytes())?; + + info!( + "rsyslog configuration file {} added successfully. Starting rsyslogd", + POSTGRES_LOGS_CONF_PATH + ); + + restart_rsyslog()?; + Ok(()) +} + +#[instrument(skip_all)] +async fn pgaudit_gc_main_loop(log_directory: String) -> Result<()> { + info!("running pgaudit GC main loop"); + loop { + // Check log_directory for old pgaudit logs and delete them. + // New log files are checked every 5 minutes, as set in pgaudit.log_rotation_age + // Find files that were not modified in the last 15 minutes and delete them. + // This should be enough time for rsyslog to process the logs and for us to catch the alerts. + // + // In case of a very high load, we might need to adjust this value and pgaudit.log_rotation_age. + // + // TODO: add some smarter logic to delete the files that are fully streamed according to rsyslog + // imfile-state files, but for now just do a simple GC to avoid filling up the disk. + let _ = Command::new("find") + .arg(&log_directory) + .arg("-name") + .arg("audit*.log") + .arg("-mmin") + .arg("+15") + .arg("-delete") + .output()?; + + // also collect the metric for the size of the log directory + async fn get_log_files_size(path: &Path) -> Result { + let mut total_size = 0; + + for entry in fs::read_dir(path)? 
{ + let entry = entry?; + let entry_path = entry.path(); + + if entry_path.is_file() && entry_path.to_string_lossy().ends_with("log") { + total_size += entry.metadata()?.len(); + } + } + + Ok(total_size) + } + + let log_directory_size = get_log_files_size(Path::new(&log_directory)) + .await + .unwrap_or_else(|e| { + warn!("Failed to get log directory size: {}", e); + 0 + }); + crate::metrics::AUDIT_LOG_DIR_SIZE.set(log_directory_size as f64); + tokio::time::sleep(Duration::from_secs(60)).await; + } +} + +// launch pgaudit GC thread to clean up the old pgaudit logs stored in the log_directory +pub fn launch_pgaudit_gc(log_directory: String) { + tokio::spawn(async move { + if let Err(e) = pgaudit_gc_main_loop(log_directory).await { + error!("pgaudit GC main loop failed: {}", e); + } + }); +} + +#[cfg(test)] +mod tests { + use crate::rsyslog::PostgresLogsRsyslogConfig; + + #[test] + fn test_postgres_logs_config() { + { + // Verify empty config + let conf = PostgresLogsRsyslogConfig::new(None); + let res = conf.build(); + assert!(res.is_ok()); + let conf_str = res.unwrap(); + assert_eq!(&conf_str, ""); + } + + { + // Verify config + let conf = PostgresLogsRsyslogConfig::new(Some("collector.cvc.local:514")); + let res = conf.build(); + assert!(res.is_ok()); + let conf_str = res.unwrap(); + assert!(conf_str.contains("omfwd")); + assert!(conf_str.contains(r#"target="collector.cvc.local""#)); + assert!(conf_str.contains(r#"port="514""#)); + } + + { + // Verify invalid config + let conf = PostgresLogsRsyslogConfig::new(Some("invalid")); + let res = conf.build(); + assert!(res.is_err()); + } + + { + // Verify config with default host + let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123"); + let conf = PostgresLogsRsyslogConfig::new(Some(&host)); + let res = conf.build(); + assert!(res.is_ok()); + let conf_str = res.unwrap(); + assert!(conf_str.contains(r#"shy-breeze-123"#)); + assert!(conf_str.contains(r#"port="10514""#)); + } + } +} diff --git 
a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 1d19f2738d..a76af21e9f 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -8,13 +8,12 @@ use compute_api::responses::{ use compute_api::spec::ComputeSpec; use reqwest::StatusCode; use tokio_postgres::Client; -use tracing::{error, info, instrument, warn}; +use tracing::{error, info, instrument}; use crate::config; use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS}; use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; -use crate::pg_helpers::*; // Do control plane request and return response if any. In case of error it // returns a bool flag indicating whether it makes sense to retry the request @@ -212,122 +211,3 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> { Ok(()) } - -/// Connect to the database as superuser and pre-create anon extension -/// if it is present in shared_preload_libraries -#[instrument(skip_all)] -pub async fn handle_extension_anon( - spec: &ComputeSpec, - db_owner: &str, - db_client: &mut Client, - grants_only: bool, -) -> Result<()> { - info!("handle extension anon"); - - if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { - if libs.contains("anon") { - if !grants_only { - // check if extension is already initialized using anon.is_initialized() - let query = "SELECT anon.is_initialized()"; - match db_client.query(query, &[]).await { - Ok(rows) => { - if !rows.is_empty() { - let is_initialized: bool = rows[0].get(0); - if is_initialized { - info!("anon extension is already initialized"); - return Ok(()); - } - } - } - Err(e) => { - warn!( - "anon extension is_installed check failed with expected error: {}", - e - ); - } - }; - - // Create anon extension if this compute needs it - // Users cannot create it themselves, because superuser is required. 
- let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE"; - info!("creating anon extension with query: {}", query); - match db_client.query(query, &[]).await { - Ok(_) => {} - Err(e) => { - error!("anon extension creation failed with error: {}", e); - return Ok(()); - } - } - - // check that extension is installed - query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - let rows = db_client.query(query, &[]).await?; - if rows.is_empty() { - error!("anon extension is not installed"); - return Ok(()); - } - - // Initialize anon extension - // This also requires superuser privileges, so users cannot do it themselves. - query = "SELECT anon.init()"; - match db_client.query(query, &[]).await { - Ok(_) => {} - Err(e) => { - error!("anon.init() failed with error: {}", e); - return Ok(()); - } - } - } - - // check that extension is installed, if not bail early - let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - match db_client.query(query, &[]).await { - Ok(rows) => { - if rows.is_empty() { - error!("anon extension is not installed"); - return Ok(()); - } - } - Err(e) => { - error!("anon extension check failed with error: {}", e); - return Ok(()); - } - }; - - let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner); - info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query).await?; - - // Grant permissions to db_owner to use anon extension functions - let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner); - info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query).await?; - - // This is needed, because some functions are defined as SECURITY DEFINER. - // In Postgres SECURITY DEFINER functions are executed with the privileges - // of the owner. - // In anon extension this it is needed to access some GUCs, which are only accessible to - // superuser. 
But we've patched postgres to allow db_owner to access them as well. - // So we need to change owner of these functions to db_owner. - let query = format!(" - SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};' - from pg_proc p - join pg_namespace nsp ON p.pronamespace = nsp.oid - where nsp.nspname = 'anon';", db_owner); - - info!("change anon extension functions owner to db owner"); - db_client.simple_query(&query).await?; - - // affects views as well - let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner); - info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query).await?; - - let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner); - info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query).await?; - } - } - - Ok(()) -} diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index e5f7aebbf8..80506b13cb 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -6,7 +6,7 @@ use std::sync::Arc; use anyhow::{Context, Result}; use compute_api::responses::ComputeStatus; -use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role}; +use compute_api::spec::{ComputeAudit, ComputeSpec, Database, PgIdent, Role}; use futures::future::join_all; use tokio::sync::RwLock; use tokio_postgres::Client; @@ -26,7 +26,7 @@ use crate::spec_apply::ApplySpecPhase::{ RunInEachDatabase, }; use crate::spec_apply::PerDatabasePhase::{ - ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, + ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, }; impl ComputeNode { @@ -238,7 +238,6 @@ impl ComputeNode { let mut phases = vec![ DeleteDBRoleReferences, ChangeSchemaPerms, - HandleAnonExtension, ]; if spec.drop_subscriptions_before_start && 
!drop_subscriptions_done { @@ -458,7 +457,6 @@ impl Debug for DB { pub enum PerDatabasePhase { DeleteDBRoleReferences, ChangeSchemaPerms, - HandleAnonExtension, /// This is a shared phase, used for both i) dropping dangling LR subscriptions /// before dropping the DB, and ii) dropping all subscriptions after creating /// a fresh branch. @@ -1012,98 +1010,6 @@ async fn get_operations<'a>( ] .into_iter(); - Ok(Box::new(operations)) - } - // TODO: remove this completely https://github.com/neondatabase/cloud/issues/22663 - PerDatabasePhase::HandleAnonExtension => { - // Only install Anon into user databases - let db = match &db { - DB::SystemDB => return Ok(Box::new(empty())), - DB::UserDB(db) => db, - }; - // Never install Anon when it's not enabled as feature - if !spec.features.contains(&ComputeFeature::AnonExtension) { - return Ok(Box::new(empty())); - } - - // Only install Anon when it's added in preload libraries - let opt_libs = spec.cluster.settings.find("shared_preload_libraries"); - - let libs = match opt_libs { - Some(libs) => libs, - None => return Ok(Box::new(empty())), - }; - - if !libs.contains("anon") { - return Ok(Box::new(empty())); - } - - let db_owner = db.owner.pg_quote(); - - let operations = vec![ - // Create anon extension if this compute needs it - // Users cannot create it themselves, because superuser is required. - Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS anon CASCADE"), - comment: Some(String::from("creating anon extension")), - }, - // Initialize anon extension - // This also requires superuser privileges, so users cannot do it themselves. 
- Operation { - query: String::from("SELECT anon.init()"), - comment: Some(String::from("initializing anon extension data")), - }, - Operation { - query: format!("GRANT ALL ON SCHEMA anon TO {}", db_owner), - comment: Some(String::from( - "granting anon extension schema permissions", - )), - }, - Operation { - query: format!( - "GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", - db_owner - ), - comment: Some(String::from( - "granting anon extension schema functions permissions", - )), - }, - // We need this, because some functions are defined as SECURITY DEFINER. - // In Postgres SECURITY DEFINER functions are executed with the privileges - // of the owner. - // In anon extension this it is needed to access some GUCs, which are only accessible to - // superuser. But we've patched postgres to allow db_owner to access them as well. - // So we need to change owner of these functions to db_owner. - Operation { - query: format!( - include_str!("sql/anon_ext_fn_reassign.sql"), - db_owner = db_owner, - ), - comment: Some(String::from( - "change anon extension functions owner to database_owner", - )), - }, - Operation { - query: format!( - "GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", - db_owner, - ), - comment: Some(String::from( - "granting anon extension tables permissions", - )), - }, - Operation { - query: format!( - "GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", - db_owner, - ), - comment: Some(String::from( - "granting anon extension sequences permissions", - )), - }, - ] - .into_iter(); - Ok(Box::new(operations)) } } diff --git a/compute_tools/src/tls.rs b/compute_tools/src/tls.rs new file mode 100644 index 0000000000..8f465c7300 --- /dev/null +++ b/compute_tools/src/tls.rs @@ -0,0 +1,117 @@ +use std::{io::Write, os::unix::fs::OpenOptionsExt, path::Path, time::Duration}; + +use anyhow::{Context, Result, bail}; +use compute_api::responses::TlsConfig; +use ring::digest; +use spki::der::{Decode, PemReader}; +use x509_cert::Certificate; + +#[derive(Clone, Copy)] 
+pub struct CertDigest(digest::Digest); + +pub async fn watch_cert_for_changes(cert_path: String) -> tokio::sync::watch::Receiver { + let mut digest = compute_digest(&cert_path).await; + let (tx, rx) = tokio::sync::watch::channel(digest); + tokio::spawn(async move { + while !tx.is_closed() { + let new_digest = compute_digest(&cert_path).await; + if digest.0.as_ref() != new_digest.0.as_ref() { + digest = new_digest; + _ = tx.send(digest); + } + + tokio::time::sleep(Duration::from_secs(60)).await + } + }); + rx +} + +async fn compute_digest(cert_path: &str) -> CertDigest { + loop { + match try_compute_digest(cert_path).await { + Ok(d) => break d, + Err(e) => { + tracing::error!("could not read cert file {e:?}"); + tokio::time::sleep(Duration::from_secs(1)).await + } + } + } +} + +async fn try_compute_digest(cert_path: &str) -> Result { + let data = tokio::fs::read(cert_path).await?; + // sha256 is extremely collision resistent. can safely assume the digest to be unique + Ok(CertDigest(digest::digest(&digest::SHA256, &data))) +} + +pub const SERVER_CRT: &str = "server.crt"; +pub const SERVER_KEY: &str = "server.key"; + +pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) { + loop { + match try_update_key_path_blocking(pg_data, tls_config) { + Ok(()) => break, + Err(e) => { + tracing::error!("could not create key file {e:?}"); + std::thread::sleep(Duration::from_secs(1)) + } + } + } +} + +// Postgres requires the keypath be "secure". This means +// 1. Owned by the postgres user. +// 2. Have permission 600. +fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Result<()> { + let key = std::fs::read_to_string(&tls_config.key_path)?; + let crt = std::fs::read_to_string(&tls_config.cert_path)?; + + // to mitigate a race condition during renewal. 
+ verify_key_cert(&key, &crt)?; + + let mut key_file = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .mode(0o600) + .open(pg_data.join(SERVER_KEY))?; + + let mut crt_file = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .mode(0o600) + .open(pg_data.join(SERVER_CRT))?; + + key_file.write_all(key.as_bytes())?; + crt_file.write_all(crt.as_bytes())?; + + Ok(()) +} + +fn verify_key_cert(key: &str, cert: &str) -> Result<()> { + use x509_cert::der::oid::db::rfc5912::ECDSA_WITH_SHA_256; + + let cert = Certificate::decode(&mut PemReader::new(cert.as_bytes()).context("pem reader")?) + .context("decode cert")?; + + match cert.signature_algorithm.oid { + ECDSA_WITH_SHA_256 => { + let key = p256::SecretKey::from_sec1_pem(key).context("parse key")?; + + let a = key.public_key().to_sec1_bytes(); + let b = cert + .tbs_certificate + .subject_public_key_info + .subject_public_key + .raw_bytes(); + + if *a != *b { + bail!("private key file does not match certificate") + } + } + _ => bail!("unknown TLS key type"), + } + + Ok(()) +} diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index f2d74ff384..b72c1293ee 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -64,7 +64,8 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor #[test] fn ident_pg_quote_dollar() { let test_cases = vec![ - ("name", ("$$name$$", "x")), + ("name", ("$x$name$x$", "xx")), + ("name$", ("$x$name$$x$", "xx")), ("name$$", ("$x$name$$$x$", "xx")), ("name$$$", ("$x$name$$$$x$", "xx")), ("name$$$$", ("$x$name$$$$$x$", "xx")), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index ba1411b615..747268f80b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -36,7 +36,9 @@ use pageserver_api::config::{ use pageserver_api::controller_api::{ 
NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest, }; -use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo}; +use pageserver_api::models::{ + ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, +}; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; @@ -977,7 +979,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { neon_distrib_dir: None, default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), storage_controller: None, - control_plane_compute_hook_api: None, + control_plane_hooks_api: None, generate_local_ssl_certs: false, } }; @@ -1129,12 +1131,16 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any let tenant_id = get_tenant_id(args.tenant_id, env)?; let tenant_conf: HashMap<_, _> = args.config.iter().flat_map(|c| c.split_once(':')).collect(); + let config = PageServerNode::parse_config(tenant_conf)?; - pageserver - .tenant_config(tenant_id, tenant_conf) + let req = TenantConfigRequest { tenant_id, config }; + + let storage_controller = StorageController::from_env(env); + storage_controller + .set_tenant_config(&req) .await .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; - println!("tenant {tenant_id} successfully configured on the pageserver"); + println!("tenant {tenant_id} successfully configured via storcon"); } } Ok(()) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index ec9eb74e6f..f0a11106bd 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -72,9 +72,9 @@ pub struct LocalEnv { // be propagated into each pageserver's configuration. pub control_plane_api: Url, - // Control plane upcall API for storage controller. If set, this will be propagated into the + // Control plane upcall APIs for storage controller. 
If set, this will be propagated into the // storage controller's configuration. - pub control_plane_compute_hook_api: Option, + pub control_plane_hooks_api: Option, /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. // A `HashMap>` would be more appropriate here, @@ -104,6 +104,7 @@ pub struct OnDiskConfig { pub pageservers: Vec, pub safekeepers: Vec, pub control_plane_api: Option, + pub control_plane_hooks_api: Option, pub control_plane_compute_hook_api: Option, branch_name_mappings: HashMap>, // Note: skip serializing because in compat tests old storage controller fails @@ -136,7 +137,7 @@ pub struct NeonLocalInitConf { pub pageservers: Vec, pub safekeepers: Vec, pub control_plane_api: Option, - pub control_plane_compute_hook_api: Option>, + pub control_plane_hooks_api: Option, pub generate_local_ssl_certs: bool, } @@ -148,7 +149,7 @@ pub struct NeonBroker { pub listen_addr: SocketAddr, } -/// Broker config for cluster internal communication. +/// A part of storage controller's config the neon_local knows about. 
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct NeonStorageControllerConf { @@ -175,10 +176,11 @@ pub struct NeonStorageControllerConf { #[serde(with = "humantime_serde")] pub long_reconcile_threshold: Option, - #[serde(default)] pub use_https_pageserver_api: bool, pub timelines_onto_safekeepers: bool, + + pub use_https_safekeeper_api: bool, } impl NeonStorageControllerConf { @@ -204,6 +206,7 @@ impl Default for NeonStorageControllerConf { long_reconcile_threshold: None, use_https_pageserver_api: false, timelines_onto_safekeepers: false, + use_https_safekeeper_api: false, } } } @@ -301,6 +304,7 @@ pub struct SafekeeperConf { pub pg_port: u16, pub pg_tenant_only_port: Option, pub http_port: u16, + pub https_port: Option, pub sync: bool, pub remote_storage: Option, pub backup_threads: Option, @@ -315,6 +319,7 @@ impl Default for SafekeeperConf { pg_port: 0, pg_tenant_only_port: None, http_port: 0, + https_port: None, sync: true, remote_storage: None, backup_threads: None, @@ -573,7 +578,8 @@ impl LocalEnv { pageservers, safekeepers, control_plane_api, - control_plane_compute_hook_api, + control_plane_hooks_api, + control_plane_compute_hook_api: _, branch_name_mappings, generate_local_ssl_certs, } = on_disk_config; @@ -588,7 +594,7 @@ impl LocalEnv { pageservers, safekeepers, control_plane_api: control_plane_api.unwrap(), - control_plane_compute_hook_api, + control_plane_hooks_api, branch_name_mappings, generate_local_ssl_certs, } @@ -695,7 +701,8 @@ impl LocalEnv { pageservers: vec![], // it's skip_serializing anyway safekeepers: self.safekeepers.clone(), control_plane_api: Some(self.control_plane_api.clone()), - control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), + control_plane_hooks_api: self.control_plane_hooks_api.clone(), + control_plane_compute_hook_api: None, branch_name_mappings: self.branch_name_mappings.clone(), generate_local_ssl_certs: self.generate_local_ssl_certs, }, @@ -779,8 +786,8 
@@ impl LocalEnv { pageservers, safekeepers, control_plane_api, - control_plane_compute_hook_api, generate_local_ssl_certs, + control_plane_hooks_api, } = conf; // Find postgres binaries. @@ -827,7 +834,7 @@ impl LocalEnv { pageservers: pageservers.iter().map(Into::into).collect(), safekeepers, control_plane_api: control_plane_api.unwrap(), - control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), + control_plane_hooks_api, branch_name_mappings: Default::default(), generate_local_ssl_certs, }; @@ -842,6 +849,9 @@ impl LocalEnv { // create safekeeper dirs for safekeeper in &env.safekeepers { fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?; + SafekeeperNode::from_env(&env, safekeeper) + .initialize() + .context("safekeeper init failed")?; } // initialize pageserver state diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 70915d5aaf..231871852e 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -111,6 +111,18 @@ impl SafekeeperNode { .expect("non-Unicode path") } + /// Initializes a safekeeper node by creating all necessary files, + /// e.g. SSL certificates. 
+ pub fn initialize(&self) -> anyhow::Result<()> { + if self.env.generate_local_ssl_certs { + self.env.generate_ssl_cert( + &self.datadir_path().join("server.crt"), + &self.datadir_path().join("server.key"), + )?; + } + Ok(()) + } + pub async fn start( &self, extra_opts: &[String], @@ -196,6 +208,16 @@ impl SafekeeperNode { ]); } + if let Some(https_port) = self.conf.https_port { + args.extend([ + "--listen-https".to_owned(), + format!("{}:{}", self.listen_addr, https_port), + ]); + } + if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { + args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); + } + args.extend_from_slice(extra_opts); background_process::start_process( diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 439d7936a7..0c78f2e18e 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -14,7 +14,7 @@ use pageserver_api::controller_api::{ NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; -use pageserver_api::models::{TimelineCreateRequest, TimelineInfo}; +use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; @@ -538,6 +538,10 @@ impl StorageController { args.push("--use-https-pageserver-api".to_string()); } + if self.config.use_https_safekeeper_api { + args.push("--use-https-safekeeper-api".to_string()); + } + if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); } @@ -558,10 +562,8 @@ impl StorageController { args.push(format!("--public-key=\"{public_key}\"")); } - if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api { - args.push(format!( - 
"--compute-hook-url={control_plane_compute_hook_api}" - )); + if let Some(control_plane_hooks_api) = &self.env.control_plane_hooks_api { + args.push(format!("--control-plane-url={control_plane_hooks_api}")); } if let Some(split_threshold) = self.config.split_threshold.as_ref() { @@ -878,4 +880,9 @@ impl StorageController { ) .await } + + pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> anyhow::Result<()> { + self.dispatch(Method::PUT, "v1/tenant/config".to_string(), Some(req)) + .await + } } diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b5c4f21e97..ae4bf9a519 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -14,8 +14,8 @@ use pageserver_api::controller_api::{ TenantShardMigrateRequest, TenantShardMigrateResponse, }; use pageserver_api::models::{ - EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, ShardParameters, - TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest, + EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig, + TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest, TenantShardSplitResponse, }; use pageserver_api::shard::{ShardStripeSize, TenantShardId}; @@ -158,12 +158,6 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, - /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary - /// mode so that it can warm up content on a pageserver. 
- TenantWarmup { - #[arg(long)] - tenant_id: TenantId, - }, TenantSetPreferredAz { #[arg(long)] tenant_id: TenantId, @@ -871,94 +865,6 @@ async fn main() -> anyhow::Result<()> { ) .await?; } - Command::TenantWarmup { tenant_id } => { - let describe_response = storcon_client - .dispatch::<(), TenantDescribeResponse>( - Method::GET, - format!("control/v1/tenant/{tenant_id}"), - None, - ) - .await; - match describe_response { - Ok(describe) => { - if matches!(describe.policy, PlacementPolicy::Secondary) { - // Fine: it's already known to controller in secondary mode: calling - // again to put it into secondary mode won't cause problems. - } else { - anyhow::bail!("Tenant already present with policy {:?}", describe.policy); - } - } - Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => { - // Fine: this tenant isn't know to the storage controller yet. - } - Err(e) => { - // Unexpected API error - return Err(e.into()); - } - } - - vps_client - .location_config( - TenantShardId::unsharded(tenant_id), - pageserver_api::models::LocationConfig { - mode: pageserver_api::models::LocationConfigMode::Secondary, - generation: None, - secondary_conf: Some(LocationConfigSecondary { warm: true }), - shard_number: 0, - shard_count: 0, - shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0, - tenant_conf: TenantConfig::default(), - }, - None, - true, - ) - .await?; - - let describe_response = storcon_client - .dispatch::<(), TenantDescribeResponse>( - Method::GET, - format!("control/v1/tenant/{tenant_id}"), - None, - ) - .await?; - - let secondary_ps_id = describe_response - .shards - .first() - .unwrap() - .node_secondary - .first() - .unwrap(); - - println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}"); - loop { - let (status, progress) = vps_client - .tenant_secondary_download( - TenantShardId::unsharded(tenant_id), - Some(Duration::from_secs(10)), - ) - .await?; - println!( - "Progress: {}/{} layers, {}/{} bytes", - progress.layers_downloaded, - 
progress.layers_total, - progress.bytes_downloaded, - progress.bytes_total - ); - match status { - StatusCode::OK => { - println!("Download complete"); - break; - } - StatusCode::ACCEPTED => { - // Loop - } - _ => { - anyhow::bail!("Unexpected download status: {status}"); - } - } - } - } Command::TenantDrop { tenant_id, unclean } => { if !unclean { anyhow::bail!( diff --git a/deny.toml b/deny.toml index 1023b1833a..ed7aa9ef9f 100644 --- a/deny.toml +++ b/deny.toml @@ -31,10 +31,6 @@ reason = "the marvin attack only affects private key decryption, not public key id = "RUSTSEC-2024-0436" reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact." -[[advisories.ignore]] -id = "RUSTSEC-2025-0014" -reason = "The humantime is widely used and is not easy to replace right now. It is unmaintained, but it has no known vulnerabilities to care about. #11179" - # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index b5f0f47ceb..9ef831a9cd 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -1,4 +1,4 @@ -ARG REPOSITORY=neondatabase +ARG REPOSITORY=ghcr.io/neondatabase ARG COMPUTE_IMAGE=compute-node-v14 ARG TAG=latest diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 95d4ff7b2a..493a0a5523 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -29,7 +29,7 @@ services: pageserver: restart: always - image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password @@ -45,7 +45,7 @@ services: safekeeper1: restart: 
always - image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 - SAFEKEEPER_ID=1 @@ -75,7 +75,7 @@ services: safekeeper2: restart: always - image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 - SAFEKEEPER_ID=2 @@ -105,7 +105,7 @@ services: safekeeper3: restart: always - image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 - SAFEKEEPER_ID=3 @@ -135,7 +135,7 @@ services: storage_broker: restart: always - image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} ports: - 50051:50051 command: @@ -147,7 +147,7 @@ services: build: context: ./compute_wrapper/ args: - - REPOSITORY=${REPOSITORY:-neondatabase} + - REPOSITORY=${REPOSITORY:-ghcr.io/neondatabase} - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - TAG=${COMPUTE_TAG:-${TAG:-latest}} - http_proxy=${http_proxy:-} @@ -186,7 +186,7 @@ services: neon-test-extensions: profiles: ["test-extensions"] - image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} environment: - PGPASSWORD=cloud_admin entrypoint: diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 72ae61b032..3117950cc0 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -20,4 +20,4 @@ for d in ${LIST}; do done [ -z "${FAILED}" ] && exit 0 echo "${FAILED}" -exit 1 \ No newline at end of file +exit 1 diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/001-cluster-size-limits.md similarity index 
100% rename from docs/rfcs/cluster-size-limits.md rename to docs/rfcs/001-cluster-size-limits.md diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md index f7b0b3a587..094f8d5360 100644 --- a/docs/rfcs/README.md +++ b/docs/rfcs/README.md @@ -1,3 +1,7 @@ +# Neon RFCs + +## Overview + This directory contains Request for Comments documents, or RFCs, for features or concepts that have been proposed. Alternative names: technical design doc, ERD, one-pager @@ -59,37 +63,10 @@ RFC lifecycle: ### RFC template +Use template with `YYYY-MM-DD-copy-me.md` as a starting point. Timestamp prefix helps to avoid awkward 'id' collisions. + +```sh +cp docs/rfcs/YYYY-MM-DD-copy-me.md docs/rfcs/$(date +"%Y-%m-%d")-.md +``` + Note, a lot of the sections are marked as ‘if relevant’. They are included into the template as a reminder and to help inspiration. - -``` -# Name -Created on .. -Implemented on .. - -## Summary - -## Motivation - -## Non Goals (if relevant) - -## Impacted components (e.g. pageserver, safekeeper, console, etc) - -## Proposed implementation - -### Reliability, failure modes and corner cases (if relevant) - -### Interaction/Sequence diagram (if relevant) - -### Scalability (if relevant) - -### Security implications (if relevant) - -### Unresolved questions (if relevant) - -## Alternative implementation (if relevant) - -## Pros/cons of proposed approaches (if relevant) - -## Definition of Done (if relevant) - -``` diff --git a/docs/rfcs/YYYY-MM-DD-copy-me.md b/docs/rfcs/YYYY-MM-DD-copy-me.md new file mode 100644 index 0000000000..8487861e6b --- /dev/null +++ b/docs/rfcs/YYYY-MM-DD-copy-me.md @@ -0,0 +1,30 @@ +# Name + +Created on YYYY-MM-DD +Implemented on _TBD_ + +## Summary + +## Motivation + +## Non Goals (if relevant) + +## Impacted components (e.g. 
pageserver, safekeeper, console, etc) + +## Proposed implementation + +### Reliability, failure modes and corner cases (if relevant) + +### Interaction/Sequence diagram (if relevant) + +### Scalability (if relevant) + +### Security implications (if relevant) + +### Unresolved questions (if relevant) + +## Alternative implementation (if relevant) + +## Pros/cons of proposed approaches (if relevant) + +## Definition of Done (if relevant) diff --git a/docs/storage_controller.md b/docs/storage_controller.md index 6d2ef929a4..ac4aca4219 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -101,15 +101,25 @@ changes such as a pageserver node becoming unavailable, or the tenant's shard co postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver location changes. -The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires -JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request. +The hooks are configured using the storage controller's `--control-plane-url` CLI option, from which the hook URLs are computed. -In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems +Currently, there are two hooks, each computed by appending the name to the provided control plane URL prefix: + +- `notify-attach`, called whenever attachment for pageservers changes +- `notify-safekeepers`, called whenever attachment for safekeepers changes + +If the hooks require JWT auth, the token may be provided with `--control-plane-jwt-token`. +The hooks will be invoked with a `PUT` request. + +In the Neon cloud service, these hooks are implemented by Neon's internal cloud control plane. In `neon_local` systems, the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling the compute hook. 
-When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated: -the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience. +When implementing an on-premise Neon deployment, you must implement a service that handles the compute hooks. This is not complicated. + +### `notify-attach` body + +The `notify-attach` request body follows the format of the `ComputeHookNotifyRequest` structure, provided below for convenience. ``` struct ComputeHookNotifyRequestShard { @@ -128,15 +138,15 @@ When a notification is received: 1. Modify postgres configuration for this tenant: - - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The + - set `neon.pageserver_connstring` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The shards identified by `NodeId` must be converted to the address+port of the node. - - if stripe_size is not None, set `neon.stripe_size` to this value + - if stripe_size is not None, set `neon.shard_stripe_size` to this value 2. Send SIGHUP to postgres to reload configuration 3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller will retry the notification until it succeeds.. -### Example notification body +Example body: ``` { @@ -148,3 +158,34 @@ When a notification is received: ], } ``` + +### `notify-safekeepers` body + +The `notify-safekeepers` request body follows the format of the `SafekeepersNotifyRequest` structure, provided below for convenience. + +``` +pub struct SafekeeperInfo { + pub id: NodeId, + pub hostname: String, +} + +pub struct SafekeepersNotifyRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub generation: u32, + pub safekeepers: Vec, +} +``` + +When a notification is received: + +1. 
Modify postgres configuration for this tenant: + + - set `neon.safekeeper_connstrings` to an array of postgres connection strings to safekeepers according to the `safekeepers` list. The + safekeepers identified by `NodeId` must be converted to the address+port of the respective safekeeper. + The hostname is provided for debugging purposes, so we reserve the right to change how we pass it. + - set `neon.safekeepers_generation` to the provided `generation` value. + +2. Send SIGHUP to postgres to reload configuration +3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller + will retry the notification until it succeeds. \ No newline at end of file diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index 0d1618c1b2..81b0cd19a1 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true chrono.workspace = true +indexmap.workspace = true jsonwebtoken.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 3fbdfcf83f..d88451c549 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -30,3 +30,9 @@ pub struct SetRoleGrantsRequest { pub privileges: Vec, pub role: PgIdent, } + +/// Request of the /configure_telemetry API +#[derive(Debug, Deserialize, Serialize)] +pub struct ConfigureTelemetryRequest { + pub logs_export_host: Option, +} diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 3300fbf7dd..c8f6019c5c 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -139,6 +139,7 @@ pub struct ComputeCtlConfig { /// Set of JSON web keys that the compute can use to authenticate /// communication from the control plane. 
pub jwks: JwkSet, + pub tls: Option, } impl Default for ComputeCtlConfig { @@ -147,10 +148,17 @@ impl Default for ComputeCtlConfig { jwks: JwkSet { keys: Vec::default(), }, + tls: None, } } } +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TlsConfig { + pub key_path: String, + pub cert_path: String, +} + /// Response of the `/computes/{compute_id}/spec` control-plane API. #[derive(Deserialize, Debug)] pub struct ControlPlaneSpecResponse { diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 77f2e1e631..11615b73a1 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -5,12 +5,15 @@ //! and connect it to the storage nodes. use std::collections::HashMap; +use indexmap::IndexMap; use regex::Regex; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use crate::responses::TlsConfig; + /// String type alias representing Postgres identifier and /// intended to be used for DB / role names. pub type PgIdent = String; @@ -125,7 +128,7 @@ pub struct ComputeSpec { // information about available remote extensions pub remote_extensions: Option, - pub pgbouncer_settings: Option>, + pub pgbouncer_settings: Option>, // Stripe size for pageserver sharding, in pages #[serde(default)] @@ -176,8 +179,8 @@ pub enum ComputeFeature { /// track short-lived connections as user activity. ActivityMonitorExperimental, - /// Pre-install and initialize anon extension for every database in the cluster - AnonExtension, + /// Allow to configure rsyslog for Postgres logs export + PostgresLogsExport, /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. 
See unit test @@ -357,6 +360,9 @@ pub struct LocalProxySpec { #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub jwks: Option>, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub tls: Option, } #[derive(Clone, Debug, Deserialize, Serialize)] diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index ccd015ad19..37de24be5b 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -208,7 +208,6 @@ ], "remote_extensions": { "library_index": { - "anon": "anon", "postgis-3": "postgis", "libpgrouting-3.4": "postgis", "postgis_raster-3": "postgis", @@ -217,12 +216,6 @@ "address_standardizer-3": "postgis" }, "extension_data": { - "anon": { - "archive_path": "5834329303/v15/extensions/anon.tar.zst", - "control_data": { - "anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n" - } - }, "postgis": { "archive_path": "5834329303/v15/extensions/postgis.tar.zst", "control_data": { @@ -238,7 +231,6 @@ } }, "custom_extensions": [ - "anon" ], "public_extensions": [ "postgis" diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index 00b3777a63..331ae4a9b8 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true bytes.workspace = true +camino.workspace = true fail.workspace = true futures.workspace = true hyper0.workspace = true @@ -16,6 +17,7 @@ once_cell.workspace = true pprof.workspace = true regex.workspace = true routerify.workspace = true +rustls-pemfile.workspace = true serde.workspace = true serde_json.workspace = true serde_path_to_error.workspace = true diff --git a/libs/http-utils/src/lib.rs 
b/libs/http-utils/src/lib.rs index dd520ef69b..2bd0fe582f 100644 --- a/libs/http-utils/src/lib.rs +++ b/libs/http-utils/src/lib.rs @@ -4,6 +4,7 @@ pub mod failpoints; pub mod json; pub mod request; pub mod server; +pub mod tls_certs; extern crate hyper0 as hyper; diff --git a/libs/http-utils/src/tls_certs.rs b/libs/http-utils/src/tls_certs.rs new file mode 100644 index 0000000000..db9ec825ed --- /dev/null +++ b/libs/http-utils/src/tls_certs.rs @@ -0,0 +1,21 @@ +use camino::Utf8Path; +use tokio_rustls::rustls::pki_types::{CertificateDer, PrivateKeyDer}; + +pub fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result>> { + let file = std::fs::File::open(filename)?; + let mut reader = std::io::BufReader::new(file); + + Ok(rustls_pemfile::certs(&mut reader).collect::, _>>()?) +} + +pub fn load_private_key(filename: &Utf8Path) -> anyhow::Result> { + let file = std::fs::File::open(filename)?; + let mut reader = std::io::BufReader::new(file); + + let key = rustls_pemfile::private_key(&mut reader)?; + + key.ok_or(anyhow::anyhow!( + "no private key found in {}", + filename.as_str(), + )) +} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index ce7de1e0c7..b12ef65780 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -272,15 +272,16 @@ pub struct TenantConfigToml { /// size exceeds `compaction_upper_limit * checkpoint_distance`. pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, - /// If true, compact down L0 across all tenant timelines before doing regular compaction. + /// If true, compact down L0 across all tenant timelines before doing regular compaction. L0 + /// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true. pub compaction_l0_first: bool, /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only - /// has an effect if `compaction_l0_first` is `true`. 
+ /// has an effect if `compaction_l0_first` is true. Defaults to true. pub compaction_l0_semaphore: bool, - /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, - /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer - /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification - /// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default. + /// Level0 delta layer threshold at which to delay layer flushes such that they take 2x as long, + /// and block on layer flushes during ephemeral layer rolls, for compaction backpressure. This + /// helps compaction keep up with WAL ingestion, and avoids read amplification blowing up. + /// Should be >compaction_threshold. 0 to disable. Defaults to 3x compaction_threshold. pub l0_flush_delay_threshold: Option, /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold /// to avoid deadlock. 0 to disable. Disabled by default. @@ -288,6 +289,8 @@ pub struct TenantConfigToml { /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next /// layer. This is a temporary backpressure mechanism which should be removed once /// l0_flush_{delay,stall}_threshold is fully enabled. + /// + /// TODO: this is no longer enabled, remove it when the config option is no longer set. pub l0_flush_wait_upload: bool, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. @@ -567,13 +570,15 @@ pub mod tenant_conf_defaults { // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So // with this config, we can get a maximum peak compaction usage of 9 GB. pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20; - pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; + // Enable L0 compaction pass and semaphore by default. 
L0 compaction must be responsive to avoid + // read amp. + pub const DEFAULT_COMPACTION_L0_FIRST: bool = true; pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; - pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true; + pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; @@ -584,9 +589,8 @@ pub mod tenant_conf_defaults { pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image - // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we - // want to enable this feature. - pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0; + // layer creation will end immediately. Set to 0 to disable. + pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b1ebad83b1..4a8f75413c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -176,6 +176,39 @@ impl LsnLease { } } +/// Controls the detach ancestor behavior. +/// - When set to `NoAncestorAndReparent`, we will only detach a branch if its ancestor is a root branch. It will automatically reparent any children of the ancestor before and at the branch point. +/// - When set to `MultiLevelAndNoReparent`, we will detach a branch from multiple levels of ancestors, and no reparenting will happen at all. 
+#[derive(Debug, Clone, Copy, Default)] +pub enum DetachBehavior { + #[default] + NoAncestorAndReparent, + MultiLevelAndNoReparent, +} + +impl std::str::FromStr for DetachBehavior { + type Err = &'static str; + + fn from_str(s: &str) -> Result { + match s { + "no_ancestor_and_reparent" => Ok(DetachBehavior::NoAncestorAndReparent), + "multi_level_and_no_reparent" => Ok(DetachBehavior::MultiLevelAndNoReparent), + "v1" => Ok(DetachBehavior::NoAncestorAndReparent), + "v2" => Ok(DetachBehavior::MultiLevelAndNoReparent), + _ => Err("cannot parse detach behavior"), + } + } +} + +impl std::fmt::Display for DetachBehavior { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DetachBehavior::NoAncestorAndReparent => write!(f, "no_ancestor_and_reparent"), + DetachBehavior::MultiLevelAndNoReparent => write!(f, "multi_level_and_no_reparent"), + } + } +} + /// The only [`TenantState`] variants we could be `TenantState::Activating` from. /// /// XXX: We used to have more variants here, but now it's just one, which makes this rather @@ -1225,9 +1258,10 @@ pub struct TimelineInfo { pub last_record_lsn: Lsn, pub prev_record_lsn: Option, - /// Legacy field for compat with control plane. Synonym of `min_readable_lsn`. - /// TODO: remove once control plane no longer reads it. - pub latest_gc_cutoff_lsn: Lsn, + /// Legacy field, retained for one version to enable old storage controller to + /// decode (it was a mandatory field). + #[serde(default, rename = "latest_gc_cutoff_lsn")] + pub _unused: Lsn, /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients. 
/// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead, diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index eca04b1f3d..8386d6e586 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -112,6 +112,16 @@ impl ShardIdentity { } } + /// An unsharded identity with the given stripe size (if non-zero). This is typically used to + /// carry over a stripe size for an unsharded tenant from persistent storage. + pub fn unsharded_with_stripe_size(stripe_size: ShardStripeSize) -> Self { + let mut shard_identity = Self::unsharded(); + if stripe_size.0 > 0 { + shard_identity.stripe_size = stripe_size; + } + shard_identity + } + /// A broken instance of this type is only used for `TenantState::Broken` tenants, /// which are constructed in code paths that don't have access to proper configuration. /// diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index 0ccd8c295f..b6bcabc922 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -135,8 +135,8 @@ impl Type { pub enum Kind { /// A simple type like `VARCHAR` or `INTEGER`. Simple, - /// An enumerated type along with its variants. - Enum(Vec), + /// An enumerated type. + Enum, /// A pseudo-type. Pseudo, /// An array type along with the type of its elements. @@ -146,9 +146,9 @@ pub enum Kind { /// A multirange type along with the type of its elements. Multirange(Type), /// A domain type along with its underlying type. - Domain(Type), - /// A composite type along with information about its fields. - Composite(Vec), + Domain(Oid), + /// A composite type. + Composite(Oid), } /// Information about a field of a composite type. 
diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 08a06163e1..186eb07000 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -19,10 +19,10 @@ use crate::config::{Host, SslMode}; use crate::connection::{Request, RequestMessages}; use crate::query::RowStream; use crate::simple_query::SimpleQueryStream; -use crate::types::{Oid, ToSql, Type}; +use crate::types::{Oid, Type}; use crate::{ - CancelToken, Error, ReadyForQueryStatus, Row, SimpleQueryMessage, Statement, Transaction, - TransactionBuilder, query, simple_query, slice_iter, + CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Statement, Transaction, + TransactionBuilder, query, simple_query, }; pub struct Responses { @@ -54,26 +54,18 @@ impl Responses { /// A cache of type info and prepared statements for fetching type info /// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] -struct CachedTypeInfo { +pub(crate) struct CachedTypeInfo { /// A statement for basic information for a type from its /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its /// fallback). - typeinfo: Option, - /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY). - typeinfo_composite: Option, - /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or - /// its fallback). - typeinfo_enum: Option, + pub(crate) typeinfo: Option, /// Cache of types already looked up. - types: HashMap, + pub(crate) types: HashMap, } pub struct InnerClient { sender: mpsc::UnboundedSender, - cached_typeinfo: Mutex, /// A buffer to use when writing out postgres commands. 
buffer: Mutex, @@ -91,38 +83,6 @@ impl InnerClient { }) } - pub fn typeinfo(&self) -> Option { - self.cached_typeinfo.lock().typeinfo.clone() - } - - pub fn set_typeinfo(&self, statement: &Statement) { - self.cached_typeinfo.lock().typeinfo = Some(statement.clone()); - } - - pub fn typeinfo_composite(&self) -> Option { - self.cached_typeinfo.lock().typeinfo_composite.clone() - } - - pub fn set_typeinfo_composite(&self, statement: &Statement) { - self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone()); - } - - pub fn typeinfo_enum(&self) -> Option { - self.cached_typeinfo.lock().typeinfo_enum.clone() - } - - pub fn set_typeinfo_enum(&self, statement: &Statement) { - self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone()); - } - - pub fn type_(&self, oid: Oid) -> Option { - self.cached_typeinfo.lock().types.get(&oid).cloned() - } - - pub fn set_type(&self, oid: Oid, type_: &Type) { - self.cached_typeinfo.lock().types.insert(oid, type_.clone()); - } - /// Call the given function with a buffer to be used when writing out /// postgres commands. pub fn with_buf(&self, f: F) -> R @@ -142,7 +102,6 @@ pub struct SocketConfig { pub host: Host, pub port: u16, pub connect_timeout: Option, - // pub keepalive: Option, } /// An asynchronous PostgreSQL client. @@ -151,6 +110,7 @@ pub struct SocketConfig { /// through this client object. pub struct Client { inner: Arc, + cached_typeinfo: CachedTypeInfo, socket_config: SocketConfig, ssl_mode: SslMode, @@ -169,9 +129,9 @@ impl Client { Client { inner: Arc::new(InnerClient { sender, - cached_typeinfo: Default::default(), buffer: Default::default(), }), + cached_typeinfo: Default::default(), socket_config, ssl_mode, @@ -189,55 +149,6 @@ impl Client { &self.inner } - /// Executes a statement, returning a vector of the resulting rows. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. 
- /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - pub async fn query( - &self, - statement: Statement, - params: &[&(dyn ToSql + Sync)], - ) -> Result, Error> { - self.query_raw(statement, slice_iter(params)) - .await? - .try_collect() - .await - } - - /// The maximally flexible version of [`query`]. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. 
- /// - /// [`query`]: #method.query - pub async fn query_raw<'a, I>( - &self, - statement: Statement, - params: I, - ) -> Result - where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, - { - query::query(&self.inner, statement, params).await - } - /// Pass text directly to the Postgres backend to allow it to sort out typing itself and /// to save a roundtrip pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result @@ -284,14 +195,10 @@ impl Client { simple_query::batch_execute(self.inner(), query).await } - pub async fn discard_all(&self) -> Result { + pub async fn discard_all(&mut self) -> Result { // clear the prepared statements that are about to be nuked from the postgres session - { - let mut typeinfo = self.inner.cached_typeinfo.lock(); - typeinfo.typeinfo = None; - typeinfo.typeinfo_composite = None; - typeinfo.typeinfo_enum = None; - } + + self.cached_typeinfo.typeinfo = None; self.batch_execute("discard all").await } @@ -359,8 +266,8 @@ impl Client { } /// Query for type information - pub async fn get_type(&self, oid: Oid) -> Result { - crate::prepare::get_type(&self.inner, oid).await + pub(crate) async fn get_type_inner(&mut self, oid: Oid) -> Result { + crate::prepare::get_type(&self.inner, &mut self.cached_typeinfo, oid).await } /// Determines if the connection to the server has already closed. 
diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 31c3d8fa3e..8e28843347 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -22,7 +22,7 @@ pub trait GenericClient: private::Sealed { I::IntoIter: ExactSizeIterator + Sync + Send; /// Query for type information - async fn get_type(&self, oid: Oid) -> Result; + async fn get_type(&mut self, oid: Oid) -> Result; } impl private::Sealed for Client {} @@ -38,8 +38,8 @@ impl GenericClient for Client { } /// Query for type information - async fn get_type(&self, oid: Oid) -> Result { - crate::prepare::get_type(self.inner(), oid).await + async fn get_type(&mut self, oid: Oid) -> Result { + self.get_type_inner(oid).await } } @@ -56,7 +56,7 @@ impl GenericClient for Transaction<'_> { } /// Query for type information - async fn get_type(&self, oid: Oid) -> Result { - self.client().get_type(oid).await + async fn get_type(&mut self, oid: Oid) -> Result { + self.client_mut().get_type(oid).await } } diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index b36d2e5f74..ba13a528f6 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -9,10 +9,10 @@ use log::debug; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; -use crate::client::InnerClient; +use crate::client::{CachedTypeInfo, InnerClient}; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; -use crate::types::{Field, Kind, Oid, Type}; +use crate::types::{Kind, Oid, Type}; use crate::{Column, Error, Statement, query, slice_iter}; pub(crate) const TYPEINFO_QUERY: &str = "\ @@ -23,23 +23,7 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; -const TYPEINFO_ENUM_QUERY: &str = "\ -SELECT enumlabel -FROM pg_catalog.pg_enum -WHERE enumtypid = $1 -ORDER BY 
enumsortorder -"; - -pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ -SELECT attname, atttypid -FROM pg_catalog.pg_attribute -WHERE attrelid = $1 -AND NOT attisdropped -AND attnum > 0 -ORDER BY attnum -"; - -pub async fn prepare( +async fn prepare_typecheck( client: &Arc, name: &'static str, query: &str, @@ -67,7 +51,7 @@ pub async fn prepare( let mut parameters = vec![]; let mut it = parameter_description.parameters(); while let Some(oid) = it.next().map_err(Error::parse)? { - let type_ = get_type(client, oid).await?; + let type_ = Type::from_oid(oid).ok_or_else(Error::unexpected_message)?; parameters.push(type_); } @@ -75,7 +59,7 @@ pub async fn prepare( if let Some(row_description) = row_description { let mut it = row_description.fields(); while let Some(field) = it.next().map_err(Error::parse)? { - let type_ = get_type(client, field.type_oid()).await?; + let type_ = Type::from_oid(field.type_oid()).ok_or_else(Error::unexpected_message)?; let column = Column::new(field.name().to_string(), type_, field); columns.push(column); } @@ -84,15 +68,6 @@ pub async fn prepare( Ok(Statement::new(client, name, parameters, columns)) } -fn prepare_rec<'a>( - client: &'a Arc, - name: &'static str, - query: &'a str, - types: &'a [Type], -) -> Pin> + 'a + Send>> { - Box::pin(prepare(client, name, query, types)) -} - fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { if types.is_empty() { debug!("preparing query {}: {}", name, query); @@ -108,16 +83,20 @@ fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Resu }) } -pub async fn get_type(client: &Arc, oid: Oid) -> Result { +pub async fn get_type( + client: &Arc, + typecache: &mut CachedTypeInfo, + oid: Oid, +) -> Result { if let Some(type_) = Type::from_oid(oid) { return Ok(type_); } - if let Some(type_) = client.type_(oid) { - return Ok(type_); - } + if let Some(type_) = typecache.types.get(&oid) { + return Ok(type_.clone()); + }; - let stmt = 
typeinfo_statement(client).await?; + let stmt = typeinfo_statement(client, typecache).await?; let rows = query::query(client, stmt, slice_iter(&[&oid])).await?; pin_mut!(rows); @@ -136,100 +115,48 @@ pub async fn get_type(client: &Arc, oid: Oid) -> Result( client: &'a Arc, + typecache: &'a mut CachedTypeInfo, oid: Oid, ) -> Pin> + Send + 'a>> { - Box::pin(get_type(client, oid)) + Box::pin(get_type(client, typecache, oid)) } -async fn typeinfo_statement(client: &Arc) -> Result { - if let Some(stmt) = client.typeinfo() { - return Ok(stmt); +async fn typeinfo_statement( + client: &Arc, + typecache: &mut CachedTypeInfo, +) -> Result { + if let Some(stmt) = &typecache.typeinfo { + return Ok(stmt.clone()); } let typeinfo = "neon_proxy_typeinfo"; - let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?; + let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY, &[]).await?; - client.set_typeinfo(&stmt); - Ok(stmt) -} - -async fn get_enum_variants(client: &Arc, oid: Oid) -> Result, Error> { - let stmt = typeinfo_enum_statement(client).await?; - - query::query(client, stmt, slice_iter(&[&oid])) - .await? - .and_then(|row| async move { row.try_get(0) }) - .try_collect() - .await -} - -async fn typeinfo_enum_statement(client: &Arc) -> Result { - if let Some(stmt) = client.typeinfo_enum() { - return Ok(stmt); - } - - let typeinfo = "neon_proxy_typeinfo_enum"; - let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?; - - client.set_typeinfo_enum(&stmt); - Ok(stmt) -} - -async fn get_composite_fields(client: &Arc, oid: Oid) -> Result, Error> { - let stmt = typeinfo_composite_statement(client).await?; - - let rows = query::query(client, stmt, slice_iter(&[&oid])) - .await? 
- .try_collect::>() - .await?; - - let mut fields = vec![]; - for row in rows { - let name = row.try_get(0)?; - let oid = row.try_get(1)?; - let type_ = get_type_rec(client, oid).await?; - fields.push(Field::new(name, type_)); - } - - Ok(fields) -} - -async fn typeinfo_composite_statement(client: &Arc) -> Result { - if let Some(stmt) = client.typeinfo_composite() { - return Ok(stmt); - } - - let typeinfo = "neon_proxy_typeinfo_composite"; - let stmt = prepare_rec(client, typeinfo, TYPEINFO_COMPOSITE_QUERY, &[]).await?; - - client.set_typeinfo_composite(&stmt); + typecache.typeinfo = Some(stmt.clone()); Ok(stmt) } diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs index eecbfc5873..f32603470f 100644 --- a/libs/proxy/tokio-postgres2/src/transaction.rs +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -72,4 +72,9 @@ impl<'a> Transaction<'a> { pub fn client(&self) -> &Client { self.client } + + /// Returns a mutable reference to the underlying `Client`. + pub fn client_mut(&mut self) -> &mut Client { + self.client + } } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 10c703395f..33ff636a79 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -23,6 +23,7 @@ pub struct TimelineCreateRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub mconf: Configuration, + /// In the PG_VERSION_NUM macro format, like 140017. 
pub pg_version: u32, pub system_id: Option, // By default WAL_SEGMENT_SIZE @@ -221,6 +222,11 @@ pub struct TimelineMembershipSwitchResponse { pub current_conf: Configuration, } +#[derive(Clone, Copy, Serialize, Deserialize)] +pub struct TimelineDeleteResult { + pub dir_existed: bool, +} + fn lsn_invalid() -> Lsn { Lsn::INVALID } diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index 72f94d61e4..74992a7d03 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -21,7 +21,7 @@ //! .with_writer(std::io::stderr); //! //! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces -//! let otlp_layer = tracing_utils::init_tracing("my_application").await; +//! let otlp_layer = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default()).await; //! //! // Put it all together //! tracing_subscriber::registry() @@ -38,8 +38,12 @@ pub mod http; use opentelemetry::KeyValue; use opentelemetry::trace::TracerProvider; -use tracing::Subscriber; +use opentelemetry_otlp::WithExportConfig; +pub use opentelemetry_otlp::{ExportConfig, Protocol}; +use tracing::level_filters::LevelFilter; +use tracing::{Dispatch, Subscriber}; use tracing_subscriber::Layer; +use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::registry::LookupSpan; /// Set up OpenTelemetry exporter, using configuration from environment variables. @@ -69,19 +73,28 @@ use tracing_subscriber::registry::LookupSpan; /// /// This doesn't block, but is marked as 'async' to hint that this must be called in /// asynchronous execution context. 
-pub async fn init_tracing(service_name: &str) -> Option> +pub async fn init_tracing( + service_name: &str, + export_config: ExportConfig, +) -> Option> where S: Subscriber + for<'span> LookupSpan<'span>, { if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { return None; }; - Some(init_tracing_internal(service_name.to_string())) + Some(init_tracing_internal( + service_name.to_string(), + export_config, + )) } /// Like `init_tracing`, but creates a separate tokio Runtime for the tracing /// tasks. -pub fn init_tracing_without_runtime(service_name: &str) -> Option> +pub fn init_tracing_without_runtime( + service_name: &str, + export_config: ExportConfig, +) -> Option> where S: Subscriber + for<'span> LookupSpan<'span>, { @@ -112,16 +125,22 @@ where )); let _guard = runtime.enter(); - Some(init_tracing_internal(service_name.to_string())) + Some(init_tracing_internal( + service_name.to_string(), + export_config, + )) } -fn init_tracing_internal(service_name: String) -> impl Layer +fn init_tracing_internal(service_name: String, export_config: ExportConfig) -> impl Layer where S: Subscriber + for<'span> LookupSpan<'span>, { - // Sets up exporter from the OTEL_EXPORTER_* environment variables. + // Sets up exporter from the provided [`ExportConfig`] parameter. + // If the endpoint is not specified, it is loaded from the + // OTEL_EXPORTER_OTLP_ENDPOINT environment variable. 
let exporter = opentelemetry_otlp::SpanExporter::builder() .with_http() + .with_export_config(export_config) .build() .expect("could not initialize opentelemetry exporter"); @@ -151,3 +170,51 @@ where pub fn shutdown_tracing() { opentelemetry::global::shutdown_tracer_provider(); } + +pub enum OtelEnablement { + Disabled, + Enabled { + service_name: String, + export_config: ExportConfig, + runtime: &'static tokio::runtime::Runtime, + }, +} + +pub struct OtelGuard { + pub dispatch: Dispatch, +} + +impl Drop for OtelGuard { + fn drop(&mut self) { + shutdown_tracing(); + } +} + +/// Initializes OTEL infrastructure for performance tracing according to the provided configuration +/// +/// Performance tracing is handled by a different [`tracing::Subscriber`]. This function returns +/// an [`OtelGuard`] containing a [`tracing::Dispatch`] associated with a newly created subscriber. +/// Applications should use this dispatch for their performance traces. +/// +/// The lifetime of the guard should match that of the application. On drop, it tears down the +/// OTEL infra. 
+pub fn init_performance_tracing(otel_enablement: OtelEnablement) -> Option { + let otel_subscriber = match otel_enablement { + OtelEnablement::Disabled => None, + OtelEnablement::Enabled { + service_name, + export_config, + runtime, + } => { + let otel_layer = runtime + .block_on(init_tracing(&service_name, export_config)) + .with_filter(LevelFilter::INFO); + let otel_subscriber = tracing_subscriber::registry().with(otel_layer); + let otel_dispatch = Dispatch::new(otel_subscriber); + + Some(otel_dispatch) + } + }; + + otel_subscriber.map(|dispatch| OtelGuard { dispatch }) +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ac44300a51..4180602ac7 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -42,6 +42,7 @@ toml_edit = { workspace = true, features = ["serde"] } tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } +tracing-utils.workspace = true rand.workspace = true scopeguard.workspace = true strum.workspace = true diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 12c620ec87..35f3baaed1 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -49,7 +49,13 @@ pub fn bench_log_slow(c: &mut Criterion) { // performance too. Use a simple noop future that yields once, to avoid any scheduler fast // paths for a ready future. 
if enabled { - b.iter(|| runtime.block_on(log_slow("ready", THRESHOLD, tokio::task::yield_now()))); + b.iter(|| { + runtime.block_on(log_slow( + "ready", + THRESHOLD, + std::pin::pin!(tokio::task::yield_now()), + )) + }); } else { b.iter(|| runtime.block_on(tokio::task::yield_now())); } diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 881f1e765d..0ac8201795 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -165,6 +165,7 @@ pub fn init( }; log_layer.with_filter(rust_log_env_filter()) }); + let r = r.with( TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()), ); @@ -330,37 +331,90 @@ impl std::fmt::Debug for SecretString { /// /// TODO: consider upgrading this to a warning, but currently it fires too often. #[inline] -pub async fn log_slow(name: &str, threshold: Duration, f: impl Future) -> O { - // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and - // won't fit on the stack. - let mut f = Box::pin(f); +pub async fn log_slow(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O +where + F: Future, +{ + monitor_slow_future( + threshold, + threshold, // period = threshold + f, + |MonitorSlowFutureCallback { + ready, + is_slow, + elapsed_total, + elapsed_since_last_callback: _, + }| { + if !is_slow { + return; + } + if ready { + info!( + "slow {name} completed after {:.3}s", + elapsed_total.as_secs_f64() + ); + } else { + info!( + "slow {name} still running after {:.3}s", + elapsed_total.as_secs_f64() + ); + } + }, + ) + .await +} +/// Poll future `fut` to completion, invoking callback `cb` at the given `threshold` and every +/// `period` afterwards, and also unconditionally when the future completes. 
+#[inline] +pub async fn monitor_slow_future( + threshold: Duration, + period: Duration, + mut fut: std::pin::Pin<&mut F>, + mut cb: impl FnMut(MonitorSlowFutureCallback), +) -> O +where + F: Future, +{ let started = Instant::now(); let mut attempt = 1; - + let mut last_cb = started; loop { // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common // case where the timeout doesn't fire. - let deadline = started + attempt * threshold; - if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await { - // NB: we check if we exceeded the threshold even if the timeout never fired, because - // scheduling or execution delays may cause the future to succeed even if it exceeds the - // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid - // false negatives. - let elapsed = started.elapsed(); - if elapsed >= threshold { - info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); - } + let deadline = started + threshold + (attempt - 1) * period; + // TODO: still call the callback if the future panics? Copy how we do it for the page_service flush_in_progress counter. + let res = tokio::time::timeout_at(deadline, &mut fut).await; + let now = Instant::now(); + let elapsed_total = now - started; + cb(MonitorSlowFutureCallback { + ready: res.is_ok(), + is_slow: elapsed_total >= threshold, + elapsed_total, + elapsed_since_last_callback: now - last_cb, + }); + last_cb = now; + if let Ok(output) = res { return output; } - - let elapsed = started.elapsed().as_secs_f64(); - info!("slow {name} still running after {elapsed:.3}s",); - attempt += 1; } } +/// See [`monitor_slow_future`]. +pub struct MonitorSlowFutureCallback { + /// Whether the future completed. If true, there will be no more callbacks. + pub ready: bool, + /// Whether the future is taking `>=` the specified threshold duration to complete. + /// Monotonic: if true in one callback invocation, true in all subsequent ones. 
+ pub is_slow: bool, + /// The time elapsed since the [`monitor_slow_future`] was first polled. + pub elapsed_total: Duration, + /// The time elapsed since the last callback invocation. + /// For the initial callback invocation, the time elapsed since the [`monitor_slow_future`] was first polled. + pub elapsed_since_last_callback: Duration, +} + #[cfg(test)] mod tests { use metrics::IntCounterVec; diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index a372be5044..56d97bf8a9 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -48,8 +48,6 @@ pprof.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true -rustls-pemfile.workspace = true -rustls-pki-types.workspace = true rustls.workspace = true scopeguard.workspace = true send-future.workspace = true @@ -70,6 +68,7 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true +tracing-utils.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index ad8c618b95..d66cc01f7d 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -57,7 +57,8 @@ async fn ingest( tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?; - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools(); let gate = utils::sync::gate::Gate::default(); diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 830fd8a531..508dac231e 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -7,7 +7,7 @@ use http_utils::error::HttpErrorBody; use pageserver_api::models::*; use pageserver_api::shard::TenantShardId; pub 
use reqwest::Body as ReqwestBody; -use reqwest::{Certificate, IntoUrl, Method, StatusCode}; +use reqwest::{Certificate, IntoUrl, Method, StatusCode, Url}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -458,13 +458,21 @@ impl Client { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + behavior: Option, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor", self.mgmt_api_endpoint ); + let mut uri = Url::parse(&uri) + .map_err(|e| Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")))?; - self.request(Method::PUT, &uri, ()) + if let Some(behavior) = behavior { + uri.query_pairs_mut() + .append_pair("detach_behavior", &behavior.to_string()); + } + + self.request(Method::PUT, uri, ()) .await? .json() .await diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs index bd8b54a286..565f66ce1a 100644 --- a/pageserver/compaction/tests/tests.rs +++ b/pageserver/compaction/tests/tests.rs @@ -12,7 +12,7 @@ pub(crate) fn setup_logging() { logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, logging::Output::Stdout, ) - .expect("Failed to init test logging") + .expect("Failed to init test logging"); }); } diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index b426f977cf..c49c8b58df 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -131,7 +131,8 @@ async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> R pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let storage_path = &cmd.path; let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES); - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools(); // Initialize virtual_file (file desriptor cache) and page cache which are needed 
to access layer persistent B-Tree. pageserver::virtual_file::init( diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 05fb35ff09..293c01eff0 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -76,7 +76,8 @@ async fn read_image_file(path: impl AsRef, ctx: &RequestContext) -> Result } pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools(); match cmd { LayerCmd::List { path } => { for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? { @@ -176,7 +177,8 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { ); pageserver::page_cache::init(100); - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error) + .with_scope_debug_tools(); macro_rules! rewrite_closure { ($($summary_ty:tt)*) => {{ diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 72a120a69b..1d81b839a8 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -208,7 +208,8 @@ async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { virtual_file::SyncMode::Sync, ); page_cache::init(100); - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools(); dump_layerfile_from_path(path, true, &ctx).await } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c4af0d5d41..3ab6d79546 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -30,7 +30,6 @@ use pageserver::{ }; use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; -use rustls_pki_types::{CertificateDer, PrivateKeyDer}; use tokio::signal::unix::SignalKind; use 
tokio::time::Instant; use tokio_util::sync::CancellationToken; @@ -111,6 +110,7 @@ fn main() -> anyhow::Result<()> { } else { TracingErrorLayerEnablement::Disabled }; + logging::init( conf.log_format, tracing_error_layer_enablement, @@ -621,8 +621,8 @@ fn start_pageserver( let https_task = match https_listener { Some(https_listener) => { - let certs = load_certs(&conf.ssl_cert_file)?; - let key = load_private_key(&conf.ssl_key_file)?; + let certs = http_utils::tls_certs::load_cert_chain(&conf.ssl_cert_file)?; + let key = http_utils::tls_certs::load_private_key(&conf.ssl_key_file)?; let server_config = rustls::ServerConfig::builder() .with_no_client_auth() @@ -734,25 +734,6 @@ fn start_pageserver( }) } -fn load_certs(filename: &Utf8Path) -> std::io::Result>> { - let file = std::fs::File::open(filename)?; - let mut reader = std::io::BufReader::new(file); - - rustls_pemfile::certs(&mut reader).collect() -} - -fn load_private_key(filename: &Utf8Path) -> anyhow::Result> { - let file = std::fs::File::open(filename)?; - let mut reader = std::io::BufReader::new(file); - - let key = rustls_pemfile::private_key(&mut reader)?; - - key.ok_or(anyhow::anyhow!( - "no private key found in {}", - filename.as_str(), - )) -} - async fn create_remote_storage_client( conf: &'static PageServerConf, ) -> anyhow::Result { diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index e2a84d0c24..d2caf030df 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -134,6 +134,9 @@ pub(crate) enum Scope { UnitTest { io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics, }, + DebugTools { + io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics, + }, } static GLOBAL_IO_SIZE_METRICS: Lazy = @@ -195,6 +198,12 @@ impl Scope { io_size_metrics: &GLOBAL_IO_SIZE_METRICS, } } + + pub(crate) fn new_debug_tools() -> Self { + Scope::DebugTools { + io_size_metrics: &GLOBAL_IO_SIZE_METRICS, + } + } } /// The kind of access to the page cache. 
@@ -435,6 +444,12 @@ impl RequestContext { .build() } + pub fn with_scope_debug_tools(&self) -> Self { + RequestContextBuilder::new(TaskKind::DebugTool) + .scope(Scope::new_debug_tools()) + .build() + } + pub fn task_kind(&self) -> TaskKind { self.task_kind } @@ -486,6 +501,7 @@ impl RequestContext { Scope::SecondaryTenant { io_size_metrics } => io_size_metrics, #[cfg(test)] Scope::UnitTest { io_size_metrics } => io_size_metrics, + Scope::DebugTools { io_size_metrics } => io_size_metrics, } } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 0fb9a240d5..e799efcce3 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1079,7 +1079,6 @@ components: - last_record_lsn - disk_consistent_lsn - state - - latest_gc_cutoff_lsn properties: timeline_id: type: string @@ -1123,9 +1122,6 @@ components: min_readable_lsn: type: string format: hex - latest_gc_cutoff_lsn: - type: string - format: hex applied_gc_cutoff_lsn: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e5848bfd25..e8a32ca1ef 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -28,9 +28,9 @@ use hyper::{Body, Request, Response, StatusCode, Uri, header}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; use pageserver_api::models::{ - DownloadRemoteLayersTaskSpawnRequest, IngestAuxFilesRequest, ListAuxFilesRequest, - LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, LsnLeaseRequest, - OffloadedTimelineInfo, PageTraceEvent, ShardParameters, StatusResponse, + DetachBehavior, DownloadRemoteLayersTaskSpawnRequest, IngestAuxFilesRequest, + ListAuxFilesRequest, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, + LsnLeaseRequest, OffloadedTimelineInfo, PageTraceEvent, ShardParameters, StatusResponse, TenantConfigPatchRequest, TenantConfigRequest, 
TenantDetails, TenantInfo, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantScanRemoteStorageResponse, TenantScanRemoteStorageShard, TenantShardLocation, TenantShardSplitRequest, @@ -460,10 +460,7 @@ async fn build_timeline_info_common( initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), - // Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally - // we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we - // actually trimmed data to), which can pass each other when PITR is changed. - latest_gc_cutoff_lsn: min_readable_lsn, + _unused: Default::default(), // Unused, for legacy decode only min_readable_lsn, applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(), current_logical_size: current_logical_size.size_dont_care_about_accuracy(), @@ -2394,6 +2391,7 @@ async fn timeline_checkpoint_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); + flags |= CompactFlags::NoYield; // run compaction to completion if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? 
{ flags |= CompactFlags::ForceL0Compaction; } @@ -2508,6 +2506,9 @@ async fn timeline_detach_ancestor_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let behavior: Option = parse_query_param(&request, "detach_behavior")?; + + let behavior = behavior.unwrap_or_default(); let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); @@ -2557,7 +2558,7 @@ async fn timeline_detach_ancestor_handler( let ctx = &ctx.with_scope_timeline(&timeline); let progress = timeline - .prepare_to_detach_from_ancestor(&tenant, options, ctx) + .prepare_to_detach_from_ancestor(&tenant, options, behavior, ctx) .await?; // uncomment to allow early as possible Tenant::drop @@ -2572,6 +2573,7 @@ async fn timeline_detach_ancestor_handler( tenant_shard_id, timeline_id, prepared, + behavior, attempt, ctx, ) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index fd90ef8cd7..f7afaae068 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -465,12 +465,40 @@ pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) { pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wait_lsn_seconds", - "Time spent waiting for WAL to arrive", + "Time spent waiting for WAL to arrive. 
Updated on completion of the wait_lsn operation.", CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); +pub(crate) static WAIT_LSN_START_FINISH_COUNTERPAIR: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_wait_lsn_started_count", + "Number of wait_lsn operations started.", + "pageserver_wait_lsn_finished_count", + "Number of wait_lsn operations finished.", + &["tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub(crate) static WAIT_LSN_IN_PROGRESS_MICROS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_wait_lsn_in_progress_micros", + "Time spent waiting for WAL to arrive, by timeline_id. Updated periodically while waiting.", + &["tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_wait_lsn_in_progress_micros_global", + "Time spent waiting for WAL to arrive, globally. Updated periodically while waiting." 
+ ) + .expect("failed to define a metric") +}); + static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { register_gauge_vec!( "pageserver_flush_wait_upload_seconds", @@ -2830,7 +2858,6 @@ impl StorageTimeMetrics { } } -#[derive(Debug)] pub(crate) struct TimelineMetrics { tenant_id: String, shard_id: String, @@ -2863,6 +2890,8 @@ pub(crate) struct TimelineMetrics { pub valid_lsn_lease_count_gauge: UIntGauge, pub wal_records_received: IntCounter, pub storage_io_size: StorageIoSizeMetrics, + pub wait_lsn_in_progress_micros: GlobalAndPerTenantIntCounter, + pub wait_lsn_start_finish_counterpair: IntCounterPair, shutdown: std::sync::atomic::AtomicBool, } @@ -3000,6 +3029,17 @@ impl TimelineMetrics { let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id); + let wait_lsn_in_progress_micros = GlobalAndPerTenantIntCounter { + global: WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS.clone(), + per_tenant: WAIT_LSN_IN_PROGRESS_MICROS + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(), + }; + + let wait_lsn_start_finish_counterpair = WAIT_LSN_START_FINISH_COUNTERPAIR + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + TimelineMetrics { tenant_id, shard_id, @@ -3032,6 +3072,8 @@ impl TimelineMetrics { storage_io_size, valid_lsn_lease_count_gauge, wal_records_received, + wait_lsn_in_progress_micros, + wait_lsn_start_finish_counterpair, shutdown: std::sync::atomic::AtomicBool::default(), } } @@ -3224,6 +3266,15 @@ impl TimelineMetrics { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } + let _ = + WAIT_LSN_IN_PROGRESS_MICROS.remove_label_values(&[tenant_id, shard_id, timeline_id]); + + { + let mut res = [Ok(()), Ok(())]; + WAIT_LSN_START_FINISH_COUNTERPAIR + .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id]); + } + let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), tenant_id, @@ -3836,27 
+3887,29 @@ pub mod tokio_epoll_uring { }); } +pub(crate) struct GlobalAndPerTenantIntCounter { + global: IntCounter, + per_tenant: IntCounter, +} + +impl GlobalAndPerTenantIntCounter { + #[inline(always)] + pub(crate) fn inc(&self) { + self.inc_by(1) + } + #[inline(always)] + pub(crate) fn inc_by(&self, n: u64) { + self.global.inc_by(n); + self.per_tenant.inc_by(n); + } +} + pub(crate) mod tenant_throttling { - use metrics::{IntCounter, register_int_counter_vec}; + use metrics::register_int_counter_vec; use once_cell::sync::Lazy; use utils::shard::TenantShardId; - pub(crate) struct GlobalAndPerTenantIntCounter { - global: IntCounter, - per_tenant: IntCounter, - } - - impl GlobalAndPerTenantIntCounter { - #[inline(always)] - pub(crate) fn inc(&self) { - self.inc_by(1) - } - #[inline(always)] - pub(crate) fn inc_by(&self, n: u64) { - self.global.inc_by(n); - self.per_tenant.inc_by(n); - } - } + use super::GlobalAndPerTenantIntCounter; pub(crate) struct Metrics { pub(super) count_accounted_start: GlobalAndPerTenantIntCounter, @@ -4102,6 +4155,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { &CIRCUIT_BREAKERS_BROKEN, &CIRCUIT_BREAKERS_UNBROKEN, &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL, + &WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS, ] .into_iter() .for_each(|c| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f2d2ab05ad..94571cbaaa 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1106,12 +1106,19 @@ impl PageServerHandler { }; // Dispatch the batch to the appropriate request handler. - let (mut handler_results, span) = log_slow( - batch.as_static_str(), - LOG_SLOW_GETPAGE_THRESHOLD, - self.pagestream_dispatch_batched_message(batch, io_concurrency, ctx), - ) - .await?; + let log_slow_name = batch.as_static_str(); + let (mut handler_results, span) = { + // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and + // won't fit on the stack. 
+ let mut boxpinned = + Box::pin(self.pagestream_dispatch_batched_message(batch, io_concurrency, ctx)); + log_slow( + log_slow_name, + LOG_SLOW_GETPAGE_THRESHOLD, + boxpinned.as_mut(), + ) + .await? + }; // We purposefully don't count flush time into the smgr operation timer. // diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 62e1cdac0c..55b5704d67 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5754,7 +5754,7 @@ pub(crate) mod harness { logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, logging::Output::Stdout, ) - .expect("Failed to init test logging") + .expect("Failed to init test logging"); }); } @@ -6559,7 +6559,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; let mut writer = tline.writer().await; @@ -6576,7 +6580,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; let mut writer = tline.writer().await; @@ -6593,7 +6601,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; let mut writer = tline.writer().await; @@ -6610,7 +6622,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; assert_eq!( @@ -6693,7 +6709,9 @@ mod tests { timeline.freeze_and_flush().await?; if compact { // this requires timeline to be &Arc - timeline.compact(&cancel, EnumSet::empty(), ctx).await?; + timeline + .compact(&cancel, CompactFlags::NoYield.into(), ctx) + .await?; } // this doesn't really 
need to use the timeline_id target, but it is closer to what it @@ -7020,6 +7038,7 @@ mod tests { child_timeline.freeze_and_flush().await?; let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); child_timeline .compact(&CancellationToken::new(), flags, &ctx) .await?; @@ -7398,7 +7417,9 @@ mod tests { // Perform a cycle of flush, compact, and GC tline.freeze_and_flush().await?; - tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tline + .compact(&cancel, CompactFlags::NoYield.into(), &ctx) + .await?; tenant .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; @@ -7727,6 +7748,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags } else { EnumSet::empty() @@ -7777,7 +7799,9 @@ mod tests { let before_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); - tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tline + .compact(&cancel, CompactFlags::NoYield.into(), &ctx) + .await?; let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); @@ -7893,7 +7917,6 @@ mod tests { Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) } - #[allow(clippy::needless_range_loop)] for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = (blknum * STEP) as u32; @@ -7943,6 +7966,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -8405,6 +8429,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -8472,6 +8497,7 @@ mod tests { let mut flags = EnumSet::new(); 
flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags }, &ctx, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 334fb04604..4308db84e5 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -219,7 +219,11 @@ impl LocationConf { }; let shard = if conf.shard_count == 0 { - ShardIdentity::unsharded() + // NB: carry over the persisted stripe size instead of using the default. This doesn't + // matter for most practical purposes, since unsharded tenants don't use the stripe + // size, but can cause inconsistencies between storcon and Pageserver and cause manual + // splits without `new_stripe_size` to use an unintended stripe size. + ShardIdentity::unsharded_with_stripe_size(ShardStripeSize(conf.shard_stripe_size)) } else { ShardIdentity::new( ShardNumber(conf.shard_number), diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 77f9a3579d..dceae89d1c 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -300,9 +300,8 @@ impl TimelineMetadata { /// Returns true if anything was changed pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { - if let Some(ancestor) = self.body.ancestor_timeline { - assert_eq!(ancestor, branchpoint.0); - } + // Detaching from ancestor now doesn't always detach directly to the direct ancestor, but we + // ensure the LSN is the same. So we don't check the timeline ID. 
if self.body.ancestor_lsn != Lsn(0) { assert_eq!(self.body.ancestor_lsn, branchpoint.1); } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 003f84e640..f02247950f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -14,7 +14,7 @@ use futures::StreamExt; use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::key::Key; -use pageserver_api::models::LocationConfigMode; +use pageserver_api::models::{DetachBehavior, LocationConfigMode}; use pageserver_api::shard::{ ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, }; @@ -1914,6 +1914,7 @@ impl TenantManager { tenant_shard_id: TenantShardId, timeline_id: TimelineId, prepared: PreparedTimelineDetach, + behavior: DetachBehavior, mut attempt: detach_ancestor::Attempt, ctx: &RequestContext, ) -> Result, detach_ancestor::Error> { @@ -1957,7 +1958,14 @@ impl TenantManager { .map_err(Error::NotFound)?; let resp = timeline - .detach_from_ancestor_and_reparent(&tenant, prepared, ctx) + .detach_from_ancestor_and_reparent( + &tenant, + prepared, + attempt.ancestor_timeline_id, + attempt.ancestor_lsn, + behavior, + ctx, + ) .await?; let mut slot_guard = slot_guard; diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 6aba75fa56..854a18cec1 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -18,7 +18,7 @@ use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; -use tracing::{info_span, warn}; +use tracing::warn; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use utils::{backoff, pausable_failpoint}; @@ -230,7 +230,7 @@ async fn download_object( || IoBufferMut::with_capacity(super::BUFFER_SIZE), 
gate.enter().map_err(|_| DownloadError::Cancelled)?, ctx, - info_span!(parent: None, "download_object_buffered_writer", %dst_path), + tracing::info_span!(parent: None, "download_object_buffered_writer", %dst_path), ); // TODO: use vectored write (writev) once supported by tokio-epoll-uring. diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 589ac5ae88..034e5f8c91 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -268,7 +268,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { error_run += 1; let backoff = exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); - log_compaction_error(&err, error_run, backoff, cancel.is_cancelled()); + log_compaction_error(&err, Some((error_run, backoff)), cancel.is_cancelled()); continue; } } @@ -281,10 +281,9 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } } -fn log_compaction_error( +pub(crate) fn log_compaction_error( err: &CompactionError, - error_count: u32, - sleep_duration: Duration, + retry_info: Option<(u32, Duration)>, task_cancelled: bool, ) { use CompactionError::*; @@ -318,14 +317,26 @@ fn log_compaction_error( } }; - match level { - Level::ERROR => { - error!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}") + if let Some((error_count, sleep_duration)) = retry_info { + match level { + Level::ERROR => { + error!( + "Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}" + ) + } + Level::INFO => { + info!( + "Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}" + ) + } + level => unimplemented!("unexpected level {level:?}"), } - Level::INFO => { - info!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}") + } else { + match level { + Level::ERROR => error!("Compaction failed: {err:#}"), + Level::INFO => info!("Compaction failed: {err:#}"), + level => unimplemented!("unexpected 
level {level:?}"), } - level => unimplemented!("unexpected level {level:?}"), } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c78c17c9bb..8ea4549304 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -45,8 +45,9 @@ use pageserver_api::key::{ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}; use pageserver_api::models::{ CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState, + DetachBehavior, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, + EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, + TimelineState, }; use pageserver_api::reltag::{BlockNumber, RelTag}; use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; @@ -67,6 +68,7 @@ use tracing::*; use utils::generation::Generation; use utils::guard_arc_swap::GuardArcSwap; use utils::id::TimelineId; +use utils::logging::{MonitorSlowFutureCallback, monitor_slow_future}; use utils::lsn::{AtomicLsn, Lsn, RecordLsn}; use utils::postgres_client::PostgresClientProtocol; use utils::rate_limit::RateLimit; @@ -87,6 +89,7 @@ use super::remote_timeline_client::index::{GcCompactionState, IndexPart}; use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use super::secondary::heatmap::HeatMapLayer; use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; +use super::tasks::log_compaction_error; use super::upload_queue::NotInitialized; use super::{ AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, @@ -439,6 +442,8 @@ pub struct Timeline { heatmap_layers_downloader: Mutex>, pub(crate) rel_size_v2_status: ArcSwapOption, + + wait_lsn_log_slow: 
tokio::sync::Semaphore, } pub(crate) enum PreviousHeatmap { @@ -1479,17 +1484,67 @@ impl Timeline { WaitLsnTimeout::Default => self.conf.wait_lsn_timeout, }; - let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); + let timer = crate::metrics::WAIT_LSN_TIME.start_timer(); + let start_finish_counterpair_guard = self.metrics.wait_lsn_start_finish_counterpair.guard(); - match self.last_record_lsn.wait_for_timeout(lsn, timeout).await { + let wait_for_timeout = self.last_record_lsn.wait_for_timeout(lsn, timeout); + let wait_for_timeout = std::pin::pin!(wait_for_timeout); + // Use threshold of 1 because even 1 second of wait for ingest is very much abnormal. + let log_slow_threshold = Duration::from_secs(1); + // Use period of 10 to avoid flooding logs during an outage that affects all timelines. + let log_slow_period = Duration::from_secs(10); + let mut logging_permit = None; + let wait_for_timeout = monitor_slow_future( + log_slow_threshold, + log_slow_period, + wait_for_timeout, + |MonitorSlowFutureCallback { + ready, + is_slow, + elapsed_total, + elapsed_since_last_callback, + }| { + self.metrics + .wait_lsn_in_progress_micros + .inc_by(u64::try_from(elapsed_since_last_callback.as_micros()).unwrap()); + if !is_slow { + return; + } + // It's slow, see if we should log it. + // (We limit the logging to one per invocation per timeline to avoid excessive + // logging during an extended broker / networking outage that affects all timelines.) + if logging_permit.is_none() { + logging_permit = self.wait_lsn_log_slow.try_acquire().ok(); + } + if logging_permit.is_none() { + return; + } + // We log it. 
+ if ready { + info!( + "slow wait_lsn completed after {:.3}s", + elapsed_total.as_secs_f64() + ); + } else { + info!( + "slow wait_lsn still running for {:.3}s", + elapsed_total.as_secs_f64() + ); + } + }, + ); + let res = wait_for_timeout.await; + // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo + drop(logging_permit); + drop(start_finish_counterpair_guard); + drop(timer); + match res { Ok(()) => Ok(()), Err(e) => { use utils::seqwait::SeqWaitError::*; match e { Shutdown => Err(WaitLsnError::Shutdown), Timeout => { - // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo - drop(_timer); let walreceiver_status = self.walreceiver_status(); Err(WaitLsnError::Timeout(format!( "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", @@ -1802,18 +1857,23 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> Result { - self.compact_with_options( - cancel, - CompactOptions { - flags, - compact_key_range: None, - compact_lsn_range: None, - sub_compaction: false, - sub_compaction_max_job_size_mb: None, - }, - ctx, - ) - .await + let res = self + .compact_with_options( + cancel, + CompactOptions { + flags, + compact_key_range: None, + compact_lsn_range: None, + sub_compaction: false, + sub_compaction_max_job_size_mb: None, + }, + ctx, + ) + .await; + if let Err(err) = &res { + log_compaction_error(err, None, cancel.is_cancelled()); + } + res } /// Outermost timeline compaction operation; downloads needed layers. @@ -2423,8 +2483,9 @@ impl Timeline { } fn get_l0_flush_delay_threshold(&self) -> Option { - // Disable L0 flushes by default. This and compaction needs further tuning. - const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3 + // By default, delay L0 flushes at 3x the compaction threshold. 
The compaction threshold + // defaults to 10, and L0 compaction is generally able to keep L0 counts below 30. + const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 3; // If compaction is disabled, don't delay. if self.get_compaction_period() == Duration::ZERO { @@ -2452,8 +2513,9 @@ impl Timeline { } fn get_l0_flush_stall_threshold(&self) -> Option { - // Disable L0 stalls by default. In ingest benchmarks, we see image compaction take >10 - // minutes, blocking L0 compaction, and we can't stall L0 flushes for that long. + // Disable L0 stalls by default. Stalling can cause unavailability if L0 compaction isn't + // responsive, and it can e.g. block on other compaction via the compaction semaphore or + // sibling timelines. We need more confidence before enabling this. const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 0; // TODO: default to e.g. 5 // If compaction is disabled, don't stall. @@ -2821,6 +2883,8 @@ impl Timeline { heatmap_layers_downloader: Mutex::new(None), rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status), + + wait_lsn_log_slow: tokio::sync::Semaphore::new(1), }; result.repartition_threshold = @@ -5394,9 +5458,10 @@ impl Timeline { self: &Arc, tenant: &crate::tenant::Tenant, options: detach_ancestor::Options, + behavior: DetachBehavior, ctx: &RequestContext, ) -> Result { - detach_ancestor::prepare(self, tenant, options, ctx).await + detach_ancestor::prepare(self, tenant, behavior, options, ctx).await } /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and @@ -5412,9 +5477,21 @@ impl Timeline { self: &Arc, tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, + behavior: DetachBehavior, ctx: &RequestContext, ) -> Result { - detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await + detach_ancestor::detach_and_reparent( + self, + tenant, + prepared, + ancestor_timeline_id, + ancestor_lsn, + behavior, + ctx, + 
) + .await } /// Final step which unblocks the GC. diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 610c706bd8..5189ae418c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -56,6 +56,7 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; +use crate::tenant::tasks::log_compaction_error; use crate::tenant::timeline::{ DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, drop_rlock, @@ -440,6 +441,20 @@ impl GcCompactionQueue { ctx: &RequestContext, gc_block: &GcBlock, timeline: &Arc, + ) -> Result { + let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await; + if let Err(err) = &res { + log_compaction_error(err, None, cancel.is_cancelled()); + } + res + } + + async fn iteration_inner( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + gc_block: &GcBlock, + timeline: &Arc, ) -> Result { let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { return Err(CompactionError::AlreadyRunning( @@ -3196,7 +3211,11 @@ impl Timeline { } // TODO: move the below part to the loop body - let last_key = last_key.expect("no keys produced during compaction"); + let Some(last_key) = last_key else { + return Err(CompactionError::Other(anyhow!( + "no keys produced during compaction" + ))); + }; stat.on_unique_key_visited(); let retention = self diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 81d94105ee..0014ab9113 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::Context; use http_utils::error::ApiError; +use pageserver_api::models::DetachBehavior; use 
pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::shard::ShardIdentity; use tokio::sync::Semaphore; @@ -32,6 +33,9 @@ pub(crate) enum Error { #[error("too many ancestors")] TooManyAncestors, + #[error("ancestor is not empty")] + AncestorNotEmpty, + #[error("shutting down, please retry later")] ShuttingDown, @@ -89,7 +93,9 @@ impl From for ApiError { fn from(value: Error) -> Self { match value { Error::NoAncestor => ApiError::Conflict(value.to_string()), - Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{value}")), + Error::TooManyAncestors | Error::AncestorNotEmpty => { + ApiError::BadRequest(anyhow::anyhow!("{value}")) + } Error::ShuttingDown => ApiError::ShuttingDown, Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")), Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => { @@ -127,7 +133,7 @@ pub(crate) struct PreparedTimelineDetach { layers: Vec, } -/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments. +// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments. 
#[derive(Debug)] pub(crate) struct Options { pub(crate) rewrite_concurrency: std::num::NonZeroUsize, @@ -147,7 +153,8 @@ impl Default for Options { #[derive(Debug)] pub(crate) struct Attempt { pub(crate) timeline_id: TimelineId, - + pub(crate) ancestor_timeline_id: TimelineId, + pub(crate) ancestor_lsn: Lsn, _guard: completion::Completion, gate_entered: Option, } @@ -167,25 +174,30 @@ impl Attempt { pub(super) async fn prepare( detached: &Arc, tenant: &Tenant, + behavior: DetachBehavior, options: Options, ctx: &RequestContext, ) -> Result { use Error::*; - let Some((ancestor, ancestor_lsn)) = detached + let Some((mut ancestor, mut ancestor_lsn)) = detached .ancestor_timeline .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { + let ancestor_id; + let ancestor_lsn; let still_in_progress = { let accessor = detached.remote_client.initialized_upload_queue()?; // we are safe to inspect the latest uploaded, because we can only witness this after // restart is complete and ancestor is no more. let latest = accessor.latest_uploaded_index_part(); - if latest.lineage.detached_previous_ancestor().is_none() { + let Some((id, lsn)) = latest.lineage.detached_previous_ancestor() else { return Err(NoAncestor); }; + ancestor_id = id; + ancestor_lsn = lsn; latest .gc_blocking @@ -196,7 +208,8 @@ pub(super) async fn prepare( if still_in_progress { // gc is still blocked, we can still reparent and complete. // we are safe to reparent remaining, because they were locked in in the beginning. - let attempt = continue_with_blocked_gc(detached, tenant).await?; + let attempt = + continue_with_blocked_gc(detached, tenant, ancestor_id, ancestor_lsn).await?; // because the ancestor of detached is already set to none, we have published all // of the layers, so we are still "prepared." 
@@ -224,13 +237,34 @@ pub(super) async fn prepare( check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; - if ancestor.ancestor_timeline.is_some() { + if let DetachBehavior::MultiLevelAndNoReparent = behavior { + // If the ancestor has an ancestor, we might be able to fast-path detach it if the current ancestor does not have any data written/used by the detaching timeline. + while let Some(ancestor_of_ancestor) = ancestor.ancestor_timeline.clone() { + if ancestor_lsn != ancestor.ancestor_lsn { + // non-technical requirement; we could flatten still if ancestor LSN does not match but that needs + // us to copy and cut more layers. + return Err(AncestorNotEmpty); + } + // Use the ancestor of the ancestor as the new ancestor (only when the ancestor LSNs are the same) + ancestor_lsn = ancestor.ancestor_lsn; // Get the LSN first before resetting the `ancestor` variable + ancestor = ancestor_of_ancestor; + // TODO: do we still need to check if we don't want to reparent? 
+ check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; + } + } else if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose // not to, at least initially return Err(TooManyAncestors); } - let attempt = start_new_attempt(detached, tenant).await?; + tracing::info!( + "attempt to detach the timeline from the ancestor: {}@{}, behavior={:?}", + ancestor.timeline_id, + ancestor_lsn, + behavior + ); + + let attempt = start_new_attempt(detached, tenant, ancestor.timeline_id, ancestor_lsn).await?; utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking-pausable"); @@ -450,8 +484,13 @@ pub(super) async fn prepare( Ok(Progress::Prepared(attempt, prepared)) } -async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result { - let attempt = obtain_exclusive_attempt(detached, tenant)?; +async fn start_new_attempt( + detached: &Timeline, + tenant: &Tenant, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, +) -> Result { + let attempt = obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn)?; // insert the block in the index_part.json, if not already there. let _dont_care = tenant @@ -466,13 +505,23 @@ async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result Result { +async fn continue_with_blocked_gc( + detached: &Timeline, + tenant: &Tenant, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, +) -> Result { // FIXME: it would be nice to confirm that there is an in-memory version, since we've just // verified there is a persistent one? 
- obtain_exclusive_attempt(detached, tenant) + obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn) } -fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result { +fn obtain_exclusive_attempt( + detached: &Timeline, + tenant: &Tenant, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, +) -> Result { use Error::{OtherTimelineDetachOngoing, ShuttingDown}; // ensure we are the only active attempt for this tenant @@ -493,6 +542,8 @@ fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result, tenant: &Tenant, prepared: PreparedTimelineDetach, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, + behavior: DetachBehavior, _ctx: &RequestContext, ) -> Result { let PreparedTimelineDetach { layers } = prepared; @@ -823,7 +877,30 @@ pub(super) async fn detach_and_reparent( "cannot (detach? reparent)? complete if the operation is not still ongoing" ); - let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) { + let ancestor_to_detach = match detached.ancestor_timeline.as_ref() { + Some(mut ancestor) => { + while ancestor.timeline_id != ancestor_timeline_id { + match ancestor.ancestor_timeline.as_ref() { + Some(found) => { + if ancestor_lsn != ancestor.ancestor_lsn { + return Err(Error::DetachReparent(anyhow::anyhow!( + "cannot find the ancestor timeline to detach from: wrong ancestor lsn" + ))); + } + ancestor = found; + } + None => { + return Err(Error::DetachReparent(anyhow::anyhow!( + "cannot find the ancestor timeline to detach from" + ))); + } + } + } + Some(ancestor) + } + None => None, + }; + let ancestor = match (ancestor_to_detach, recorded_branchpoint) { (Some(ancestor), None) => { assert!( !layers.is_empty(), @@ -896,6 +973,11 @@ pub(super) async fn detach_and_reparent( Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false), }; + if let DetachBehavior::MultiLevelAndNoReparent = behavior { + // Do not reparent if the user requests to behave so. 
+ return Ok(DetachingAndReparenting::Reparented(HashSet::new())); + } + let mut tasks = tokio::task::JoinSet::new(); // Returns a single permit semaphore which will be used to make one reparenting succeed, @@ -1033,6 +1115,11 @@ pub(super) async fn complete( } /// Query against a locked `Tenant::timelines`. +/// +/// A timeline is reparentable if: +/// +/// - It is not the timeline being detached. +/// - It has the same ancestor as the timeline being detached. Note that the ancestor might not be the direct ancestor. fn reparentable_timelines<'a, I>( timelines: I, detached: &'a Arc, diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 0414661a5f..78e42191a4 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -76,6 +76,10 @@ #include "access/xlogrecovery.h" #endif +#if PG_VERSION_NUM < 160000 +typedef PGAlignedBlock PGIOAlignedBlock; +#endif + /* * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API * calls to md.c, and *also* do the calls to the Page Server. 
On every @@ -1803,7 +1807,7 @@ static XLogRecPtr log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, Page page, bool page_std) { - PGAlignedBlock copied_buffer; + PGIOAlignedBlock copied_buffer; memcpy(copied_buffer.data, page, BLCKSZ); return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std); @@ -1820,7 +1824,7 @@ static XLogRecPtr log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, BlockNumber nblocks, Page *pages, bool page_std) { - PGAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; + PGIOAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; BlockNumber blknos[XLR_MAX_BLOCK_ID]; Page pageptrs[XLR_MAX_BLOCK_ID]; int nregistered = 0; @@ -1858,7 +1862,7 @@ log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, static bool PageIsEmptyHeapPage(char *buffer) { - PGAlignedBlock empty_page; + PGIOAlignedBlock empty_page; PageInit((Page) empty_page.data, BLCKSZ, 0); @@ -2774,6 +2778,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdextend(reln, forkNum, blkno, buffer, skipFsync); + /* Update LFC in case of unlogged index build */ + if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) + lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); return; default: @@ -2847,7 +2854,7 @@ static void neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, int nblocks, bool skipFsync) { - const PGAlignedBlock buffer = {0}; + const PGIOAlignedBlock buffer = {0}; int remblocks = nblocks; XLogRecPtr lsn = 0; @@ -2862,6 +2869,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); + /* Update LFC in case of unlogged index build */ + if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) + { + for (int i = 0; 
i < nblocks; i++) + { + lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data); + } + } return; default: @@ -2894,6 +2909,11 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, relpath(reln->smgr_rlocator, forkNum), InvalidBlockNumber))); +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); +#endif + /* Don't log any pages if we're not allowed to do so. */ if (!XLogInsertAllowed()) return; @@ -3389,15 +3409,16 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; - char mdbuf[BLCKSZ]; - char mdbuf_masked[BLCKSZ]; + PGIOAlignedBlock mdbuf; + PGIOAlignedBlock mdbuf_masked; + XLogRecPtr request_lsn = request_lsns.request_lsn; - mdread(reln, forkNum, blkno, mdbuf); + mdread(reln, forkNum, blkno, mdbuf.data); memcpy(pageserver_masked, buffer, BLCKSZ); - memcpy(mdbuf_masked, mdbuf, BLCKSZ); + memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); - if (PageIsNew((Page) mdbuf)) + if (PageIsNew((Page) mdbuf.data)) { if (!PageIsNew((Page) pageserver_masked)) { @@ -3416,41 +3437,41 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf)); + hexdump_page(mdbuf.data)); } - else if (PageGetSpecialSize(mdbuf) == 0) + else if (PageGetSpecialSize(mdbuf.data) == 0) { /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", 
blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } - else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + else if (PageGetSpecialSize(mdbuf.data) == MAXALIGN(sizeof(BTPageOpaqueData))) { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID) { /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } @@ -3542,77 +3563,85 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL - if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; - char mdbuf[BLCKSZ]; - char mdbuf_masked[BLCKSZ]; + PGIOAlignedBlock mdbuf; + PGIOAlignedBlock mdbuf_masked; + XLogRecPtr request_lsn = request_lsns->request_lsn; for (int i = 0; i < nblocks; i++) { + BlockNumber blkno = blocknum + i; + if (!BITMAP_ISSET(read, i)) + continue; + #if PG_MAJORVERSION_NUM >= 17 - mdreadv(reln, forkNum, blkno + i, &mdbuf, 1); + { + void* mdbuffers[1] = { mdbuf.data }; + mdreadv(reln, forknum, blkno, mdbuffers, 1); + } #else - 
mdread(reln, forkNum, blkno + i, mdbuf); + mdread(reln, forknum, blkno, mdbuf.data); #endif - memcpy(pageserver_masked, buffer, BLCKSZ); - memcpy(mdbuf_masked, mdbuf, BLCKSZ); + memcpy(pageserver_masked, buffers[i], BLCKSZ); + memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); - if (PageIsNew((Page) mdbuf)) + if (PageIsNew((Page) mdbuf.data)) { if (!PageIsNew((Page) pageserver_masked)) { neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(buffer)); + hexdump_page(buffers[i])); } } - else if (PageIsNew((Page) buffer)) + else if (PageIsNew((Page) buffers[i])) { neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf)); + hexdump_page(mdbuf.data)); } - else if (PageGetSpecialSize(mdbuf) == 0) + else if (PageGetSpecialSize(mdbuf.data) == 0) { /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } - else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + else if (PageGetSpecialSize(mdbuf.data) == 
MAXALIGN(sizeof(BTPageOpaqueData))) { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID) { /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } @@ -3664,6 +3693,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo switch (reln->smgr_relpersistence) { case 0: +#ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. 
Check if the relation exists locally */ if (mdexists(reln, forknum)) { @@ -3682,6 +3712,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo */ return; } +#endif break; case RELPERSISTENCE_PERMANENT: @@ -3694,6 +3725,9 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo #else mdwrite(reln, forknum, blocknum, buffer, skipFsync); #endif + /* Update LFC in case of unlogged index build */ + if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) + lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); @@ -3732,6 +3766,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, switch (reln->smgr_relpersistence) { case 0: +#ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. Check if the relation exists locally */ if (mdexists(reln, forknum)) { @@ -3747,6 +3782,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, */ return; } +#endif break; case RELPERSISTENCE_PERMANENT: @@ -3755,6 +3791,9 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); + /* Update LFC in case of unlogged index build */ + if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) + lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); @@ -3768,7 +3807,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) - mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); #endif } @@ -4154,8 +4193,10 @@ neon_start_unlogged_build(SMgrRelation reln) * FIXME: should we pass isRedo true to 
create the tablespace dir if it * doesn't exist? Is it needed? */ - if (!IsParallelWorker()) +#ifndef DEBUG_COMPARE_LOCAL + if (!IsParallelWorker()) mdcreate(reln, MAIN_FORKNUM, false); +#endif } /* @@ -4230,8 +4271,10 @@ neon_end_unlogged_build(SMgrRelation reln) forget_cached_relsize(InfoFromNInfoB(rinfob), forknum); mdclose(reln, forknum); +#ifndef DEBUG_COMPARE_LOCAL /* use isRedo == true, so that we drop it immediately */ mdunlink(rinfob, forknum, true); +#endif } } diff --git a/poetry.lock b/poetry.lock index 03aa543b06..7c84b2969b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1491,14 +1491,38 @@ files = [ [[package]] name = "jsonnet" -version = "0.20.0" -description = "Python bindings for Jsonnet - The data templating language" +version = "0.21.0rc2" +description = "Python bindings for Jsonnet - The data templating language " optional = false python-versions = "*" groups = ["main"] -markers = "python_version < \"3.13\"" files = [ - {file = "jsonnet-0.20.0.tar.gz", hash = "sha256:7e770c7bf3a366b97b650a39430450f77612e74406731eb75c5bd59f3f104d4f"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8779ac6820fee44ef736df2baedc3ae93e8cd5d672ee105015c2a47fe627a727"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:99affe8c71e2551465064a8039bb3d1cba27a0b73b2b9ff1b652e06f17d4ea8b"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a9dffb9aa01013d100ddfb7230d1eeb80f2a8eef712b1825a60cad57106d8bd"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cca6c95f2879dcab52650b7aa09a4e82a139b084931b1f6f8c840f834fecc08a"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-win_amd64.whl", hash = "sha256:016d6afdb302a6d00bf3bce6a0c3d9c093b992e33f9bc67c64a868035892258e"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e893ab2c9bf10d8ec9e9b0cee8961879c88d0619cc6d8f75ea284a78e06ae32b"}, + {file = 
"jsonnet-0.21.0rc2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c06b353cd3daa2781e6cd308e05f2f116396376994bcb5f59aaadbc6a752c7f2"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eb2bc8e62b73101329072da322f7e2a1bdb3ac530b94669128d1b480e311e55"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:113766fd0c25620807bcf04d4c739f461c971a4f0e4aece9ba62b4e762de9598"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-win_amd64.whl", hash = "sha256:8dab208c2c2760be60f87d1ceb8b28c86b51ed0e31129a7d90cd5fe890b41225"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:95f5b9dd26a41d6f258d1baa8d22e557051beeed8c52a6202584f1becca9dcb5"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cecc6d76e2b377260fae0a060097c113e6ac361b8f739903ea7f3f5f64cdebdf"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aaa2d18224af7e63872ef4a101e93962505456cf5f5439c3cfc25dad6845f8b1"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2a9063f811554487ed552445e964aeec969cafb266b965029c8d6b091ce47950"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-win_amd64.whl", hash = "sha256:80d171182c169761f744ba50068a4ad35d48e52b91d25bf4c7bb9a72f0a04f71"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3657938f87cb6bc6da20ca631d437b5faf469ca060a7c7def9c8fd2f25a5e06"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3dcebc30cb991b58bc416ee05e9387004d04716d5c0b89714ff042bd069af5c8"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ac52c95482df3ed93c908468ca2f40d4825b6baba284b395ddc47bd663b8c3a"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b34450823a7a1861de892fef9f29de1b4c19e1a79e27d81ffe7e57646cc89d6"}, + {file = 
"jsonnet-0.21.0rc2-cp313-cp313-win_amd64.whl", hash = "sha256:573fd2580e46f4875ec505f1732f9e804b7063cba790342ed6fdafe9a6b30556"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:871ca1411de3626499bda60b330d37f85a592918f99ba4809089bbb8d4f5bfe4"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5d33b25a9c5bf9099100b9b16cb385a2876d891fbe639ee9d476fc75c861903a"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2bac374565c7f89a4675f19fd2b624ed1376519267f4e444f49b6fc0368f6e5"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:fab7bbd88f9159f88a7350701a97bda24de9e3b9eef14c2501ba8b9224160d60"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-win_amd64.whl", hash = "sha256:ed71ffba0fd233a1bca7b0f7be79730792c5383e562a9dc7da152478d9ee1612"}, + {file = "jsonnet-0.21.0rc2.tar.gz", hash = "sha256:2b83ec4b5a771c3732e0972be23a71f042ad2940db6918d3a52aade69bc394fb"}, ] [[package]] @@ -3820,4 +3844,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "010ffce959bb256880ab5a267048c182e4612b3151f9a94e3bf5d3a7807962fe" +content-hash = "715fc8c896dcfa1b15054deeddcdec557ef93af91b26e1c8e4688fe4dbef5296" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index b6e3f03a81..2cec510d82 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -70,8 +70,9 @@ reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true rustc-hash.workspace = true -rustls-pemfile.workspace = true rustls.workspace = true +rustls-native-certs.workspace = true +rustls-pemfile.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true @@ -99,8 +100,7 @@ url.workspace = true urlencoding.workspace = true utils.workspace = true uuid.workspace = true -rustls-native-certs.workspace = true -x509-parser.workspace = true 
+x509-cert.workspace = true redis.workspace = true zerocopy.workspace = true diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index dedd225cba..ee7f6ffcd7 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, bail, ensure}; +use arc_swap::ArcSwapOption; use camino::{Utf8Path, Utf8PathBuf}; use clap::Parser; use compute_api::spec::LocalProxySpec; @@ -27,6 +28,7 @@ use crate::config::{ }; use crate::control_plane::locks::ApiLocks; use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use crate::ext::TaskExt; use crate::http::health_server::AppMetrics; use crate::intern::RoleNameInt; use crate::metrics::{Metrics, ThreadPoolMetrics}; @@ -190,7 +192,11 @@ pub async fn run() -> anyhow::Result<()> { // 2. The config file is written but the signal hook is not yet received // 3. local_proxy completes startup but has no config loaded, despite there being a registerd config. 
refresh_config_notify.notify_one(); - tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); + tokio::spawn(refresh_config_loop( + config, + args.config_path, + refresh_config_notify, + )); maintenance_tasks.spawn(crate::http::health_server::task_main( metrics_listener, @@ -269,7 +275,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig }; Ok(Box::leak(Box::new(ProxyConfig { - tls_config: None, + tls_config: ArcSwapOption::from(None), metric_collection: None, http_config, authentication_config: AuthenticationConfig { @@ -311,14 +317,16 @@ enum RefreshConfigError { Parse(#[from] serde_json::Error), #[error(transparent)] Validate(anyhow::Error), + #[error(transparent)] + Tls(anyhow::Error), } -async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { +async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc) { let mut init = true; loop { rx.notified().await; - match refresh_config_inner(&path).await { + match refresh_config_inner(config, &path).await { Ok(()) => {} // don't log for file not found errors if this is the first time we are checking // for computes that don't use local_proxy, this is not an error. 
@@ -327,6 +335,9 @@ async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { { debug!(error=?e, ?path, "could not read config file"); } + Err(RefreshConfigError::Tls(e)) => { + error!(error=?e, ?path, "could not read TLS certificates"); + } Err(e) => { error!(error=?e, ?path, "could not read config file"); } @@ -336,7 +347,10 @@ async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { } } -async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> { +async fn refresh_config_inner( + config: &ProxyConfig, + path: &Utf8Path, +) -> Result<(), RefreshConfigError> { let bytes = tokio::fs::read(&path).await?; let data: LocalProxySpec = serde_json::from_slice(&bytes)?; @@ -406,5 +420,20 @@ async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> info!("successfully loaded new config"); JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); + if let Some(tls_config) = data.tls { + let tls_config = tokio::task::spawn_blocking(move || { + crate::tls::server_config::configure_tls( + &tls_config.key_path, + &tls_config.cert_path, + None, + false, + ) + }) + .await + .propagate_task_panic() + .map_err(RefreshConfigError::Tls)?; + config.tls_config.store(Some(Arc::new(tls_config))); + } + Ok(()) } diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index eec0bf8f99..feca5ccf88 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::bail; +use arc_swap::ArcSwapOption; use futures::future::Either; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; @@ -563,6 +564,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (None, None) => None, _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; + let tls_config = ArcSwapOption::from(tls_config.map(Arc::new)); let backup_metric_collection_config = config::MetricBackupCollectionConfig { 
remote_storage_config: args.metric_backup_collection_remote_storage.clone(), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1bcd22e98f..ad398c122c 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Ok, bail, ensure}; +use arc_swap::ArcSwapOption; use clap::ValueEnum; use remote_storage::RemoteStorageConfig; @@ -17,7 +18,7 @@ pub use crate::tls::server_config::{TlsConfig, configure_tls}; use crate::types::Host; pub struct ProxyConfig { - pub tls_config: Option, + pub tls_config: ArcSwapOption, pub metric_collection: Option, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 4662860b3f..1156545f34 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -177,7 +177,8 @@ pub(crate) async fn handle_client( let proto = ctx.protocol(); let request_gauge = metrics.connection_requests.guard(proto); - let tls = config.tls_config.as_ref(); + let tls = config.tls_config.load(); + let tls = tls.as_deref(); let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 6f9845fd6e..454fe81357 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -46,7 +46,8 @@ pub async fn init() -> anyhow::Result { .expect("this should be a valid filter directive"), ); - let otlp_layer = tracing_utils::init_tracing("proxy").await; + let otlp_layer = + tracing_utils::init_tracing("proxy", tracing_utils::ExportConfig::default()).await; let json_log_layer = if logfmt == LogFormat::Json { Some(JsonLoggingLayer::new( diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 29834760c0..e5fc0b724b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -30,7 +30,16 @@ pub struct Metrics { 
static SELF: OnceLock = OnceLock::new(); impl Metrics { pub fn install(thread_pool: Arc) { - SELF.set(Metrics::new(thread_pool)) + let mut metrics = Metrics::new(thread_pool); + + metrics.proxy.errors_total.init_all_dense(); + metrics.proxy.redis_errors_total.init_all_dense(); + metrics.proxy.redis_events_count.init_all_dense(); + metrics.proxy.retries_metric.init_all_dense(); + metrics.proxy.invalid_endpoints_total.init_all_dense(); + metrics.proxy.connection_failures_total.init_all_dense(); + + SELF.set(metrics) .ok() .expect("proxy metrics must not be installed more than once"); } diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 955f754497..2582e4c069 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -114,7 +114,7 @@ pub(crate) async fn handshake( let mut read_buf = read_buf.reader(); let mut res = Ok(()); - let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config()) + let accept = tokio_rustls::TlsAcceptor::from(tls.pg_config.clone()) .accept_with(raw, |session| { // push the early data to the tls session while !read_buf.get_ref().is_empty() { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 0c6d352600..2e7d332a8b 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -278,7 +278,8 @@ pub(crate) async fn handle_client( let proto = ctx.protocol(); let request_gauge = metrics.connection_requests.guard(proto); - let tls = config.tls_config.as_ref(); + let tls = config.tls_config.load(); + let tls = tls.as_deref(); let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 23b9897155..c100b8d716 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -10,7 +10,7 @@ use crate::config::ComputeConfig; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::{Direction, Metrics, 
NumClientConnectionsGuard, NumConnectionRequestsGuard}; use crate::stream::Stream; -use crate::usage_metrics::{Ids, MetricCounterRecorder, TrafficDirection, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] @@ -24,7 +24,6 @@ pub(crate) async fn proxy_pass( let usage_tx = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, - direction: TrafficDirection::Egress, private_link_id, }); @@ -47,6 +46,7 @@ pub(crate) async fn proxy_pass( |cnt| { // Number of bytes the client sent to the compute node (inbound). metrics.get_metric(m_recv).inc_by(cnt as u64); + usage_tx.record_ingress(cnt as u64); }, ); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index e0b7539538..2c3e70138d 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -96,16 +96,18 @@ fn generate_tls_config<'a>( .with_safe_default_protocol_versions() .context("ring should support the default protocol versions")? .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone_key())? 
- .into(); + .with_single_cert(vec![cert.clone()], key.clone_key())?; let mut cert_resolver = CertResolver::new(); cert_resolver.add_cert(key, vec![cert], true)?; let common_names = cert_resolver.get_common_names(); + let config = Arc::new(config); + TlsConfig { - config, + http_config: config.clone(), + pg_config: config, common_names, cert_resolver: Arc::new(cert_resolver), } diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 933204994b..77b548cc43 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -22,7 +22,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::protocol2::ConnectionInfoExtra; use crate::types::{DbName, EndpointCacheKey, RoleName}; -use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; #[derive(Debug, Clone)] pub(crate) struct ConnInfo { @@ -639,11 +639,7 @@ impl Client { (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn metrics( - &self, - direction: TrafficDirection, - ctx: &RequestContext, - ) -> Arc { + pub(crate) fn metrics(&self, ctx: &RequestContext) -> Arc { let aux = &self .inner .as_ref() @@ -659,7 +655,6 @@ impl Client { USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, - direction, private_link_id, }) } diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index bca2d4c165..1c6574e57e 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -19,7 +19,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::protocol2::ConnectionInfoExtra; use crate::types::EndpointCacheKey; -use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; +use 
crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; @@ -265,11 +265,7 @@ impl Client { Self { inner } } - pub(crate) fn metrics( - &self, - direction: TrafficDirection, - ctx: &RequestContext, - ) -> Arc { + pub(crate) fn metrics(&self, ctx: &RequestContext) -> Arc { let aux = &self.inner.aux; let private_link_id = match ctx.extra() { @@ -281,7 +277,6 @@ impl Client { USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, - direction, private_link_id, }) } diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index a7f46cbe58..9c11f32083 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -19,6 +19,7 @@ use std::pin::{Pin, pin}; use std::sync::Arc; use anyhow::Context; +use arc_swap::ArcSwapOption; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; @@ -117,18 +118,7 @@ pub async fn task_main( auth_backend, endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); - let tls_acceptor: Arc = match config.tls_config.as_ref() { - Some(config) => { - let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config()); - // prefer http2, but support http/1.1 - tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; - Arc::new(tls_server_config) - } - None => { - warn!("TLS config is missing"); - Arc::new(NoTls) - } - }; + let tls_acceptor: Arc = Arc::new(&config.tls_config); let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` @@ -216,22 +206,20 @@ pub(crate) type AsyncRW = Pin>; #[async_trait] trait MaybeTlsAcceptor: Send + Sync + 'static { - async fn accept(self: Arc, conn: ChainRW) -> std::io::Result; + async fn accept(&self, conn: ChainRW) -> std::io::Result; } #[async_trait] -impl MaybeTlsAcceptor for 
rustls::ServerConfig { - async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { - Ok(Box::pin(TlsAcceptor::from(self).accept(conn).await?)) - } -} - -struct NoTls; - -#[async_trait] -impl MaybeTlsAcceptor for NoTls { - async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { - Ok(Box::pin(conn)) +impl MaybeTlsAcceptor for &'static ArcSwapOption { + async fn accept(&self, conn: ChainRW) -> std::io::Result { + match &*self.load() { + Some(config) => Ok(Box::pin( + TlsAcceptor::from(config.http_config.clone()) + .accept(conn) + .await?, + )), + None => Ok(Box::pin(conn)), + } } } @@ -449,9 +437,11 @@ async fn request_handler( let testodrome_id = request .headers() .get("X-Neon-Query-ID") - .map(|value| value.to_str().unwrap_or_default().to_string()); + .and_then(|value| value.to_str().ok()) + .map(|s| s.to_string()); if let Some(query_id) = testodrome_id { + info!(parent: &ctx.span(), "testodrome query ID: {query_id}"); ctx.set_testodrome_id(query_id); } @@ -493,6 +483,17 @@ async fn request_handler( ); let span = ctx.span(); + let testodrome_id = request + .headers() + .get("X-Neon-Query-ID") + .and_then(|value| value.to_str().ok()) + .map(|s| s.to_string()); + + if let Some(query_id) = testodrome_id { + info!(parent: &ctx.span(), "testodrome query ID: {query_id}"); + ctx.set_testodrome_id(query_id); + } + sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 612702231f..10e378a18d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -42,7 +42,7 @@ use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; -use crate::usage_metrics::{MetricCounter, MetricCounterRecorder, TrafficDirection}; +use crate::usage_metrics::{MetricCounter, 
MetricCounterRecorder}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -614,7 +614,9 @@ async fn handle_inner( &config.authentication_config, ctx, request.headers(), - config.tls_config.as_ref(), + // todo: race condition? + // we're unlikely to change the common names. + config.tls_config.load().as_deref(), )?; info!( user = conn_info.conn_info.user_info.user.as_str(), @@ -661,6 +663,7 @@ async fn handle_db_inner( let parsed_headers = HttpHeaders::try_parse(headers)?; + let mut request_len = 0; let fetch_and_process_request = Box::pin( async { let body = read_body_with_limit( @@ -669,6 +672,8 @@ async fn handle_db_inner( ) .await?; + request_len = body.len(); + Metrics::get() .proxy .http_conn_content_length_bytes @@ -763,7 +768,7 @@ async fn handle_db_inner( } }; - let metrics = client.metrics(TrafficDirection::Egress, ctx); + let metrics = client.metrics(ctx); let len = json_output.len(); let response = response @@ -779,6 +784,8 @@ async fn handle_db_inner( // count the egress bytes - we miss the TLS and header overhead but oh well... 
// moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); + metrics.record_ingress(request_len as u64); + Metrics::get() .proxy .http_conn_content_length_bytes @@ -836,7 +843,7 @@ async fn handle_auth_broker_inner( .expect("all headers and params received via hyper should be valid for request"); // todo: map body to count egress - let _metrics = client.metrics(TrafficDirection::Egress, ctx); + let _metrics = client.metrics(ctx); Ok(client .inner @@ -860,7 +867,13 @@ impl QueryData { let cancel_token = inner.cancel_token(); let res = match select( - pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)), + pin!(query_to_json( + config, + &mut *inner, + self, + &mut 0, + parsed_headers + )), pin!(cancel.cancelled()), ) .await @@ -944,7 +957,7 @@ impl BatchQueryData { builder = builder.deferrable(true); } - let transaction = builder + let mut transaction = builder .start() .await .inspect_err(|_| { @@ -957,7 +970,7 @@ impl BatchQueryData { let json_output = match query_batch( config, cancel.child_token(), - &transaction, + &mut transaction, self, parsed_headers, ) @@ -1009,7 +1022,7 @@ impl BatchQueryData { async fn query_batch( config: &'static HttpConfig, cancel: CancellationToken, - transaction: &Transaction<'_>, + transaction: &mut Transaction<'_>, queries: BatchQueryData, parsed_headers: HttpHeaders, ) -> Result { @@ -1047,7 +1060,7 @@ async fn query_batch( async fn query_to_json( config: &'static HttpConfig, - client: &T, + client: &mut T, data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, @@ -1160,10 +1173,10 @@ enum Discard<'a> { } impl Client { - fn metrics(&self, direction: TrafficDirection, ctx: &RequestContext) -> Arc { + fn metrics(&self, ctx: &RequestContext) -> Arc { match self { - Client::Remote(client) => client.metrics(direction, ctx), - Client::Local(local_client) => local_client.metrics(direction, ctx), + Client::Remote(client) => client.metrics(ctx), + 
Client::Local(local_client) => local_client.metrics(ctx), } } diff --git a/proxy/src/tls/mod.rs b/proxy/src/tls/mod.rs index d6ce6bd9fc..7fe71abf48 100644 --- a/proxy/src/tls/mod.rs +++ b/proxy/src/tls/mod.rs @@ -6,7 +6,7 @@ use anyhow::Context; use rustls::pki_types::CertificateDer; use sha2::{Digest, Sha256}; use tracing::{error, info}; -use x509_parser::oid_registry; +use x509_cert::der::{Reader, SliceReader, oid}; /// pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; @@ -41,27 +41,27 @@ pub enum TlsServerEndPoint { impl TlsServerEndPoint { pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { - let sha256_oids = [ + const SHA256_OIDS: &[oid::ObjectIdentifier] = &[ // I'm explicitly not adding MD5 or SHA1 here... They're bad. - oid_registry::OID_SIG_ECDSA_WITH_SHA256, - oid_registry::OID_PKCS1_SHA256WITHRSA, + oid::db::rfc5912::ECDSA_WITH_SHA_256, + oid::db::rfc5912::SHA_256_WITH_RSA_ENCRYPTION, ]; - let pem = x509_parser::parse_x509_certificate(cert) - .context("Failed to parse PEM object from cerficiate")? - .1; + let certificate = SliceReader::new(cert) + .context("Failed to parse cerficiate")? 
+ .decode::() + .context("Failed to parse cerficiate")?; - info!(subject = %pem.subject, "parsing TLS certificate"); + let subject = certificate.tbs_certificate.subject; + info!(%subject, "parsing TLS certificate"); - let reg = oid_registry::OidRegistry::default().with_all_crypto(); - let oid = pem.signature_algorithm.oid(); - let alg = reg.get(oid); - if sha256_oids.contains(oid) { + let oid = certificate.signature_algorithm.oid; + if SHA256_OIDS.contains(&oid) { let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); - info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); + info!(%subject, tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); Ok(Self::Sha256(tls_server_end_point)) } else { - error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding"); + error!(%subject, "unknown channel binding"); Ok(Self::Undefined) } } diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 903c0b712b..eab9940e7d 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -5,21 +5,19 @@ use anyhow::{Context, bail}; use itertools::Itertools; use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; +use x509_cert::der::{Reader, SliceReader}; use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; pub struct TlsConfig { - pub config: Arc, + // unfortunate split since we cannot change the ALPN on demand. + // + pub http_config: Arc, + pub pg_config: Arc, pub common_names: HashSet, pub cert_resolver: Arc, } -impl TlsConfig { - pub fn to_server_config(&self) -> Arc { - self.config.clone() - } -} - /// Configure TLS for the main endpoint. 
pub fn configure_tls( key_path: &str, @@ -71,8 +69,15 @@ pub fn configure_tls( config.key_log = Arc::new(rustls::KeyLogFile::new()); } + let mut http_config = config.clone(); + let mut pg_config = config; + + http_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; + pg_config.alpn_protocols = vec![b"postgresql".to_vec()]; + Ok(TlsConfig { - config: Arc::new(config), + http_config: Arc::new(http_config), + pg_config: Arc::new(pg_config), common_names, cert_resolver, }) @@ -127,11 +132,13 @@ impl CertResolver { let first_cert = &cert_chain[0]; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let pem = x509_parser::parse_x509_certificate(first_cert) - .context("Failed to parse PEM object from cerficiate")? - .1; - let common_name = pem.subject().to_string(); + let certificate = SliceReader::new(first_cert) + .context("Failed to parse cerficiate")? + .decode::() + .context("Failed to parse cerficiate")?; + + let common_name = certificate.tbs_certificate.subject.to_string(); // We need to get the canonical name for this certificate so we can match them against any domain names // seen within the proxy codebase. 
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 004d268fa1..2b27dc5c76 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -44,11 +44,17 @@ const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); pub(crate) struct Ids { pub(crate) endpoint_id: EndpointIdInt, pub(crate) branch_id: BranchIdInt, - pub(crate) direction: TrafficDirection, #[serde(with = "none_as_empty_string")] pub(crate) private_link_id: Option, } +#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] +struct Extra { + #[serde(flatten)] + ids: Ids, + direction: TrafficDirection, +} + mod none_as_empty_string { use serde::Deserialize; use smol_str::SmolStr; @@ -76,18 +82,23 @@ pub(crate) enum TrafficDirection { pub(crate) trait MetricCounterRecorder { /// Record that some bytes were sent from the proxy to the client fn record_egress(&self, bytes: u64); + + /// Record that some bytes were sent from the client to the proxy + fn record_ingress(&self, bytes: u64); + /// Record that some connections were opened fn record_connection(&self, count: usize); } trait MetricCounterReporter { - fn get_metrics(&mut self) -> (u64, usize); - fn move_metrics(&self) -> (u64, usize); + fn get_metrics(&mut self) -> MetricsData; + fn move_metrics(&self) -> MetricsData; } #[derive(Debug)] pub(crate) struct MetricCounter { transmitted: AtomicU64, + received: AtomicU64, opened_connections: AtomicUsize, } @@ -97,6 +108,11 @@ impl MetricCounterRecorder for MetricCounter { self.transmitted.fetch_add(bytes, Ordering::Relaxed); } + /// Record that some bytes were sent from the proxy to the client + fn record_ingress(&self, bytes: u64) { + self.received.fetch_add(bytes, Ordering::Relaxed); + } + /// Record that some connections were opened fn record_connection(&self, count: usize) { self.opened_connections.fetch_add(count, Ordering::Relaxed); @@ -104,29 +120,43 @@ impl MetricCounterRecorder for MetricCounter { } impl MetricCounterReporter for 
MetricCounter { - fn get_metrics(&mut self) -> (u64, usize) { - ( - *self.transmitted.get_mut(), - *self.opened_connections.get_mut(), - ) + fn get_metrics(&mut self) -> MetricsData { + MetricsData { + received: *self.received.get_mut(), + transmitted: *self.transmitted.get_mut(), + connections: *self.opened_connections.get_mut(), + } } - fn move_metrics(&self) -> (u64, usize) { - ( - self.transmitted.swap(0, Ordering::Relaxed), - self.opened_connections.swap(0, Ordering::Relaxed), - ) + + fn move_metrics(&self) -> MetricsData { + MetricsData { + received: self.received.swap(0, Ordering::Relaxed), + transmitted: self.transmitted.swap(0, Ordering::Relaxed), + connections: self.opened_connections.swap(0, Ordering::Relaxed), + } } } +struct MetricsData { + transmitted: u64, + received: u64, + connections: usize, +} + +struct BytesSent { + transmitted: u64, + received: u64, +} + trait Clearable { /// extract the value that should be reported - fn should_report(self: &Arc) -> Option; + fn should_report(self: &Arc) -> Option; /// Determine whether the counter should be cleared from the global map. fn should_clear(self: &mut Arc) -> bool; } impl Clearable for C { - fn should_report(self: &Arc) -> Option { + fn should_report(self: &Arc) -> Option { // heuristic to see if the branch is still open // if a clone happens while we are observing, the heuristic will be incorrect. 
// @@ -139,14 +169,21 @@ impl Clearable for C { // (to avoid sending the same metrics twice) // see the relevant discussion on why to do so even if the status is not success: // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 - let (value, opened) = self.move_metrics(); + let MetricsData { + transmitted, + received, + connections, + } = self.move_metrics(); // Our only requirement is that we report in every interval if there was an open connection // if there were no opened connections since, then we don't need to report - if value == 0 && !is_open && opened == 0 { + if transmitted == 0 && received == 0 && !is_open && connections == 0 { None } else { - Some(value) + Some(BytesSent { + transmitted, + received, + }) } } fn should_clear(self: &mut Arc) -> bool { @@ -154,9 +191,13 @@ impl Clearable for C { let Some(counter) = Arc::get_mut(self) else { return false; }; - let (opened, value) = counter.get_metrics(); + let MetricsData { + transmitted, + received, + connections, + } = counter.get_metrics(); // clear if there's no data to report - value == 0 && opened == 0 + transmitted == 0 && received == 0 && connections == 0 } } @@ -178,6 +219,7 @@ impl Metrics { .entry(ids) .or_insert_with(|| { Arc::new(MetricCounter { + received: AtomicU64::new(0), transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), }) @@ -242,10 +284,10 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result( endpoints: &ClashMap, FastHasher>, -) -> Vec<(Ids, u64)> { +) -> Vec<(Ids, BytesSent)> { let mut metrics_to_clear = Vec::new(); - let metrics_to_send: Vec<(Ids, u64)> = endpoints + let metrics_to_send: Vec<(Ids, BytesSent)> = endpoints .iter() .filter_map(|counter| { let key = counter.key().clone(); @@ -271,26 +313,46 @@ fn collect_and_clear_metrics( } fn create_event_chunks<'a>( - metrics_to_send: &'a [(Ids, u64)], + metrics_to_send: &'a [(Ids, BytesSent)], hostname: &'a str, prev: DateTime, now: DateTime, chunk_size: usize, -) -> 
impl Iterator>> + 'a { +) -> impl Iterator>> + 'a { metrics_to_send .chunks(chunk_size) .map(move |chunk| EventChunk { events: chunk .iter() - .map(|(ids, value)| Event { - kind: EventType::Incremental { - start_time: prev, - stop_time: now, - }, - metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname), - value: *value, - extra: ids.clone(), + .flat_map(|(ids, bytes)| { + [ + Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: bytes.transmitted, + extra: Extra { + ids: ids.clone(), + direction: TrafficDirection::Egress, + }, + }, + Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: bytes.received, + extra: Extra { + ids: ids.clone(), + direction: TrafficDirection::Ingress, + }, + }, + ] }) .collect(), }) @@ -350,7 +412,7 @@ fn create_remote_path_prefix(now: DateTime) -> String { async fn upload_main_events_chunked( client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, - chunk: &EventChunk<'_, Event>, + chunk: &EventChunk<'_, Event>, subchunk_size: usize, ) { // Split into smaller chunks to avoid exceeding the max request size @@ -384,7 +446,7 @@ async fn upload_main_events_chunked( async fn upload_backup_events( storage: Option<&GenericRemoteStorage>, - chunk: &EventChunk<'_, Event>, + chunk: &EventChunk<'_, Event>, path_prefix: &str, cancel: &CancellationToken, ) -> anyhow::Result<()> { @@ -461,7 +523,7 @@ mod tests { #[tokio::test] async fn metrics() { - type Report = EventChunk<'static, Event>; + type Report = EventChunk<'static, Event>; let reports: Arc>> = Arc::default(); let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); @@ -533,7 +595,6 @@ mod tests { let counter = metrics.register(Ids { endpoint_id: (&EndpointId::from("e1")).into(), branch_id: 
(&BranchId::from("b1")).into(), - direction: TrafficDirection::Egress, private_link_id: None, }); @@ -551,13 +612,19 @@ mod tests { .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); - assert_eq!(r[0].events.len(), 1); + assert_eq!(r[0].events.len(), 2); assert_eq!(r[0].events[0].value, 0); + assert_eq!(r[0].events[0].extra.direction, TrafficDirection::Egress); + assert_eq!(r[0].events[1].value, 0); + assert_eq!(r[0].events[1].extra.direction, TrafficDirection::Ingress); pushed_chunks.extend(r); // record egress counter.record_egress(1); + // record ingress + counter.record_ingress(2); + // egress should be observered collect_metrics_iteration( &metrics.endpoints, @@ -572,8 +639,11 @@ mod tests { .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); - assert_eq!(r[0].events.len(), 1); + assert_eq!(r[0].events.len(), 2); assert_eq!(r[0].events[0].value, 1); + assert_eq!(r[0].events[0].extra.direction, TrafficDirection::Egress); + assert_eq!(r[0].events[1].value, 2); + assert_eq!(r[0].events[1].extra.direction, TrafficDirection::Ingress); pushed_chunks.extend(r); // release counter diff --git a/pyproject.toml b/pyproject.toml index e7f5c62bd0..e009b0773e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,8 +48,8 @@ types-jwcrypto = "^1.5.0.20240925" pyyaml = "^6.0.2" types-pyyaml = "^6.0.12.20240917" testcontainers = "^4.9.0" -# Jsonnet doesn't support Python 3.13 yet -jsonnet = { version = "^0.20.0", markers = "python_version < '3.13'" } +# Install a release candidate of `jsonnet`, as it supports Python 3.13 +jsonnet = "^0.21.0-rc2" [tool.poetry.group.dev.dependencies] mypy = "==1.13.0" diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index bb937ad56a..965aa7504b 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -35,8 +35,9 @@ postgres-protocol.workspace = true pprof.workspace = true rand.workspace = true regex.workspace = true -scopeguard.workspace = true reqwest = { 
workspace = true, features = ["json"] } +rustls.workspace = true +scopeguard.workspace = true serde.workspace = true serde_json.workspace = true smallvec.workspace = true @@ -45,10 +46,11 @@ strum_macros.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["fs"] } -tokio-util = { workspace = true } tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-rustls.workspace = true tokio-tar.workspace = true +tokio-util = { workspace = true } tracing.workspace = true url.workspace = true metrics.workspace = true diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 3966aa811f..424cd89221 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -8,7 +8,7 @@ use std::error::Error as _; use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; use safekeeper_api::models::{ - PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, TimelineStatus, }; use utils::id::{NodeId, TenantId, TimelineId}; @@ -81,13 +81,10 @@ impl Client { } } - pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result { - let uri = format!( - "{}/v1/tenant/{}/timeline/{}", - self.mgmt_api_endpoint, req.tenant_id, req.timeline_id - ); + pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result { + let uri = format!("{}/v1/tenant/timeline", self.mgmt_api_endpoint); let resp = self.post(&uri, req).await?; - resp.json().await.map_err(Error::ReceiveBody) + Ok(resp) } pub async fn pull_timeline(&self, req: &PullTimelineRequest) -> Result { @@ -96,11 +93,25 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn exclude_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { 
+ let uri = format!( + "{}/v1/tenant/{}/timeline/{}/exclude", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.put(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_timeline( &self, tenant_id: TenantId, timeline_id: TimelineId, - ) -> Result { + ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}", self.mgmt_api_endpoint, tenant_id, timeline_id @@ -109,6 +120,20 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn bump_timeline_term( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineTermBumpRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/term_bump", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn timeline_status( &self, tenant_id: TenantId, @@ -149,6 +174,14 @@ impl Client { self.request(Method::POST, uri, body).await } + async fn put( + &self, + uri: U, + body: B, + ) -> Result { + self.request(Method::PUT, uri, body).await + } + async fn get(&self, uri: U) -> Result { self.request(Method::GET, uri, ()).await } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 10fc4a4b59..9ca79de179 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -16,10 +16,12 @@ use futures::stream::FuturesUnordered; use futures::{FutureExt, StreamExt}; use metrics::set_build_info_metric; use remote_storage::RemoteStorageConfig; +use reqwest::Certificate; use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, - DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, + DEFAULT_SSL_KEY_FILE, }; use 
safekeeper::{ BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, WAL_SERVICE_RUNTIME, broker, @@ -94,6 +96,9 @@ struct Args { /// Listen http endpoint for management and metrics in the form host:port. #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)] listen_http: String, + /// Listen https endpoint for management and metrics in the form host:port. + #[arg(long, default_value = None)] + listen_https: Option, /// Advertised endpoint for receiving/sending WAL in the form host:port. If not /// specified, listen_pg is used to advertise instead. #[arg(long, default_value = None)] @@ -203,6 +208,15 @@ struct Args { /// and the current position of the reader is smaller than this value. #[arg(long)] max_delta_for_fanout: Option, + /// Path to a file with certificate's private key for https API. + #[arg(long, default_value = DEFAULT_SSL_KEY_FILE)] + ssl_key_file: Utf8PathBuf, + /// Path to a file with a X509 certificate for https API. + #[arg(long, default_value = DEFAULT_SSL_CERT_FILE)] + ssl_cert_file: Utf8PathBuf, + /// Trusted root CA certificate to use in https APIs. + #[arg(long)] + ssl_ca_file: Option, } // Like PathBufValueParser, but allows empty string. @@ -336,12 +350,22 @@ async fn main() -> anyhow::Result<()> { } }; + let ssl_ca_cert = match args.ssl_ca_file.as_ref() { + Some(ssl_ca_file) => { + tracing::info!("Using ssl root CA file: {ssl_ca_file:?}"); + let buf = tokio::fs::read(ssl_ca_file).await?; + Some(Certificate::from_pem(&buf)?) 
+ } + None => None, + }; + let conf = Arc::new(SafeKeeperConf { workdir, my_id: id, listen_pg_addr: args.listen_pg, listen_pg_addr_tenant_only: args.listen_pg_tenant_only, listen_http_addr: args.listen_http, + listen_https_addr: args.listen_https, advertise_pg_addr: args.advertise_pg, availability_zone: args.availability_zone, no_sync: args.no_sync, @@ -368,6 +392,9 @@ async fn main() -> anyhow::Result<()> { eviction_min_resident: args.eviction_min_resident, wal_reader_fanout: args.wal_reader_fanout, max_delta_for_fanout: args.max_delta_for_fanout, + ssl_key_file: args.ssl_key_file, + ssl_cert_file: args.ssl_cert_file, + ssl_ca_cert, }); // initialize sentry if SENTRY_DSN is provided @@ -428,6 +455,17 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { e })?; + let https_listener = match conf.listen_https_addr.as_ref() { + Some(listen_https_addr) => { + info!("starting safekeeper HTTPS service on {}", listen_https_addr); + Some(tcp_listener::bind(listen_https_addr).map_err(|e| { + error!("failed to bind to address {}: {}", listen_https_addr, e); + e + })?) + } + None => None, + }; + let global_timelines = Arc::new(GlobalTimelines::new(conf.clone())); // Register metrics collector for active timelines. 
It's important to do this @@ -501,7 +539,7 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { let http_handle = current_thread_rt .as_ref() .unwrap_or_else(|| HTTP_RUNTIME.handle()) - .spawn(http::task_main( + .spawn(http::task_main_http( conf.clone(), http_listener, global_timelines.clone(), @@ -509,6 +547,19 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { .map(|res| ("HTTP service main".to_owned(), res)); tasks_handles.push(Box::pin(http_handle)); + if let Some(https_listener) = https_listener { + let https_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| HTTP_RUNTIME.handle()) + .spawn(http::task_main_https( + conf.clone(), + https_listener, + global_timelines.clone(), + )) + .map(|res| ("HTTPS service main".to_owned(), res)); + tasks_handles.push(Box::pin(https_handle)); + } + let broker_task_handle = current_thread_rt .as_ref() .unwrap_or_else(|| BROKER_RUNTIME.handle()) diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index f162985ef7..4908863a4b 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -3,10 +3,11 @@ use std::sync::Arc; pub use routes::make_router; pub use safekeeper_api::models; +use tokio_util::sync::CancellationToken; use crate::{GlobalTimelines, SafeKeeperConf}; -pub async fn task_main( +pub async fn task_main_http( conf: Arc, http_listener: std::net::TcpListener, global_timelines: Arc, @@ -14,8 +15,37 @@ pub async fn task_main( let router = make_router(conf, global_timelines) .build() .map_err(|err| anyhow::anyhow!(err))?; - let service = http_utils::RouterService::new(router).unwrap(); - let server = hyper::Server::from_tcp(http_listener)?; - server.serve(service).await?; + + let service = Arc::new( + http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow::anyhow!(err))?, + ); + let server = http_utils::server::Server::new(service, http_listener, None)?; + server.serve(CancellationToken::new()).await?; + Ok(()) // unreachable +} + +pub async fn task_main_https( 
+ conf: Arc, + https_listener: std::net::TcpListener, + global_timelines: Arc, +) -> anyhow::Result<()> { + let certs = http_utils::tls_certs::load_cert_chain(&conf.ssl_cert_file)?; + let key = http_utils::tls_certs::load_private_key(&conf.ssl_key_file)?; + + let server_config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_single_cert(certs, key)?; + + let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + + let router = make_router(conf, global_timelines) + .build() + .map_err(|err| anyhow::anyhow!(err))?; + + let service = Arc::new( + http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow::anyhow!(err))?, + ); + let server = http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?; + server.serve(CancellationToken::new()).await?; Ok(()) // unreachable } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 4f47331c85..3299d77545 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -17,7 +17,8 @@ use hyper::{Body, Request, Response, StatusCode}; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::{ AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TermSwitchApiEntry, - TimelineCopyRequest, TimelineCreateRequest, TimelineStatus, TimelineTermBumpRequest, + TimelineCopyRequest, TimelineCreateRequest, TimelineDeleteResult, TimelineStatus, + TimelineTermBumpRequest, }; use safekeeper_api::{ServerInfo, membership, models}; use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; @@ -32,7 +33,7 @@ use utils::lsn::Lsn; use crate::debug_dump::TimelineDigestRequest; use crate::safekeeper::TermLsn; -use crate::timelines_global_map::{DeleteOrExclude, TimelineDeleteResult}; +use crate::timelines_global_map::DeleteOrExclude; use crate::{ GlobalTimelines, SafeKeeperConf, copy_timeline, debug_dump, patch_control_file, pull_timeline, }; @@ -231,9 +232,14 @@ async fn 
timeline_pull_handler(mut request: Request) -> Result, pub listen_http_addr: String, + pub listen_https_addr: Option, pub advertise_pg_addr: Option, pub availability_zone: Option, pub no_sync: bool, @@ -111,6 +116,9 @@ pub struct SafeKeeperConf { pub eviction_min_resident: Duration, pub wal_reader_fanout: bool, pub max_delta_for_fanout: Option, + pub ssl_key_file: Utf8PathBuf, + pub ssl_cert_file: Utf8PathBuf, + pub ssl_ca_cert: Option, } impl SafeKeeperConf { @@ -127,6 +135,7 @@ impl SafeKeeperConf { listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_pg_addr_tenant_only: None, listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), + listen_https_addr: None, advertise_pg_addr: None, availability_zone: None, remote_storage: None, @@ -155,6 +164,9 @@ impl SafeKeeperConf { eviction_min_resident: Duration::ZERO, wal_reader_fanout: false, max_delta_for_fanout: None, + ssl_key_file: Utf8PathBuf::from(defaults::DEFAULT_SSL_KEY_FILE), + ssl_cert_file: Utf8PathBuf::from(defaults::DEFAULT_SSL_CERT_FILE), + ssl_ca_cert: None, } } } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 7d6ce1269c..dab8142dfb 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -8,6 +8,7 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; +use reqwest::Certificate; use safekeeper_api::Term; use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}; use safekeeper_client::mgmt_api; @@ -392,6 +393,7 @@ pub struct DebugDumpResponse { pub async fn handle_request( request: PullTimelineRequest, sk_auth_token: Option, + ssl_ca_cert: Option, global_timelines: Arc, ) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( @@ -402,9 +404,11 @@ pub async fn handle_request( bail!("Timeline {} already exists", request.timeline_id); } - // TODO(DimasKovas): add 
ssl root CA certificate when implementing safekeeper's - // part of https support (#24836). - let http_client = reqwest::Client::new(); + let mut http_client = reqwest::Client::builder(); + if let Some(ssl_ca_cert) = ssl_ca_cert { + http_client = http_client.add_root_certificate(ssl_ca_cert); + } + let http_client = http_client.build()?; let http_hosts = request.http_hosts.clone(); @@ -441,13 +445,21 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); - pull_timeline(status, safekeeper_host, sk_auth_token, global_timelines).await + pull_timeline( + status, + safekeeper_host, + sk_auth_token, + http_client, + global_timelines, + ) + .await } async fn pull_timeline( status: TimelineStatus, host: String, sk_auth_token: Option, + http_client: reqwest::Client, global_timelines: Arc, ) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); @@ -464,9 +476,6 @@ async fn pull_timeline( let conf = &global_timelines.get_global_config(); let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - // TODO(DimasKovas): add ssl root CA certificate when implementing safekeeper's - // part of https support (#24836). - let http_client = reqwest::Client::new(); let client = Client::new(http_client, host.clone(), sk_auth_token.clone()); // Request stream with basebackup archive. 
let bb_resp = client diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 858dfce807..41abee369e 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -11,9 +11,8 @@ use anyhow::{Context, Result, bail}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use safekeeper_api::membership::Configuration; -use safekeeper_api::models::SafekeeperUtilization; +use safekeeper_api::models::{SafekeeperUtilization, TimelineDeleteResult}; use safekeeper_api::{ServerInfo, membership}; -use serde::Serialize; use tokio::fs; use tracing::*; use utils::crashsafe::{durable_rename, fsync_async_opt}; @@ -579,11 +578,6 @@ impl GlobalTimelines { } } -#[derive(Clone, Copy, Serialize)] -pub struct TimelineDeleteResult { - pub dir_existed: bool, -} - /// Action for delete_or_exclude. #[derive(Clone, Debug)] pub enum DeleteOrExclude { diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 6ce1a9940e..0dfdafcc51 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -152,6 +152,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { my_id: NodeId(os.id() as u64), listen_pg_addr: String::new(), listen_http_addr: String::new(), + listen_https_addr: None, no_sync: false, broker_endpoint: "/".parse::().unwrap(), broker_keepalive_interval: Duration::from_secs(0), @@ -179,6 +180,9 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { eviction_min_resident: Duration::ZERO, wal_reader_fanout: false, max_delta_for_fanout: None, + ssl_key_file: Utf8PathBuf::from(""), + ssl_cert_file: Utf8PathBuf::from(""), + ssl_ca_cert: None, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index b602af362d..5ce4d63d77 100644 --- a/storage_controller/src/compute_hook.rs +++ 
b/storage_controller/src/compute_hook.rs @@ -624,7 +624,16 @@ impl ComputeHook { MaybeSendResult::Transmit((request, lock)) => (request, lock), }; - let result = if let Some(notify_url) = &self.config.compute_hook_url { + let compute_hook_url = if let Some(control_plane_url) = &self.config.control_plane_url { + Some(if control_plane_url.ends_with('/') { + format!("{control_plane_url}notify-attach") + } else { + format!("{control_plane_url}/notify-attach") + }) + } else { + self.config.compute_hook_url.clone() + }; + let result = if let Some(notify_url) = &compute_hook_url { self.do_notify(notify_url, &request, cancel).await } else { self.do_notify_local(&request).await.map_err(|e| { diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index b27804d820..52e3ef5b0a 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -24,7 +24,7 @@ use pageserver_api::controller_api::{ ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, }; use pageserver_api::models::{ - TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, + DetachBehavior, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, }; @@ -525,6 +525,7 @@ async fn handle_tenant_timeline_detach_ancestor( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let behavior: Option = parse_query_param(&req, "detach_behavior")?; check_permissions(&req, Scope::PageServerApi)?; maybe_rate_limit(&req, tenant_id).await; @@ -537,7 +538,7 @@ async fn handle_tenant_timeline_detach_ancestor( }; let res = service - .tenant_timeline_detach_ancestor(tenant_id, timeline_id) + .tenant_timeline_detach_ancestor(tenant_id, timeline_id, behavior) .await?; json_response(StatusCode::OK, res) 
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 46ac1cd7ca..6e3c70c42b 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -71,6 +71,10 @@ struct Cli { #[arg(long)] compute_hook_url: Option, + /// URL to control plane storage API prefix + #[arg(long)] + control_plane_url: Option, + /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, @@ -313,11 +317,13 @@ async fn async_main() -> anyhow::Result<()> { "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" ); } - StrictMode::Strict if args.compute_hook_url.is_none() => { - // Production systems should always have a compute hook set, to prevent falling + StrictMode::Strict + if args.compute_hook_url.is_none() && args.control_plane_url.is_none() => + { + // Production systems should always have a control plane URL set, to prevent falling // back to trying to use neon_local. anyhow::bail!( - "`--compute-hook-url` is not set: this is only permitted in `--dev` mode" + "neither `--compute-hook-url` nor `--control-plane-url` are set: this is only permitted in `--dev` mode" ); } StrictMode::Strict => { @@ -343,6 +349,7 @@ async fn async_main() -> anyhow::Result<()> { control_plane_jwt_token: secrets.control_plane_jwt_token, peer_jwt_token: secrets.peer_jwt_token, compute_hook_url: args.compute_hook_url, + control_plane_url: args.control_plane_url, max_offline_interval: args .max_offline_interval .map(humantime::Duration::into) diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 7fd4f37e7e..05e7aa88c6 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,9 +1,9 @@ use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::models::{ - LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - 
TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, - TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, - TopTenantShardsRequest, TopTenantShardsResponse, + DetachBehavior, LocationConfig, LocationConfigListResponse, PageserverUtilization, + SecondaryProgress, TenantScanRemoteStorageResponse, TenantShardSplitRequest, + TenantShardSplitResponse, TenantWaitLsnRequest, TimelineArchivalConfigRequest, + TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, }; use pageserver_api::shard::TenantShardId; use pageserver_client::BlockUnblock; @@ -252,13 +252,14 @@ impl PageserverClient { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + behavior: Option, ) -> Result { measured_request!( "timeline_detach_ancestor", crate::metrics::Method::Put, &self.node_id_label, self.inner - .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .timeline_detach_ancestor(tenant_shard_id, timeline_id, behavior) .await ) } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 5146fe472e..85d9c574a1 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -967,10 +967,26 @@ impl Persistence { &self, split_tenant_id: TenantId, old_shard_count: ShardCount, + new_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { Box::pin(async move { + // Sanity: child shards must still exist, as we're deleting parent shards + let child_shards_query = tenant_shards + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)); + let child_shards = child_shards_query + .load::(conn) + .await?; + if child_shards.len() != new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected child shard count {} while 
completing split to \ + count {new_shard_count:?} on tenant {split_tenant_id}", + child_shards.len() + ))); + } + // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) @@ -1613,23 +1629,49 @@ pub(crate) struct TenantShardPersistence { } impl TenantShardPersistence { + fn get_shard_count(&self) -> Result { + self.shard_count + .try_into() + .map(ShardCount) + .map_err(|_| ShardConfigError::InvalidCount) + } + + fn get_shard_number(&self) -> Result { + self.shard_number + .try_into() + .map(ShardNumber) + .map_err(|_| ShardConfigError::InvalidNumber) + } + + fn get_stripe_size(&self) -> Result { + self.shard_stripe_size + .try_into() + .map(ShardStripeSize) + .map_err(|_| ShardConfigError::InvalidStripeSize) + } + pub(crate) fn get_shard_identity(&self) -> Result { if self.shard_count == 0 { - Ok(ShardIdentity::unsharded()) + // NB: carry over the stripe size from the persisted record, to avoid consistency check + // failures if the persisted value differs from the default stripe size. The stripe size + // doesn't really matter for unsharded tenants anyway. + Ok(ShardIdentity::unsharded_with_stripe_size( + self.get_stripe_size()?, + )) } else { Ok(ShardIdentity::new( - ShardNumber(self.shard_number as u8), - ShardCount::new(self.shard_count as u8), - ShardStripeSize(self.shard_stripe_size as u32), + self.get_shard_number()?, + self.get_shard_count()?, + self.get_stripe_size()?, )?) 
} } - pub(crate) fn get_tenant_shard_id(&self) -> Result { + pub(crate) fn get_tenant_shard_id(&self) -> anyhow::Result { Ok(TenantShardId { tenant_id: TenantId::from_str(self.tenant_id.as_str())?, - shard_number: ShardNumber(self.shard_number as u8), - shard_count: ShardCount::new(self.shard_count as u8), + shard_number: self.get_shard_number()?, + shard_count: self.get_shard_count()?, }) } } diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index 1533b6c086..b30237e404 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -1,6 +1,5 @@ use safekeeper_api::models::{ - PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, - TimelineStatus, + self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, }; use safekeeper_client::mgmt_api::{Client, Result}; use utils::id::{NodeId, TenantId, TimelineId}; @@ -60,7 +59,7 @@ impl SafekeeperClient { pub(crate) async fn create_timeline( &self, req: &TimelineCreateRequest, - ) -> Result { + ) -> Result { measured_request!( "create_timeline", crate::metrics::Method::Post, @@ -69,11 +68,28 @@ impl SafekeeperClient { ) } + #[allow(unused)] + pub(crate) async fn exclude_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + measured_request!( + "exclude_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .exclude_timeline(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn delete_timeline( &self, tenant_id: TenantId, timeline_id: TimelineId, - ) -> Result { + ) -> Result { measured_request!( "delete_timeline", crate::metrics::Method::Delete, @@ -94,6 +110,23 @@ impl SafekeeperClient { ) } + #[allow(unused)] + pub(crate) async fn bump_timeline_term( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: 
&models::TimelineTermBumpRequest, + ) -> Result { + measured_request!( + "term_bump", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .bump_timeline_term(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a06748abc6..61a6c12f47 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,6 +1,7 @@ pub mod chaos_injector; mod context_iterator; pub(crate) mod safekeeper_reconciler; +mod safekeeper_service; use std::borrow::Cow; use std::cmp::Ordering; @@ -27,15 +28,14 @@ use itertools::Itertools; use pageserver_api::controller_api::{ AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, - SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, - TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, - TenantShardMigrateResponse, + ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, + TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, + TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, + TenantShardMigrateRequest, TenantShardMigrateResponse, }; use pageserver_api::models::{ - self, LocationConfig, LocationConfigListResponse, LocationConfigMode, PageserverUtilization, - SafekeeperInfo, SafekeepersInfo, SecondaryProgress, ShardParameters, TenantConfig, + self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, + PageserverUtilization, SecondaryProgress, ShardParameters, TenantConfig, 
TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, TenantSorting, TenantTimeTravelRequest, @@ -51,18 +51,15 @@ use pageserver_api::upcall_api::{ }; use pageserver_client::{BlockUnblock, mgmt_api}; use reqwest::{Certificate, StatusCode}; -use safekeeper_api::membership::{MemberSet, SafekeeperId}; use safekeeper_api::models::SafekeeperUtilization; -use safekeeper_reconciler::{SafekeeperReconcilers, ScheduleRequest}; +use safekeeper_reconciler::SafekeeperReconcilers; use tokio::sync::TryAcquireError; use tokio::sync::mpsc::error::TrySendError; -use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; -use utils::logging::SecretString; use utils::sync::gate::Gate; use utils::{failpoint_support, pausable_failpoint}; @@ -83,8 +80,8 @@ use crate::peer_client::GlobalObservedState; use crate::persistence::split_state::SplitState; use crate::persistence::{ AbortShardSplitStatus, ControllerPersistence, DatabaseError, DatabaseResult, - MetadataHealthPersistence, Persistence, SafekeeperTimelineOpKind, ShardGenerationState, - TenantFilter, TenantShardPersistence, TimelinePendingOpPersistence, TimelinePersistence, + MetadataHealthPersistence, Persistence, ShardGenerationState, TenantFilter, + TenantShardPersistence, }; use crate::reconciler::{ ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, ReconcilerPriority, @@ -363,6 +360,15 @@ pub struct Config { /// assume it is running in a test environment and try to update neon_local. pub compute_hook_url: Option, + /// Prefix for storage API endpoints of the control plane. We use this prefix to compute + /// URLs that we use to send pageserver and safekeeper attachment locations. 
+ /// If this is None, the compute hook will assume it is running in a test environment + /// and try to invoke neon_local instead. + /// + /// For now, there is also `compute_hook_url` which allows configuration of the pageserver + /// specific endpoint, but it is in the process of being phased out. + pub control_plane_url: Option, + /// Grace period within which a pageserver does not respond to heartbeats, but is still /// considered active. Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. @@ -855,11 +861,9 @@ impl Service { }; tracing::info!("Sending initial heartbeats..."); - // Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime - const SK_TIMEOUT: Duration = Duration::from_secs(5); let (res_ps, res_sk) = tokio::join!( self.heartbeater_ps.heartbeat(Arc::new(nodes_to_heartbeat)), - tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)) + self.heartbeater_sk.heartbeat(all_sks) ); let mut online_nodes = HashMap::new(); @@ -878,7 +882,7 @@ impl Service { } let mut online_sks = HashMap::new(); - if let Ok(Ok(deltas)) = res_sk { + if let Ok(deltas) = res_sk { for (node_id, status) in deltas.0 { match status { SafekeeperState::Available { @@ -1114,10 +1118,9 @@ impl Service { locked.safekeepers.clone() }; - const SK_TIMEOUT: Duration = Duration::from_secs(3); let (res_ps, res_sk) = tokio::join!( self.heartbeater_ps.heartbeat(nodes), - tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(safekeepers)) + self.heartbeater_sk.heartbeat(safekeepers) ); if let Ok(deltas) = res_ps { @@ -1221,7 +1224,7 @@ impl Service { } } } - if let Ok(Ok(deltas)) = res_sk { + if let Ok(deltas) = res_sk { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); for (id, state) in deltas.0 { @@ -1995,21 +1998,41 @@ impl Service { tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len()); 
let mut cleanup = Vec::new(); + let mut mismatched_locations = 0; { let mut locked = self.inner.write().unwrap(); - for (tenant_shard_id, observed_loc) in configs.tenant_shards { + for (tenant_shard_id, reported) in configs.tenant_shards { let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else { cleanup.push(tenant_shard_id); continue; }; - tenant_shard + + let on_record = &mut tenant_shard .observed .locations - .insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); + .entry(node.get_id()) + .or_insert_with(|| ObservedStateLocation { conf: None }) + .conf; + + // If the location reported by the node does not match our observed state, + // then we mark it as uncertain and let the background reconciliation loop + // deal with it. + // + // Note that this also covers net new locations reported by the node. + if *on_record != reported { + mismatched_locations += 1; + *on_record = None; + } } } + if mismatched_locations > 0 { + tracing::info!( + "Set observed state to None for {mismatched_locations} mismatched locations" + ); + } + for tenant_shard_id in cleanup { tracing::info!("Detaching {tenant_shard_id}"); match node @@ -3620,281 +3643,6 @@ impl Service { .await? } - /// Timeline creation on safekeepers - /// - /// Returns `Ok(left)` if the timeline has been created on a quorum of safekeepers, - /// where `left` contains the list of safekeepers that didn't have a successful response. - /// Assumes tenant lock is held while calling this function. 
- async fn tenant_timeline_create_safekeepers_quorum( - &self, - tenant_id: TenantId, - timeline_id: TimelineId, - pg_version: u32, - timeline_persistence: &TimelinePersistence, - ) -> Result, ApiError> { - // If quorum is reached, return if we are outside of a specified timeout - let jwt = self - .config - .safekeeper_jwt_token - .clone() - .map(SecretString::from); - let mut joinset = JoinSet::new(); - - let safekeepers = { - let locked = self.inner.read().unwrap(); - locked.safekeepers.clone() - }; - - let mut members = Vec::new(); - for sk_id in timeline_persistence.sk_set.iter() { - let sk_id = NodeId(*sk_id as u64); - let Some(safekeeper) = safekeepers.get(&sk_id) else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find entry for safekeeper with id {sk_id}" - )))?; - }; - members.push(SafekeeperId { - id: sk_id, - host: safekeeper.skp.host.clone(), - pg_port: safekeeper.skp.port as u16, - }); - } - let mset = MemberSet::new(members).map_err(ApiError::InternalServerError)?; - let mconf = safekeeper_api::membership::Configuration::new(mset); - - let req = safekeeper_api::models::TimelineCreateRequest { - commit_lsn: None, - mconf, - pg_version, - start_lsn: timeline_persistence.start_lsn.0, - system_id: None, - tenant_id, - timeline_id, - wal_seg_size: None, - }; - const SK_CREATE_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); - for sk in timeline_persistence.sk_set.iter() { - let sk_id = NodeId(*sk as u64); - let safekeepers = safekeepers.clone(); - let jwt = jwt.clone(); - let ssl_ca_cert = self.config.ssl_ca_cert.clone(); - let req = req.clone(); - joinset.spawn(async move { - // Unwrap is fine as we already would have returned error above - let sk_p = safekeepers.get(&sk_id).unwrap(); - let res = sk_p - .with_client_retries( - |client| { - let req = req.clone(); - async move { client.create_timeline(&req).await } - }, - &jwt, - &ssl_ca_cert, - 3, - 3, - SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, - 
&CancellationToken::new(), - ) - .await; - (sk_id, sk_p.skp.host.clone(), res) - }); - } - // After we have built the joinset, we now wait for the tasks to complete, - // but with a specified timeout to make sure we return swiftly, either with - // a failure or success. - let reconcile_deadline = tokio::time::Instant::now() + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT; - - // Wait until all tasks finish or timeout is hit, whichever occurs - // first. - let mut reconcile_results = Vec::new(); - loop { - if let Ok(res) = tokio::time::timeout_at(reconcile_deadline, joinset.join_next()).await - { - let Some(res) = res else { break }; - match res { - Ok(res) => { - tracing::info!( - "response from safekeeper id:{} at {}: {:?}", - res.0, - res.1, - res.2 - ); - reconcile_results.push(res); - } - Err(join_err) => { - tracing::info!("join_err for task in joinset: {join_err}"); - } - } - } else { - tracing::info!( - "timeout for creation call after {} responses", - reconcile_results.len() - ); - break; - } - } - - // Now check now if quorum was reached in reconcile_results. - let total_result_count = reconcile_results.len(); - let remaining = reconcile_results - .into_iter() - .filter_map(|res| res.2.is_err().then_some(res.0)) - .collect::>(); - tracing::info!( - "Got {} non-successful responses from initial creation request of total {total_result_count} responses", - remaining.len() - ); - if remaining.len() >= 2 { - // Failure - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "not enough successful reconciliations to reach quorum, please retry: {} errored", - remaining.len() - ))); - } - - Ok(remaining) - } - - /// Create timeline in controller database and on safekeepers. - /// `timeline_info` is result of timeline creation on pageserver. - /// - /// All actions must be idempotent as the call is retried until success. 
It - /// tries to create timeline in the db and on at least majority of - /// safekeepers + queue creation for safekeepers which missed it in the db - /// for infinite retries; after that, call returns Ok. - /// - /// The idea is that once this is reached as long as we have alive majority - /// of safekeepers it is expected to get eventually operational as storcon - /// will be able to seed timeline on nodes which missed creation by making - /// pull_timeline from peers. On the other hand we don't want to fail - /// timeline creation if one safekeeper is down. - async fn tenant_timeline_create_safekeepers( - self: &Arc, - tenant_id: TenantId, - timeline_info: &TimelineInfo, - create_mode: models::TimelineCreateRequestMode, - ) -> Result { - let timeline_id = timeline_info.timeline_id; - let pg_version = timeline_info.pg_version; - // Initially start_lsn is determined by last_record_lsn in pageserver - // response as it does initdb. However, later we persist it and in sk - // creation calls replace with the value from the timeline row if it - // previously existed as on retries in theory endpoint might have - // already written some data and advanced last_record_lsn, while we want - // safekeepers to have consistent start_lsn. - let start_lsn = match create_mode { - models::TimelineCreateRequestMode::Bootstrap { .. } => timeline_info.last_record_lsn, - models::TimelineCreateRequestMode::Branch { .. } => timeline_info.last_record_lsn, - models::TimelineCreateRequestMode::ImportPgdata { .. 
} => { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "import pgdata doesn't specify the start lsn, aborting creation on safekeepers" - )))?; - } - }; - // Choose initial set of safekeepers respecting affinity - let sks = self.safekeepers_for_new_timeline().await?; - let sks_persistence = sks.iter().map(|sk| sk.id.0 as i64).collect::>(); - // Add timeline to db - let mut timeline_persist = TimelinePersistence { - tenant_id: tenant_id.to_string(), - timeline_id: timeline_id.to_string(), - start_lsn: start_lsn.into(), - generation: 0, - sk_set: sks_persistence.clone(), - new_sk_set: None, - cplane_notified_generation: 0, - deleted_at: None, - }; - let inserted = self - .persistence - .insert_timeline(timeline_persist.clone()) - .await?; - if !inserted { - if let Some(existent_persist) = self - .persistence - .get_timeline(tenant_id, timeline_id) - .await? - { - // Replace with what we have in the db, to get stuff like the generation right. - // We do still repeat the http calls to the safekeepers. After all, we could have - // crashed right after the wrote to the DB. 
- timeline_persist = existent_persist; - } else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "insertion said timeline already in db, but looking it up, it was gone" - ))); - } - } - // Create the timeline on a quorum of safekeepers - let remaining = self - .tenant_timeline_create_safekeepers_quorum( - tenant_id, - timeline_id, - pg_version, - &timeline_persist, - ) - .await?; - - // For the remaining safekeepers, take care of their reconciliation asynchronously - for &remaining_id in remaining.iter() { - let pending_op = TimelinePendingOpPersistence { - tenant_id: tenant_id.to_string(), - timeline_id: timeline_id.to_string(), - generation: timeline_persist.generation, - op_kind: crate::persistence::SafekeeperTimelineOpKind::Pull, - sk_id: remaining_id.0 as i64, - }; - tracing::info!("writing pending op for sk id {remaining_id}"); - self.persistence.insert_pending_op(pending_op).await?; - } - if !remaining.is_empty() { - let mut locked = self.inner.write().unwrap(); - for remaining_id in remaining { - let Some(sk) = locked.safekeepers.get(&remaining_id) else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Couldn't find safekeeper with id {remaining_id}" - ))); - }; - let Ok(host_list) = sks - .iter() - .map(|sk| { - Ok(( - sk.id, - locked - .safekeepers - .get(&sk.id) - .ok_or_else(|| { - ApiError::InternalServerError(anyhow::anyhow!( - "Couldn't find safekeeper with id {remaining_id} to pull from" - )) - })? 
- .base_url(), - )) - }) - .collect::>() - else { - continue; - }; - let req = ScheduleRequest { - safekeeper: Box::new(sk.clone()), - host_list, - tenant_id, - timeline_id, - generation: timeline_persist.generation as u32, - kind: crate::persistence::SafekeeperTimelineOpKind::Pull, - }; - locked.safekeeper_reconcilers.schedule_request(self, req); - } - } - - Ok(SafekeepersInfo { - generation: timeline_persist.generation as u32, - safekeepers: sks, - tenant_id, - timeline_id, - }) - } - pub(crate) async fn tenant_timeline_create( self: &Arc, tenant_id: TenantId, @@ -4012,6 +3760,7 @@ impl Service { &self, tenant_id: TenantId, timeline_id: TimelineId, + behavior: Option, ) -> Result { tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",); @@ -4035,6 +3784,7 @@ impl Service { node: Node, jwt: Option, ssl_ca_cert: Option, + behavior: Option, ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { tracing::info!( "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", @@ -4044,7 +3794,7 @@ impl Service { .map_err(|e| passthrough_api_error(&node, e))?; client - .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .timeline_detach_ancestor(tenant_shard_id, timeline_id, behavior) .await .map_err(|e| { use mgmt_api::Error; @@ -4082,6 +3832,7 @@ impl Service { node, self.config.pageserver_jwt_token.clone(), self.config.ssl_ca_cert.clone(), + behavior, )) }) .await?; @@ -4236,7 +3987,8 @@ impl Service { /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// - /// On success, the returned vector contains exactly the same number of elements as the input `locations`. + /// On success, the returned vector contains exactly the same number of elements as the input `locations` + /// and returned element at index `i` is the result for `req_fn(locations[i])`. 
async fn tenant_for_shards( &self, locations: Vec<(TenantShardId, Node)>, @@ -4252,18 +4004,23 @@ impl Service { let mut futs = FuturesUnordered::new(); let mut results = Vec::with_capacity(locations.len()); - for (tenant_shard_id, node) in locations { - futs.push(req_fn(tenant_shard_id, node)); + for (idx, (tenant_shard_id, node)) in locations.into_iter().enumerate() { + let fut = req_fn(tenant_shard_id, node); + futs.push(async move { (idx, fut.await) }); } - while let Some(r) = futs.next().await { - results.push(r?); + while let Some((idx, r)) = futs.next().await { + results.push((idx, r?)); } - Ok(results) + results.sort_by_key(|(idx, _)| *idx); + Ok(results.into_iter().map(|(_, r)| r).collect()) } - /// Concurrently invoke a pageserver API call on many shards at once + /// Concurrently invoke a pageserver API call on many shards at once. + /// + /// The returned Vec has the same length as the `locations` Vec, + /// and returned element at index `i` is the result for `op(locations[i])`. 
pub(crate) async fn tenant_for_shards_api( &self, locations: Vec<(TenantShardId, Node)>, @@ -4280,27 +4037,29 @@ impl Service { let mut futs = FuturesUnordered::new(); let mut results = Vec::with_capacity(locations.len()); - for (tenant_shard_id, node) in locations { + for (idx, (tenant_shard_id, node)) in locations.into_iter().enumerate() { futs.push(async move { - node.with_client_retries( - |client| op(tenant_shard_id, client), - &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, - warn_threshold, - max_retries, - timeout, - cancel, - ) - .await + let r = node + .with_client_retries( + |client| op(tenant_shard_id, client), + &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, + warn_threshold, + max_retries, + timeout, + cancel, + ) + .await; + (idx, r) }); } - while let Some(r) = futs.next().await { - let r = r.unwrap_or(Err(mgmt_api::Error::Cancelled)); - results.push(r); + while let Some((idx, r)) = futs.next().await { + results.push((idx, r.unwrap_or(Err(mgmt_api::Error::Cancelled)))); } - results + results.sort_by_key(|(idx, _)| *idx); + results.into_iter().map(|(_, r)| r).collect() } /// Helper for safely working with the shards in a tenant remotely on pageservers, for example @@ -4577,62 +4336,6 @@ impl Service { status_code } - /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. 
- async fn tenant_timeline_delete_safekeepers( - self: &Arc, - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> Result<(), ApiError> { - let tl = self - .persistence - .get_timeline(tenant_id, timeline_id) - .await?; - let Some(tl) = tl else { - tracing::info!( - "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table, no deletions on safekeepers needed" - ); - return Ok(()); - }; - let all_sks = tl - .new_sk_set - .iter() - .flat_map(|sks| { - sks.iter() - .map(|sk| (*sk, SafekeeperTimelineOpKind::Exclude)) - }) - .chain( - tl.sk_set - .iter() - .map(|v| (*v, SafekeeperTimelineOpKind::Delete)), - ) - .collect::>(); - - // Schedule reconciliations - { - let mut locked = self.inner.write().unwrap(); - for (sk_id, kind) in all_sks { - let sk_id = NodeId(sk_id as u64); - let Some(sk) = locked.safekeepers.get(&sk_id) else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Couldn't find safekeeper with id {sk_id}" - ))); - }; - - let req = ScheduleRequest { - safekeeper: Box::new(sk.clone()), - // we don't use this for this kind, put a dummy value - host_list: Vec::new(), - tenant_id, - timeline_id, - generation: tl.generation as u32, - kind, - }; - locked.safekeeper_reconcilers.schedule_request(self, req); - } - } - Ok(()) - } - /// When you know the TenantId but not a specific shard, and would like to get the node holding shard 0. pub(crate) async fn tenant_shard0_node( &self, @@ -5713,7 +5416,7 @@ impl Service { // it doesn't match, but that requires more retry logic on this side) self.persistence - .complete_shard_split(tenant_id, old_shard_count) + .complete_shard_split(tenant_id, old_shard_count, new_shard_count) .await?; fail::fail_point!("shard-split-post-complete", |_| Err( @@ -7865,6 +7568,9 @@ impl Service { /// At most one tenant will be split per call: the one with the largest max logical size. It /// will split 1 → 8 shards. 
/// + /// An unsharded tenant will get DEFAULT_STRIPE_SIZE, regardless of what its ShardIdentity says. + /// A sharded tenant will retain its stripe size, as splits do not allow changing it. + /// /// TODO: consider splitting based on total logical size rather than max logical size. /// /// TODO: consider spawning multiple splits in parallel: this is only called once every 20 @@ -7910,6 +7616,16 @@ impl Service { "Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}" ); + // Retain the stripe size of sharded tenants, as splits don't allow changing it. Otherwise, + // use DEFAULT_STRIPE_SIZE for unsharded tenants -- their stripe size doesn't really matter, + // and if we change the default stripe size we want to use the new default rather than an + // old, persisted stripe size. + let new_stripe_size = match split_candidate.id.shard_count.count() { + 0 => panic!("invalid shard count 0"), + 1 => Some(ShardParameters::DEFAULT_STRIPE_SIZE), + 2.. => None, + }; + let this = self.clone(); tokio::spawn( async move { @@ -7923,7 +7639,7 @@ impl Service { // because our max shard count is relatively low anyway. This policy // will be adjusted in future once we support higher shard count. new_shard_count: MAX_SHARDS.literal(), - new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE), + new_stripe_size, }, ) .await @@ -8666,168 +8382,6 @@ impl Service { global_observed } - /// Choose safekeepers for the new timeline: 3 in different azs. - pub(crate) async fn safekeepers_for_new_timeline( - &self, - ) -> Result, ApiError> { - let mut all_safekeepers = { - let locked = self.inner.read().unwrap(); - locked - .safekeepers - .iter() - .filter_map(|sk| { - if sk.1.scheduling_policy() != SkSchedulingPolicy::Active { - // If we don't want to schedule stuff onto the safekeeper, respect that. 
- return None; - } - let utilization_opt = if let SafekeeperState::Available { - last_seen_at: _, - utilization, - } = sk.1.availability() - { - Some(utilization) - } else { - // non-available safekeepers still get a chance for new timelines, - // but put them last in the list. - None - }; - let info = SafekeeperInfo { - hostname: sk.1.skp.host.clone(), - id: NodeId(sk.1.skp.id as u64), - }; - Some((utilization_opt, info, sk.1.skp.availability_zone_id.clone())) - }) - .collect::>() - }; - all_safekeepers.sort_by_key(|sk| { - ( - sk.0.as_ref() - .map(|ut| ut.timeline_count) - .unwrap_or(u64::MAX), - // Use the id to decide on equal scores for reliability - sk.1.id.0, - ) - }); - let mut sks = Vec::new(); - let mut azs = HashSet::new(); - for (_sk_util, sk_info, az_id) in all_safekeepers.iter() { - if !azs.insert(az_id) { - continue; - } - sks.push(sk_info.clone()); - if sks.len() == 3 { - break; - } - } - if sks.len() == 3 { - Ok(sks) - } else { - Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find three safekeepers in different AZs for new timeline" - ))) - } - } - - pub(crate) async fn safekeepers_list( - &self, - ) -> Result, DatabaseError> { - let locked = self.inner.read().unwrap(); - let mut list = locked - .safekeepers - .iter() - .map(|sk| sk.1.describe_response()) - .collect::, _>>()?; - list.sort_by_key(|v| v.id); - Ok(list) - } - - pub(crate) async fn get_safekeeper( - &self, - id: i64, - ) -> Result { - let locked = self.inner.read().unwrap(); - let sk = locked - .safekeepers - .get(&NodeId(id as u64)) - .ok_or(diesel::result::Error::NotFound)?; - sk.describe_response() - } - - pub(crate) async fn upsert_safekeeper( - &self, - record: crate::persistence::SafekeeperUpsert, - ) -> Result<(), ApiError> { - let node_id = NodeId(record.id as u64); - let use_https = self.config.use_https_safekeeper_api; - - if use_https && record.https_port.is_none() { - return Err(ApiError::PreconditionFailed( - format!( - "cannot upsert safekeeper {node_id}: 
\ - https is enabled, but https port is not specified" - ) - .into(), - )); - } - - self.persistence.safekeeper_upsert(record.clone()).await?; - { - let mut locked = self.inner.write().unwrap(); - let mut safekeepers = (*locked.safekeepers).clone(); - match safekeepers.entry(node_id) { - std::collections::hash_map::Entry::Occupied(mut entry) => entry - .get_mut() - .update_from_record(record) - .expect("all preconditions should be checked before upsert to database"), - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert( - Safekeeper::from_persistence( - crate::persistence::SafekeeperPersistence::from_upsert( - record, - SkSchedulingPolicy::Pause, - ), - CancellationToken::new(), - use_https, - ) - .expect("all preconditions should be checked before upsert to database"), - ); - } - } - locked.safekeepers = Arc::new(safekeepers); - } - Ok(()) - } - - pub(crate) async fn set_safekeeper_scheduling_policy( - &self, - id: i64, - scheduling_policy: SkSchedulingPolicy, - ) -> Result<(), DatabaseError> { - self.persistence - .set_safekeeper_scheduling_policy(id, scheduling_policy) - .await?; - let node_id = NodeId(id as u64); - // After the change has been persisted successfully, update the in-memory state - { - let mut locked = self.inner.write().unwrap(); - let mut safekeepers = (*locked.safekeepers).clone(); - let sk = safekeepers - .get_mut(&node_id) - .ok_or(DatabaseError::Logical("Not found".to_string()))?; - sk.set_scheduling_policy(scheduling_policy); - - match scheduling_policy { - SkSchedulingPolicy::Active => (), - SkSchedulingPolicy::Decomissioned | SkSchedulingPolicy::Pause => { - locked.safekeeper_reconcilers.cancel_safekeeper(node_id); - } - } - - locked.safekeepers = Arc::new(safekeepers); - } - Ok(()) - } - pub(crate) async fn update_shards_preferred_azs( &self, req: ShardsPreferredAzsRequest, diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs new file mode 100644 index 
0000000000..b5fb00a469 --- /dev/null +++ b/storage_controller/src/service/safekeeper_service.rs @@ -0,0 +1,518 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Duration; + +use super::safekeeper_reconciler::ScheduleRequest; +use crate::heartbeater::SafekeeperState; +use crate::persistence::{ + DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, +}; +use crate::safekeeper::Safekeeper; +use http_utils::error::ApiError; +use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; +use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo}; +use safekeeper_api::membership::{MemberSet, SafekeeperId}; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::logging::SecretString; + +use super::Service; + +impl Service { + /// Timeline creation on safekeepers + /// + /// Returns `Ok(left)` if the timeline has been created on a quorum of safekeepers, + /// where `left` contains the list of safekeepers that didn't have a successful response. + /// Assumes tenant lock is held while calling this function. 
+ pub(super) async fn tenant_timeline_create_safekeepers_quorum( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: u32, + timeline_persistence: &TimelinePersistence, + ) -> Result, ApiError> { + // If quorum is reached, return if we are outside of a specified timeout + let jwt = self + .config + .safekeeper_jwt_token + .clone() + .map(SecretString::from); + let mut joinset = JoinSet::new(); + + let safekeepers = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + + let mut members = Vec::new(); + for sk_id in timeline_persistence.sk_set.iter() { + let sk_id = NodeId(*sk_id as u64); + let Some(safekeeper) = safekeepers.get(&sk_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "couldn't find entry for safekeeper with id {sk_id}" + )))?; + }; + members.push(SafekeeperId { + id: sk_id, + host: safekeeper.skp.host.clone(), + pg_port: safekeeper.skp.port as u16, + }); + } + let mset = MemberSet::new(members).map_err(ApiError::InternalServerError)?; + let mconf = safekeeper_api::membership::Configuration::new(mset); + + let req = safekeeper_api::models::TimelineCreateRequest { + commit_lsn: None, + mconf, + pg_version, + start_lsn: timeline_persistence.start_lsn.0, + system_id: None, + tenant_id, + timeline_id, + wal_seg_size: None, + }; + const SK_CREATE_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + for sk in timeline_persistence.sk_set.iter() { + let sk_id = NodeId(*sk as u64); + let safekeepers = safekeepers.clone(); + let jwt = jwt.clone(); + let ssl_ca_cert = self.config.ssl_ca_cert.clone(); + let req = req.clone(); + joinset.spawn(async move { + // Unwrap is fine as we already would have returned error above + let sk_p = safekeepers.get(&sk_id).unwrap(); + let res = sk_p + .with_client_retries( + |client| { + let req = req.clone(); + async move { client.create_timeline(&req).await } + }, + &jwt, + &ssl_ca_cert, + 3, + 3, + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, + 
&CancellationToken::new(), + ) + .await; + (sk_id, sk_p.skp.host.clone(), res) + }); + } + // After we have built the joinset, we now wait for the tasks to complete, + // but with a specified timeout to make sure we return swiftly, either with + // a failure or success. + let reconcile_deadline = tokio::time::Instant::now() + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT; + + // Wait until all tasks finish or timeout is hit, whichever occurs + // first. + let mut reconcile_results = Vec::new(); + loop { + if let Ok(res) = tokio::time::timeout_at(reconcile_deadline, joinset.join_next()).await + { + let Some(res) = res else { break }; + match res { + Ok(res) => { + tracing::info!( + "response from safekeeper id:{} at {}: {:?}", + res.0, + res.1, + res.2 + ); + reconcile_results.push(res); + } + Err(join_err) => { + tracing::info!("join_err for task in joinset: {join_err}"); + } + } + } else { + tracing::info!( + "timeout for creation call after {} responses", + reconcile_results.len() + ); + break; + } + } + + // Now check if quorum was reached in reconcile_results. + let total_result_count = reconcile_results.len(); + let remaining = reconcile_results + .into_iter() + .filter_map(|res| res.2.is_err().then_some(res.0)) + .collect::>(); + tracing::info!( + "Got {} non-successful responses from initial creation request of total {total_result_count} responses", + remaining.len() + ); + if remaining.len() >= 2 { + // Failure + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "not enough successful reconciliations to reach quorum, please retry: {} errored", + remaining.len() + ))); + } + + Ok(remaining) + } + + /// Create timeline in controller database and on safekeepers. + /// `timeline_info` is result of timeline creation on pageserver. + /// + /// All actions must be idempotent as the call is retried until success. 
It + /// tries to create timeline in the db and on at least majority of + /// safekeepers + queue creation for safekeepers which missed it in the db + /// for infinite retries; after that, call returns Ok. + /// + /// The idea is that once this is reached as long as we have alive majority + /// of safekeepers it is expected to get eventually operational as storcon + /// will be able to seed timeline on nodes which missed creation by making + /// pull_timeline from peers. On the other hand we don't want to fail + /// timeline creation if one safekeeper is down. + pub(super) async fn tenant_timeline_create_safekeepers( + self: &Arc, + tenant_id: TenantId, + timeline_info: &TimelineInfo, + create_mode: models::TimelineCreateRequestMode, + ) -> Result { + let timeline_id = timeline_info.timeline_id; + let pg_version = timeline_info.pg_version * 10000; + // Initially start_lsn is determined by last_record_lsn in pageserver + // response as it does initdb. However, later we persist it and in sk + // creation calls replace with the value from the timeline row if it + // previously existed as on retries in theory endpoint might have + // already written some data and advanced last_record_lsn, while we want + // safekeepers to have consistent start_lsn. + let start_lsn = match create_mode { + models::TimelineCreateRequestMode::Bootstrap { .. } => timeline_info.last_record_lsn, + models::TimelineCreateRequestMode::Branch { .. } => timeline_info.last_record_lsn, + models::TimelineCreateRequestMode::ImportPgdata { .. 
} => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "import pgdata doesn't specify the start lsn, aborting creation on safekeepers" + )))?; + } + }; + // Choose initial set of safekeepers respecting affinity + let sks = self.safekeepers_for_new_timeline().await?; + let sks_persistence = sks.iter().map(|sk| sk.id.0 as i64).collect::>(); + // Add timeline to db + let mut timeline_persist = TimelinePersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + start_lsn: start_lsn.into(), + generation: 0, + sk_set: sks_persistence.clone(), + new_sk_set: None, + cplane_notified_generation: 0, + deleted_at: None, + }; + let inserted = self + .persistence + .insert_timeline(timeline_persist.clone()) + .await?; + if !inserted { + if let Some(existent_persist) = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await? + { + // Replace with what we have in the db, to get stuff like the generation right. + // We do still repeat the http calls to the safekeepers. After all, we could have + // crashed right after the write to the DB. 
+ timeline_persist = existent_persist; + } else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "insertion said timeline already in db, but looking it up, it was gone" + ))); + } + } + // Create the timeline on a quorum of safekeepers + let remaining = self + .tenant_timeline_create_safekeepers_quorum( + tenant_id, + timeline_id, + pg_version, + &timeline_persist, + ) + .await?; + + // For the remaining safekeepers, take care of their reconciliation asynchronously + for &remaining_id in remaining.iter() { + let pending_op = TimelinePendingOpPersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: timeline_persist.generation, + op_kind: crate::persistence::SafekeeperTimelineOpKind::Pull, + sk_id: remaining_id.0 as i64, + }; + tracing::info!("writing pending op for sk id {remaining_id}"); + self.persistence.insert_pending_op(pending_op).await?; + } + if !remaining.is_empty() { + let mut locked = self.inner.write().unwrap(); + for remaining_id in remaining { + let Some(sk) = locked.safekeepers.get(&remaining_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {remaining_id}" + ))); + }; + let Ok(host_list) = sks + .iter() + .map(|sk| { + Ok(( + sk.id, + locked + .safekeepers + .get(&sk.id) + .ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {remaining_id} to pull from" + )) + })? 
+ .base_url(), + )) + }) + .collect::>() + else { + continue; + }; + let req = ScheduleRequest { + safekeeper: Box::new(sk.clone()), + host_list, + tenant_id, + timeline_id, + generation: timeline_persist.generation as u32, + kind: crate::persistence::SafekeeperTimelineOpKind::Pull, + }; + locked.safekeeper_reconcilers.schedule_request(self, req); + } + } + + Ok(SafekeepersInfo { + generation: timeline_persist.generation as u32, + safekeepers: sks, + tenant_id, + timeline_id, + }) + } + /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. + pub(super) async fn tenant_timeline_delete_safekeepers( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result<(), ApiError> { + let tl = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + let Some(tl) = tl else { + tracing::info!( + "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table, no deletions on safekeepers needed" + ); + return Ok(()); + }; + let all_sks = tl + .new_sk_set + .iter() + .flat_map(|sks| { + sks.iter() + .map(|sk| (*sk, SafekeeperTimelineOpKind::Exclude)) + }) + .chain( + tl.sk_set + .iter() + .map(|v| (*v, SafekeeperTimelineOpKind::Delete)), + ) + .collect::>(); + + // Schedule reconciliations + { + let mut locked = self.inner.write().unwrap(); + for (sk_id, kind) in all_sks { + let sk_id = NodeId(sk_id as u64); + let Some(sk) = locked.safekeepers.get(&sk_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {sk_id}" + ))); + }; + + let req = ScheduleRequest { + safekeeper: Box::new(sk.clone()), + // we don't use this for this kind, put a dummy value + host_list: Vec::new(), + tenant_id, + timeline_id, + generation: tl.generation as u32, + kind, + }; + locked.safekeeper_reconcilers.schedule_request(self, req); + } + } + Ok(()) + } + + /// Choose safekeepers for the new timeline: 3 in different azs. 
+ pub(crate) async fn safekeepers_for_new_timeline( + &self, + ) -> Result, ApiError> { + // Number of safekeepers in different AZs we are looking for + let wanted_count = 3; + let mut all_safekeepers = { + let locked = self.inner.read().unwrap(); + locked + .safekeepers + .iter() + .filter_map(|sk| { + if sk.1.scheduling_policy() != SkSchedulingPolicy::Active { + // If we don't want to schedule stuff onto the safekeeper, respect that. + return None; + } + let utilization_opt = if let SafekeeperState::Available { + last_seen_at: _, + utilization, + } = sk.1.availability() + { + Some(utilization) + } else { + // non-available safekeepers still get a chance for new timelines, + // but put them last in the list. + None + }; + let info = SafekeeperInfo { + hostname: sk.1.skp.host.clone(), + id: NodeId(sk.1.skp.id as u64), + }; + Some((utilization_opt, info, sk.1.skp.availability_zone_id.clone())) + }) + .collect::>() + }; + all_safekeepers.sort_by_key(|sk| { + ( + sk.0.as_ref() + .map(|ut| ut.timeline_count) + .unwrap_or(u64::MAX), + // Use the id to decide on equal scores for reliability + sk.1.id.0, + ) + }); + let mut sks = Vec::new(); + let mut azs = HashSet::new(); + for (_sk_util, sk_info, az_id) in all_safekeepers.iter() { + if !azs.insert(az_id) { + continue; + } + sks.push(sk_info.clone()); + if sks.len() == wanted_count { + break; + } + } + if sks.len() == wanted_count { + Ok(sks) + } else { + Err(ApiError::InternalServerError(anyhow::anyhow!( + "couldn't find {wanted_count} safekeepers in different AZs for new timeline (found: {}, total active: {})", + sks.len(), + all_safekeepers.len(), + ))) + } + } + + pub(crate) async fn safekeepers_list( + &self, + ) -> Result, DatabaseError> { + let locked = self.inner.read().unwrap(); + let mut list = locked + .safekeepers + .iter() + .map(|sk| sk.1.describe_response()) + .collect::, _>>()?; + list.sort_by_key(|v| v.id); + Ok(list) + } + + pub(crate) async fn get_safekeeper( + &self, + id: i64, + ) -> Result { + let 
locked = self.inner.read().unwrap(); + let sk = locked + .safekeepers + .get(&NodeId(id as u64)) + .ok_or(diesel::result::Error::NotFound)?; + sk.describe_response() + } + + pub(crate) async fn upsert_safekeeper( + &self, + record: crate::persistence::SafekeeperUpsert, + ) -> Result<(), ApiError> { + let node_id = NodeId(record.id as u64); + let use_https = self.config.use_https_safekeeper_api; + + if use_https && record.https_port.is_none() { + return Err(ApiError::PreconditionFailed( + format!( + "cannot upsert safekeeper {node_id}: \ + https is enabled, but https port is not specified" + ) + .into(), + )); + } + + self.persistence.safekeeper_upsert(record.clone()).await?; + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + match safekeepers.entry(node_id) { + std::collections::hash_map::Entry::Occupied(mut entry) => entry + .get_mut() + .update_from_record(record) + .expect("all preconditions should be checked before upsert to database"), + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert( + Safekeeper::from_persistence( + crate::persistence::SafekeeperPersistence::from_upsert( + record, + SkSchedulingPolicy::Pause, + ), + CancellationToken::new(), + use_https, + ) + .expect("all preconditions should be checked before upsert to database"), + ); + } + } + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) + } + + pub(crate) async fn set_safekeeper_scheduling_policy( + &self, + id: i64, + scheduling_policy: SkSchedulingPolicy, + ) -> Result<(), DatabaseError> { + self.persistence + .set_safekeeper_scheduling_policy(id, scheduling_policy) + .await?; + let node_id = NodeId(id as u64); + // After the change has been persisted successfully, update the in-memory state + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + let sk = safekeepers + .get_mut(&node_id) + .ok_or(DatabaseError::Logical("Not found".to_string()))?; + 
sk.set_scheduling_policy(scheduling_policy); + + match scheduling_policy { + SkSchedulingPolicy::Active => (), + SkSchedulingPolicy::Decomissioned | SkSchedulingPolicy::Pause => { + locked.safekeeper_reconcilers.cancel_safekeeper(node_id); + } + } + + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) + } +} diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 425abef935..205b9141e0 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -19,7 +19,7 @@ if TYPE_CHECKING: class ComputeReconfigure: def __init__(self, server: HTTPServer): self.server = server - self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" + self.control_plane_hooks_api = f"http://{server.host}:{server.port}/" self.workloads: dict[TenantId, Any] = {} self.on_notify: Callable[[Any], None] | None = None diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 83a1a87611..54e6458ac6 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -175,6 +175,9 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( counter("pageserver_tenant_throttling_count"), counter("pageserver_timeline_wal_records_received"), counter("pageserver_page_service_pagestream_flush_in_progress_micros"), + counter("pageserver_wait_lsn_in_progress_micros"), + counter("pageserver_wait_lsn_started_count"), + counter("pageserver_wait_lsn_finished_count"), *histogram("pageserver_page_service_batch_size"), *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7bc746d668..aba8e04977 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -460,12 +460,15 @@ class NeonEnvBuilder: self.overlay_mounts_created_by_us: list[tuple[str, Path]] = [] self.config_init_force: str | None = None self.top_output_dir = top_output_dir - self.control_plane_compute_hook_api: str | None = None + self.control_plane_hooks_api: str | None = None self.storage_controller_config: dict[Any, Any] | None = None # Flag to enable https listener in pageserver, generate local ssl certs, # and force storage controller to use https for pageserver api. self.use_https_pageserver_api: bool = False + # Flag to enable https listener in safekeeper, generate local ssl certs, + # and force storage controller to use https for safekeeper api. 
+ self.use_https_safekeeper_api: bool = False self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine self.pageserver_get_vectored_concurrent_io: str | None = ( @@ -1063,7 +1066,9 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - self.generate_local_ssl_certs = config.use_https_pageserver_api + self.generate_local_ssl_certs = ( + config.use_https_pageserver_api or config.use_https_safekeeper_api + ) self.ssl_ca_file = ( self.repo_dir.joinpath("rootCA.crt") if self.generate_local_ssl_certs else None ) @@ -1116,7 +1121,7 @@ class NeonEnv: self.control_plane_api: str = self.storage_controller.upcall_api_endpoint() # For testing this with a fake HTTP server, enable passing through a URL from config - self.control_plane_compute_hook_api = config.control_plane_compute_hook_api + self.control_plane_hooks_api = config.control_plane_hooks_api self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode @@ -1137,8 +1142,8 @@ class NeonEnv: if self.control_plane_api is not None: cfg["control_plane_api"] = self.control_plane_api - if self.control_plane_compute_hook_api is not None: - cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + if self.control_plane_hooks_api is not None: + cfg["control_plane_hooks_api"] = self.control_plane_hooks_api storage_controller_config = self.storage_controller_config @@ -1146,6 +1151,10 @@ class NeonEnv: storage_controller_config = storage_controller_config or {} storage_controller_config["use_https_pageserver_api"] = True + if config.use_https_safekeeper_api: + storage_controller_config = storage_controller_config or {} + storage_controller_config["use_https_safekeeper_api"] = True + if storage_controller_config is not None: cfg["storage_controller"] = storage_controller_config @@ -1248,6 +1257,7 @@ class NeonEnv: 
pg=self.port_distributor.get_port(), pg_tenant_only=self.port_distributor.get_port(), http=self.port_distributor.get_port(), + https=self.port_distributor.get_port() if config.use_https_safekeeper_api else None, ) id = config.safekeepers_id_start + i # assign ids sequentially sk_cfg: dict[str, Any] = { @@ -1255,6 +1265,7 @@ class NeonEnv: "pg_port": port.pg, "pg_tenant_only_port": port.pg_tenant_only, "http_port": port.http, + "https_port": port.https, "sync": config.safekeepers_enable_fsync, } if config.auth_enabled: @@ -1310,6 +1321,28 @@ class NeonEnv: for f in futs: f.result() + # Last step: register safekeepers at the storage controller + if ( + self.storage_controller_config is not None + and self.storage_controller_config.get("timelines_onto_safekeepers") is True + ): + for sk_id, sk in enumerate(self.safekeepers): + body = { + "id": sk_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "127.0.0.1", + "port": sk.port.pg, + "http_port": sk.port.http, + "https_port": None, + "version": 5957, + "availability_zone_id": f"us-east-2b-{sk_id}", + } + + self.storage_controller.on_safekeeper_deploy(sk_id, body) + self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active") + def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. 
@@ -4475,6 +4508,7 @@ class SafekeeperPort: pg: int pg_tenant_only: int http: int + https: int | None @dataclass diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 0efe0b9575..13cab448f3 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1070,11 +1070,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, batch_size: int | None = None, + detach_behavior: str | None = None, **kwargs, ) -> set[TimelineId]: - params = {} + params: dict[str, Any] = {} if batch_size is not None: params["batch_size"] = batch_size + if detach_behavior: + params["detach_behavior"] = detach_behavior res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", params=params, diff --git a/test_runner/performance/large_synthetic_oltp/IUD_one_transaction.sql b/test_runner/performance/large_synthetic_oltp/IUD_one_transaction.sql new file mode 100644 index 0000000000..4c5b3fbd11 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/IUD_one_transaction.sql @@ -0,0 +1,162 @@ +\set min_id 1 +\set max_id 1500000000 +\set range_size 100 + +-- Use uniform random instead of random_zipfian +\set random_id random(:min_id, :max_id) +\set random_mar_id random(1, 65536) +\set random_delete_id random(:min_id, :max_id) + +-- Update exactly one row (if it exists) using the uniformly chosen random_id +UPDATE transaction.transaction + SET state = 'COMPLETED', + settlement_date = CURRENT_DATE, + mar_identifier = (:random_mar_id)::int + WHERE id = (:random_id)::bigint; + +-- Insert exactly one row +INSERT INTO transaction.transaction ( + user_id, + card_id, + business_id, + preceding_transaction_id, + is_last, + is_mocked, + type, + state, + network, + subnetwork, + user_transaction_time, + settlement_date, + request_amount, + amount, + currency_code, + approval_code, + response, + 
gpa, + gpa_order_unload, + gpa_order, + program_transfer, + fee_transfer, + peer_transfer, + msa_orders, + risk_assessment, + auto_reload, + direct_deposit, + polarity, + real_time_fee_group, + fee, + chargeback, + standin_approved_by, + acquirer_fee_amount, + funded_account_holder, + digital_wallet_token, + network_fees, + card_security_code_verification, + fraud, + cardholder_authentication_data, + currency_conversion, + merchant, + store, + card_acceptor, + acquirer, + pos, + avs, + mar_token, + mar_preceding_related_transaction_token, + mar_business_token, + mar_acting_user_token, + mar_card_token, + mar_duration, + mar_created_time, + issuer_interchange_amount, + offer_orders, + transaction_canonical_id, + mar_identifier, + created_at, + card_acceptor_mid, + card_acceptor_name, + address_verification, + issuing_product, + mar_enhanced_data_token, + standin_reason +) +SELECT + (:random_id % 100000) + 1 AS user_id, + (:random_id % 500000) + 1 AS card_id, + (:random_id % 20000) + 1 AS business_id, + NULL AS preceding_transaction_id, + (:random_id % 2) = 0 AS is_last, + (:random_id % 5) = 0 AS is_mocked, + 'authorization' AS type, + 'PENDING' AS state, + 'VISA' AS network, + 'VISANET' AS subnetwork, + now() - ((:random_id % 100) || ' days')::interval AS user_transaction_time, + now() - ((:random_id % 100) || ' days')::interval AS settlement_date, + random() * 1000 AS request_amount, + random() * 1000 AS amount, + 'USD' AS currency_code, + md5((:random_id)::text) AS approval_code, + '{}'::jsonb AS response, + '{}'::jsonb AS gpa, + '{}'::jsonb AS gpa_order_unload, + '{}'::jsonb AS gpa_order, + '{}'::jsonb AS program_transfer, + '{}'::jsonb AS fee_transfer, + '{}'::jsonb AS peer_transfer, + '{}'::jsonb AS msa_orders, + '{}'::jsonb AS risk_assessment, + '{}'::jsonb AS auto_reload, + '{}'::jsonb AS direct_deposit, + '{}'::jsonb AS polarity, + '{}'::jsonb AS real_time_fee_group, + '{}'::jsonb AS fee, + '{}'::jsonb AS chargeback, + NULL AS standin_approved_by, + random() 
* 100 AS acquirer_fee_amount, + '{}'::jsonb AS funded_account_holder, + '{}'::jsonb AS digital_wallet_token, + '{}'::jsonb AS network_fees, + '{}'::jsonb AS card_security_code_verification, + '{}'::jsonb AS fraud, + '{}'::jsonb AS cardholder_authentication_data, + '{}'::jsonb AS currency_conversion, + '{}'::jsonb AS merchant, + '{}'::jsonb AS store, + '{}'::jsonb AS card_acceptor, + '{}'::jsonb AS acquirer, + '{}'::jsonb AS pos, + '{}'::jsonb AS avs, + md5((:random_id)::text || 'token') AS mar_token, + NULL AS mar_preceding_related_transaction_token, + NULL AS mar_business_token, + NULL AS mar_acting_user_token, + NULL AS mar_card_token, + random() * 1000 AS mar_duration, + now() AS mar_created_time, + random() * 100 AS issuer_interchange_amount, + '{}'::jsonb AS offer_orders, + (:random_id % 500) + 1 AS transaction_canonical_id, + :random_id::integer AS mar_identifier, + now() AS created_at, + NULL AS card_acceptor_mid, + NULL AS card_acceptor_name, + '{}'::jsonb AS address_verification, + 'DEFAULT_PRODUCT' AS issuing_product, + NULL AS mar_enhanced_data_token, + NULL AS standin_reason +FROM (SELECT 1) AS dummy; + +-- Delete exactly one row using the uniformly chosen random_delete_id +WITH to_delete AS ( + SELECT id + FROM transaction.transaction + WHERE id >= (:random_delete_id)::bigint + AND id < ((:random_delete_id)::bigint + :range_size) + ORDER BY id + LIMIT 1 +) +DELETE FROM transaction.transaction +USING to_delete +WHERE transaction.transaction.id = to_delete.id; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_prefetch_webhook.sql b/test_runner/performance/large_synthetic_oltp/select_prefetch_webhook.sql new file mode 100644 index 0000000000..e0b0e52276 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_prefetch_webhook.sql @@ -0,0 +1,25 @@ +-- enforce a controlled number of getpages prefetch requests from a range of +-- 40 million first pages (320 GB) of a 500 GiB table +-- the table has 55 
million pages + + +-- Zipfian distributions model real-world access patterns where: +-- A few values (popular IDs) are accessed frequently. +-- Many values are accessed rarely. +-- This is useful for simulating realistic workloads + +\set alpha 1.2 +\set min_page 1 +\set max_page 40000000 + +\set zipf_random_page random_zipfian(:min_page, :max_page, :alpha) + +-- Read 500 consecutive pages from a Zipfian-distributed random start page +-- This enforces PostgreSQL prefetching +WITH random_page AS ( + SELECT :zipf_random_page::int AS start_page +) +SELECT MAX(created_at) +FROM webhook.incoming_webhooks +WHERE ctid >= (SELECT format('(%s,1)', start_page)::tid FROM random_page) +AND ctid < (SELECT format('(%s,1)', start_page + 500)::tid FROM random_page); \ No newline at end of file diff --git a/test_runner/performance/test_compute_ctl_api.py b/test_runner/performance/test_compute_ctl_api.py index 87eb1f2c35..d6d0a84e8e 100644 --- a/test_runner/performance/test_compute_ctl_api.py +++ b/test_runner/performance/test_compute_ctl_api.py @@ -41,24 +41,24 @@ def test_compute_ctl_api_latencies( zenbenchmark.record( "status_response_latency_p50_us", status_response_latency_us[len(status_response_latency_us) // 2], - "microseconds", + "μs", MetricReport.LOWER_IS_BETTER, ) zenbenchmark.record( "metrics_response_latency_p50_us", metrics_response_latency_us[len(metrics_response_latency_us) // 2], - "microseconds", + "μs", MetricReport.LOWER_IS_BETTER, ) zenbenchmark.record( "status_response_latency_p99_us", status_response_latency_us[len(status_response_latency_us) * 99 // 100], - "microseconds", + "μs", MetricReport.LOWER_IS_BETTER, ) zenbenchmark.record( "metrics_response_latency_p99_us", metrics_response_latency_us[len(metrics_response_latency_us) * 99 // 100], - "microseconds", + "μs", MetricReport.LOWER_IS_BETTER, ) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index fdc56cc496..807ed522e1 100644 --- 
a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -23,6 +23,25 @@ if TYPE_CHECKING: from psycopg2.extensions import connection, cursor +""" +These benchmarks stress test logical replication within Neon. In order to run +them locally, they require setting up some infrastructure. See +https://docs.neon.build/compute/logical_replication_benchmarks.html for how to +do that. After setting that up, run the following shell commands. + +# These are the project IDs setup for the purposes of running these benchmarks +export BENCHMARK_PROJECT_ID_PUB= +export BENCHMARK_PROJECT_ID_SUB= + +# See https://neon.tech/docs/manage/api-keys +export NEON_API_KEY= + +# Fiddling with the --timeout parameter may be required depending on the +# performance of the benchmark +pytest -m remote_cluster 'test_runner/performance/test_logical_replication.py' +""" + + @pytest.mark.timeout(1000) def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg: VanillaPostgres): env = neon_simple_env diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py index ae00dbb3b5..842e6a904b 100644 --- a/test_runner/performance/test_perf_oltp_large_tenant.py +++ b/test_runner/performance/test_perf_oltp_large_tenant.py @@ -2,11 +2,13 @@ from __future__ import annotations import os import timeit +from contextlib import closing from pathlib import Path import pytest from fixtures.benchmark_fixture import PgBenchRunResult from fixtures.compare_fixtures import PgCompare +from fixtures.log_helper import log from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp @@ -82,9 +84,81 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): env.zenbenchmark.record_pg_bench_result(prefix, res) +def run_database_maintenance(env: PgCompare): + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + log.info("start 
vacuum analyze transaction.transaction") + with env.zenbenchmark.record_duration("vacuum_analyze"): + cur.execute("SET statement_timeout = 0;") + cur.execute("SET max_parallel_maintenance_workers = 7;") + cur.execute("SET maintenance_work_mem = '10GB';") + cur.execute("vacuum analyze transaction.transaction;") + log.info("finished vacuum analyze transaction.transaction") + + # recover previously failed or canceled re-indexing + cur.execute( + """ + DO $$ + DECLARE + invalid_index TEXT; + BEGIN + FOR invalid_index IN + SELECT c.relname + FROM pg_class c + JOIN pg_index i ON i.indexrelid = c.oid + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = 'transaction' + AND i.indisvalid = FALSE + AND c.relname LIKE '%_ccnew%' + LOOP + EXECUTE 'DROP INDEX IF EXISTS transaction.' || invalid_index; + END LOOP; + END $$; + """ + ) + # also recover failed or canceled re-indexing on toast part of table + cur.execute( + """ + DO $$ + DECLARE + invalid_index TEXT; + BEGIN + FOR invalid_index IN + SELECT c.relname + FROM pg_class c + JOIN pg_index i ON i.indexrelid = c.oid + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = 'pg_toast' + AND i.indisvalid = FALSE + AND c.relname LIKE '%_ccnew%' + AND i.indrelid = ( + SELECT reltoastrelid FROM pg_class + WHERE relname = 'transaction' + AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'transaction') + ) + LOOP + EXECUTE 'DROP INDEX IF EXISTS pg_toast.' 
|| invalid_index; + END LOOP; + END $$; + """ + ) + + log.info("start REINDEX TABLE CONCURRENTLY transaction.transaction") + with env.zenbenchmark.record_duration("reindex concurrently"): + cur.execute("REINDEX TABLE CONCURRENTLY transaction.transaction;") + log.info("finished REINDEX TABLE CONCURRENTLY transaction.transaction") + + @pytest.mark.parametrize("custom_scripts", get_custom_scripts()) @pytest.mark.parametrize("duration", get_durations_matrix()) @pytest.mark.remote_cluster -def test_perf_oltp_large_tenant(remote_compare: PgCompare, custom_scripts: str, duration: int): +def test_perf_oltp_large_tenant_pgbench( + remote_compare: PgCompare, custom_scripts: str, duration: int +): run_test_pgbench(remote_compare, custom_scripts, duration) - # todo: run re-index, analyze, vacuum, etc. after the test and measure and report its duration + + +@pytest.mark.remote_cluster +def test_perf_oltp_large_tenant_maintenance(remote_compare: PgCompare): + # run analyze, vacuum, re-index after the test and measure and report its duration + run_database_maintenance(remote_compare) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 777b9e2870..e897d53cc8 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -83,9 +83,7 @@ def test_storage_controller_many_tenants( "max_offline": "30s", "max_warming_up": "300s", } - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api AZS = ["alpha", "bravo", "charlie"] diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 07600dd911..b56fcd3500 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py 
@@ -144,7 +144,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_l0_semaphore": False, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, - "l0_flush_wait_upload": False, + "l0_flush_wait_upload": True, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 34e4e994cb..85d0cfbf1d 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -19,6 +19,7 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException +from requests.exceptions import RetryError # Test branch creation @@ -180,7 +181,6 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.endpoints.create_start( initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 ) - ps_http.configure_failpoints(("before-upload-index-pausable", "off")) finally: env.pageserver.stop(immediate=True) @@ -221,10 +221,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises( - PageserverApiException, - match="Cannot branch off the timeline that's not present in pageserver", - ): + with pytest.raises(RetryError, match="too many 503 error responses"): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 41aa5b47ca..5526b783d5 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -23,8 +23,8 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver): ) env = neon_env_builder.init_start() - neon_env_builder.control_plane_compute_hook_api = ( - 
f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + neon_env_builder.control_plane_hooks_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/" ) def ignore_notify(request: Request): diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 0df88e14c2..c8cce7a4e7 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -524,6 +524,42 @@ def test_pageserver_gc_compaction_trigger(neon_env_builder: NeonEnvBuilder): workload.validate(env.pageserver.id) +def test_pageserver_small_tenant_compaction(neon_env_builder: NeonEnvBuilder): + """ + Create a small tenant that rarely needs compaction and ensure that everything works. + """ + SMOKE_CONF = { + # Run both gc and gc-compaction. + "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": 1024, + "lsn_lease_length": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(10000, env.pageserver.id) + + for _ in range(100): + workload.churn_rows(10, env.pageserver.id, upload=False, ingest=False) + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_compact(tenant_id, timeline_id) + ps_http.timeline_gc(tenant_id, timeline_id, None) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + # Stripe sizes in number of pages. 
TINY_STRIPES = 16 LARGE_STRIPES = 32768 diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 2e7da86d9d..0d3618d1b8 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -16,6 +16,9 @@ TEST_ROLE_NAMES = [ {"name": "role \";with ';injections $$ $x$ $ %I !/\\&#@"}, {"name": '"role in double quotes"'}, {"name": "'role in single quotes'"}, + {"name": "role$"}, + {"name": "role$$"}, + {"name": "role$x$"}, ] TEST_DB_NAMES = [ @@ -59,6 +62,18 @@ TEST_DB_NAMES = [ "name": "'db in single quotes'", "owner": "'role in single quotes'", }, + { + "name": "db name$", + "owner": "role$", + }, + { + "name": "db name$$", + "owner": "role$$", + }, + { + "name": "db name$x$", + "owner": "role$x$", + }, ] diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index b360162dc1..85cd065a2f 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -3,12 +3,13 @@ from __future__ import annotations import enum import os import shutil -import sys from enum import StrEnum from logging import debug from pathlib import Path from typing import TYPE_CHECKING, cast +# Docs are available at https://jsonnet.org/ref/bindings.html#python_api +import _jsonnet import pytest import requests import yaml @@ -92,10 +93,6 @@ def jsonnet_evaluate_file( ext_vars: str | dict[str, str] | None = None, tla_vars: str | dict[str, str] | None = None, ) -> str: - # Jsonnet doesn't support Python 3.13 yet - # Docs are available at https://jsonnet.org/ref/bindings.html#python_api - import _jsonnet - return cast( "str", _jsonnet.evaluate_file( @@ -130,7 +127,6 @@ class SqlExporterProcess(StrEnum): AUTOSCALING = "autoscaling" -@pytest.mark.xfail(sys.version_info >= (3, 13), reason="Jsonnet doesn't support Python 3.13 yet") @pytest.mark.parametrize( "collector_name", ["neon_collector", 
"neon_collector_autoscaling"], @@ -359,7 +355,6 @@ else: self.__proc.wait() -@pytest.mark.xfail(sys.version_info >= (3, 13), reason="Jsonnet doesn't support Python 3.13 yet") @pytest.mark.parametrize( "exporter", [SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING], diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 130db009c9..9f2aa5df8c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -87,8 +87,8 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=s3_storage(), ) - neon_env_builder.control_plane_compute_hook_api = ( - f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + neon_env_builder.control_plane_hooks_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/" ) def ignore_notify(request: Request): diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index c39c74fa2a..e8721f1ea0 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -29,7 +29,6 @@ from fixtures.remote_storage import ( from fixtures.utils import ( assert_eq, assert_ge, - assert_gt, print_gc_result, query_scalar, wait_until, @@ -334,14 +333,12 @@ def test_remote_storage_upload_queue_retries( # Exponential back-off in upload queue, so, gracious timeouts. 
wait_until( - lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30 + lambda: assert_ge(get_queued_count(file_kind="layer", op_kind="upload"), 1), timeout=30 ) wait_until( lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1), timeout=30 ) - wait_until( - lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30 - ) + # There may or may not be deletes queued up behind conflicting uploads; don't check. # unblock churn operations configure_storage_sync_failpoints("off") @@ -786,54 +783,6 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() -def test_paused_upload_stalls_checkpoint( - neon_env_builder: NeonEnvBuilder, -): - """ - This test checks that checkpoints block on uploads to remote storage. - """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start( - initial_tenant_conf={ - # Set a small compaction threshold - "compaction_threshold": "3", - # Disable GC - "gc_period": "0s", - # disable PITR - "pitr_interval": "0s", - } - ) - - env.pageserver.allowed_errors.append( - f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - client = env.pageserver.http_client() - layers_at_creation = client.layer_map_info(tenant_id, timeline_id) - deltas_at_creation = len(layers_at_creation.delta_layers()) - assert ( - deltas_at_creation == 1 - ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle" - - # Make new layer uploads get stuck. - # Note that timeline creation waits for the initial layers to reach remote storage. - # So at this point, the `layers_at_creation` are in remote storage. 
- client.configure_failpoints(("before-upload-layer-pausable", "pause")) - - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - # Build two tables with some data inside - endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - with pytest.raises(ReadTimeout): - client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index cb28f5b12d..b98ac8e50a 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -794,7 +794,7 @@ def test_sharding_split_stripe_size( Check that modifying stripe size inline with a shard split works as expected """ (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" neon_env_builder.num_pageservers = 1 # Set up fake HTTP notify endpoint: we will use this to validate that we receive @@ -806,7 +806,7 @@ def test_sharding_split_stripe_size( notifications.append(request.json) return Response(status=200) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) env = neon_env_builder.init_start( initial_tenant_shard_count=1, initial_tenant_shard_stripe_size=initial_stripe_size @@ -1312,9 +1312,7 @@ def test_sharding_split_failures( failure: Failure, ): neon_env_builder.num_pageservers = 4 - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = 
compute_reconfigure_listener.control_plane_hooks_api initial_shard_count = 2 split_shard_count = 4 diff --git a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py index 25d839aa42..7db4a16f49 100644 --- a/test_runner/regress/test_ssl.py +++ b/test_runner/regress/test_ssl.py @@ -1,5 +1,7 @@ +import pytest import requests -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, StorageControllerApiException +from fixtures.utils import wait_until def test_pageserver_https_api(neon_env_builder: NeonEnvBuilder): @@ -13,3 +15,54 @@ def test_pageserver_https_api(neon_env_builder: NeonEnvBuilder): addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status" requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + +def test_safekeeper_https_api(neon_env_builder: NeonEnvBuilder): + """ + Test HTTPS safekeeper management API. + 1. Make /v1/status request to HTTPS API to ensure it's appropriately configured. + 2. Try to register safekeeper in storcon with https port missing. + 3. Register safekeeper with https port. + 4. Wait for a heartbeat round to complete. + """ + neon_env_builder.use_https_safekeeper_api = True + env = neon_env_builder.init_start() + + sk = env.safekeepers[0] + + # 1. Make simple https request. + addr = f"https://localhost:{sk.port.https}/v1/status" + requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + # Note: http_port is intentionally wrong. + # Storcon should not use it if use_https is on. + http_port = 0 + + body = { + "active": True, + "id": sk.id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "localhost", + "port": sk.port.pg, + "http_port": http_port, + "https_port": None, + "version": 5957, + "availability_zone_id": "us-east-2b", + } + # 2. Try register with https port missing. 
+ with pytest.raises(StorageControllerApiException, match="https port is not specified"): + env.storage_controller.on_safekeeper_deploy(sk.id, body) + + # 3. Register with https port. + body["https_port"] = sk.port.https + env.storage_controller.on_safekeeper_deploy(sk.id, body) + + # 4. Wait for hearbeat round complete. + def storcon_heartbeat(): + assert env.storage_controller.log_contains( + "Heartbeat round complete for 1 safekeepers, 0 offline" + ) + + wait_until(storcon_heartbeat) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 29919f2fe7..05eb4301b0 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -605,7 +605,7 @@ def test_storage_controller_compute_hook( # when migrating. neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" # Set up fake HTTP notify endpoint notifications = [] @@ -618,7 +618,7 @@ def test_storage_controller_compute_hook( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -724,7 +724,7 @@ def test_storage_controller_stuck_compute_hook( neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" handle_params = {"status": 200} @@ -736,7 +736,7 @@ def test_storage_controller_stuck_compute_hook( notifications.append(request.json) return Response(status=status) - 
httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -871,7 +871,7 @@ def test_storage_controller_compute_hook_retry( neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" handle_params = {"status": 200} @@ -883,7 +883,7 @@ def test_storage_controller_compute_hook_retry( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_configs() @@ -993,7 +993,7 @@ def test_storage_controller_compute_hook_revert( # when migrating. 
neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" # Set up fake HTTP notify endpoint notifications = [] @@ -1006,7 +1006,7 @@ def test_storage_controller_compute_hook_revert( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -1395,9 +1395,7 @@ def test_storage_controller_tenant_deletion( """ neon_env_builder.num_pageservers = 4 neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api env = neon_env_builder.init_configs() env.start() @@ -1749,18 +1747,23 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): # Restart the failed pageserver victim_ps.start() + env.storage_controller.reconcile_until_idle() + # We expect that the re-attach call correctly tipped off the pageserver that its locations # are all secondaries now. locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"] assert len(locations) == 2 assert all(loc[1]["mode"] == "Secondary" for loc in locations) - # We expect that this situation resulted from the re_attach call, and not any explicit - # Reconciler runs: assert that the reconciliation count has not gone up since we restarted. 
+ # We expect that this situation resulted from background reconciliations + # Reconciler runs: assert that the reconciliation count has gone up by exactly + # one for each shard reconciles_after_restart = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) - assert reconciles_after_restart == reconciles_before_restart + + assert reconciles_before_restart is not None + assert reconciles_after_restart == reconciles_before_restart + 2 def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 713f89c60f..81e727a3aa 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -436,7 +436,7 @@ def test_single_branch_get_tenant_size_grows( # when our tenant is configured with a tiny pitr interval, dropping a table should # cause synthetic size to go down immediately tenant_config["pitr_interval"] = "0s" - env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config) + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, tenant_config) (current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index c17840d31c..2bad0bb671 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -42,6 +42,17 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): # If we run the unsharded version, talk to the storage controller ps_http = env.storage_controller.pageserver_api() + for ps in env.pageservers: + # We make /archival_config requests that are intended to fail. 
+ # It's expected that storcon drops requests to other pageservers after + # it gets the first error (https://github.com/neondatabase/neon/issues/11177) + ps.allowed_errors.extend( + [ + ".*WARN.* path=/v1/tenant/.*/archival_config .*request was dropped before completing", + ".*ERROR.* path=/v1/tenant/.*/archival_config .*Cancelled request finished with an error.*", + ] + ) + # first try to archive a non existing timeline for an existing tenant: invalid_timeline_id = TimelineId.generate() with pytest.raises(PageserverApiException, match="timeline not found") as exc: diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 612a767480..96664f2b8d 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -319,8 +319,9 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): # this does not contain Z in the end, so fromisoformat accepts it # it is to be in line with the deletion timestamp.. well, almost. when = original_ancestor[2][:26] - when_ts = datetime.datetime.fromisoformat(when) - assert when_ts < datetime.datetime.now() + when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC) + now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC) + assert when_ts < now assert len(lineage.get("reparenting_history", [])) == 0 elif expected_ancestor == timeline_id: assert len(lineage.get("original_ancestor", [])) == 0 @@ -342,6 +343,140 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) +def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): + """ + Test the v2 behavior of ancestor detach. 
+ + old main -------|---------X---------> + | | | + | | +-> after + | +--X empty snapshot branch + | | + | +-> branch-to-detach + | + +-> earlier + + Ends up as: + + old main -------|---------X---------> + | | | + | | +-> after + | +--X empty snapshot branch + | + +-> earlier + + + new main -------|---------|----> branch-to-detach + """ + + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + + branchpoint_pipe = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + earlier = env.create_branch( + "earlier", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_pipe + ) + + snapshot_branchpoint = env.create_branch( + "snapshot_branchpoint", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_x + ) + + branch_to_detach = env.create_branch( + "branch_to_detach", + ancestor_branch_name="snapshot_branchpoint", + ancestor_start_lsn=branchpoint_x, + ) + + after = env.create_branch("after", ancestor_branch_name="main", ancestor_start_lsn=None) + + all_reparented = client.detach_ancestor( + env.initial_tenant, branch_to_detach, detach_behavior="v2" + ) + assert set(all_reparented) == set() + + env.pageserver.quiesce_tenants() + + # checking the ancestor after is much faster than waiting for the endpoint to not start + expected_result = [ + 
("main", env.initial_timeline, None, 16384, 1), + ("after", after, env.initial_timeline, 16384, 1), + ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 8192, 1), + ("branch_to_detach", branch_to_detach, None, 8192, 1), + ("earlier", earlier, env.initial_timeline, 0, 1), + ] + + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + for branch_name, queried_timeline, expected_ancestor, _, _ in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) + ancestor_timeline_id = details["ancestor_timeline_id"] + if expected_ancestor is None: + assert ancestor_timeline_id is None + else: + assert ( + TimelineId(ancestor_timeline_id) == expected_ancestor + ), f"when checking branch {branch_name}, mapping={expected_result}" + + index_part = env.pageserver_remote_storage.index_content( + env.initial_tenant, queried_timeline + ) + lineage = index_part["lineage"] + assert lineage is not None + + assert lineage.get("reparenting_history_overflown", "false") == "false" + + if queried_timeline == branch_to_detach: + original_ancestor = lineage["original_ancestor"] + assert original_ancestor is not None + assert original_ancestor[0] == str(env.initial_timeline) + assert original_ancestor[1] == str(branchpoint_x) + + # this does not contain Z in the end, so fromisoformat accepts it + # it is to be in line with the deletion timestamp.. well, almost. 
+ when = original_ancestor[2][:26] + when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC) + now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC) + assert when_ts < now + assert len(lineage.get("reparenting_history", [])) == 0 + elif expected_ancestor == branch_to_detach: + assert len(lineage.get("original_ancestor", [])) == 0 + assert lineage["reparenting_history"] == [str(env.initial_timeline)] + else: + assert len(lineage.get("original_ancestor", [])) == 0 + assert len(lineage.get("reparenting_history", [])) == 0 + + for name, _, _, rows, starts in expected_result: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1 + + # delete the new timeline to confirm it doesn't carry over anything from the old timeline + client.timeline_delete(env.initial_tenant, branch_to_detach) + wait_timeline_detail_404(client, env.initial_tenant, branch_to_detach) + + # delete the after timeline + client.timeline_delete(env.initial_tenant, after) + wait_timeline_detail_404(client, env.initial_tenant, after) + + def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): """ Makes sure that the timeline is able to receive writes through-out the detach process. @@ -677,11 +812,13 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard for ps in pageservers.values(): ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + # We make /detach_ancestor requests that are intended to fail. 
+ # It's expected that storcon drops requests to other pageservers after + # it gets the first error (https://github.com/neondatabase/neon/issues/11177) ps.allowed_errors.extend( [ ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing", - # rare error logging, which is hard to reproduce without instrumenting responding with random sleep - '.* ERROR .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: Cancelled request finished with an error: Conflict\\("no ancestors"\\)', + ".* ERROR .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: Cancelled request finished with an error.*", ] ) @@ -1217,8 +1354,10 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv ) +@pytest.mark.parametrize("detach_behavior", ["default", "v1", "v2"]) def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( neon_env_builder: NeonEnvBuilder, + detach_behavior: str, ): shard_count = 2 neon_env_builder.num_pageservers = shard_count @@ -1257,7 +1396,11 @@ def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( victim_http.configure_failpoints([(pausepoint, "pause"), (failpoint, "return")]) def detach_timeline(): - http.detach_ancestor(env.initial_tenant, detached_branch) + http.detach_ancestor( + env.initial_tenant, + detached_branch, + detach_behavior=detach_behavior if detach_behavior != "default" else None, + ) def paused_at_failpoint(): stuck.assert_log_contains(f"at failpoint {pausepoint}") diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 0366e88389..89c4a96499 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1427,6 +1427,7 @@ class SafekeeperEnv: pg=self.port_distributor.get_port(), pg_tenant_only=self.port_distributor.get_port(), http=self.port_distributor.get_port(), + https=None, ) safekeeper_dir = self.repo_dir / f"sk{i}" @@ 
-2038,6 +2039,29 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") +def test_explicit_timeline_creation_storcon(neon_env_builder: NeonEnvBuilder): + """ + Test that having neon.safekeepers starting with g#n: with non zero n enables + generations, which as a side effect disables automatic timeline creation. + Like test_explicit_timeline_creation, but asks the storcon to + create membership conf & timeline. + """ + neon_env_builder.num_safekeepers = 3 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + } + env = neon_env_builder.init_start() + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + ep = env.endpoints.create("main", config_lines=config_lines) + + # endpoint should start. + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. 
In that case # pageserver should maintain a single connection to safekeeper and don't attempt diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 4a6ab6e745..e9eaf4b35e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -16,6 +16,7 @@ license.workspace = true ### BEGIN HAKARI SECTION [dependencies] ahash = { version = "0.8" } +anstream = { version = "0.6" } anyhow = { version = "1", features = ["backtrace"] } base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } base64-647d43efb71741da = { package = "base64", version = "0.21" } @@ -25,11 +26,16 @@ camino = { version = "1", default-features = false, features = ["serde1"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "env", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] } +const-oid = { version = "0.9", default-features = false, features = ["db", "std"] } crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } -der = { version = "0.7", default-features = false, features = ["oid", "pem", "std"] } +der = { version = "0.7", default-features = false, features = ["derive", "flagset", "oid", "pem", "std"] } deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } digest = { version = "0.10", features = ["mac", "oid", "std"] } +ecdsa = { version = "0.16", features = ["pem", "signing", "std", "verifying"] } either = { version = "1" } +elliptic-curve = { version = "0.13", default-features = false, features = ["digest", "hazmat", "jwk", "pem", "std"] } +env_filter = { version = "0.1", default-features = false, features = ["regex"] } +env_logger = { version = "0.11" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } form_urlencoded = { version = "1" } 
futures-channel = { version = "0.3", features = ["sink"] } @@ -47,8 +53,7 @@ hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12" } -itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } +itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -56,13 +61,14 @@ memchr = { version = "2" } nix = { version = "0.26" } nom = { version = "7" } num = { version = "0.4" } -num-bigint = { version = "0.4" } +num-bigint = { version = "0.4", default-features = false, features = ["std"] } num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } +p256 = { version = "0.13", features = ["jwk"] } parquet = { version = "53", default-features = false, features = ["zstd"] } prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } @@ -72,6 +78,7 @@ regex-syntax = { version = "0.8" } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "rustls-tls-native-roots", "stream"] } rustls = { version = 
"0.23", default-features = false, features = ["logging", "ring", "std", "tls12"] } scopeguard = { version = "1" } +sec1 = { version = "0.7", features = ["pem", "serde", "std", "subtle"] } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["alloc", "raw_value"] } sha2 = { version = "0.10", features = ["asm", "oid"] } @@ -108,21 +115,19 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } -displaydoc = { version = "0.2" } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12" } -itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } +itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num = { version = "0.4" } -num-bigint = { version = "0.4" } +num-bigint = { version = "0.4", default-features = false, features = ["std"] } num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] }