Compare commits

..

10 Commits

Author SHA1 Message Date
Vlad Lazar
c4ce5dda5d tests: enable sampling with zero ration in tput tests 2025-03-07 17:47:50 +01:00
Vlad Lazar
74c555fd06 pageserver: add get page perf tracing
Sampling is done in page service after reading the request from the
wire. A completely separate span hierarchy is used for perf tracing.
The spans live in the `RequestContext` and span relationships are
expressed via the APIs exposed by `RequestContext`.
2025-03-07 17:47:50 +01:00
Vlad Lazar
716bb3c361 pageserver: thread otel dispatch into connection req context 2025-03-07 17:47:50 +01:00
Vlad Lazar
b90b945c02 pageserver: add perf span utilities to request context 2025-03-07 17:47:46 +01:00
Vlad Lazar
ac1159cb33 utils: allow for setting up OTEL tracing subscriber 2025-03-07 17:44:05 +01:00
Vlad Lazar
74a7f68da0 pageserver: add new runtime for performance spans 2025-03-07 17:44:05 +01:00
Vlad Lazar
2acca9fe33 tracing-utils: add perf span tracking utilities 2025-03-07 17:44:05 +01:00
Vlad Lazar
a2b9ff0d40 tracing-utils: allow for explicit export config 2025-03-07 17:44:05 +01:00
Vlad Lazar
a7f60dd5d0 pageserver: add tracing configuration knobs 2025-03-07 17:44:05 +01:00
Vlad Lazar
2a1cbca4e5 pageserver: clean up request context api
This commit removes `RequestContextBuilder::extend`.
Instead, callers should create attached children via
`RequestContextBuilder` interface.
2025-03-07 17:43:58 +01:00
255 changed files with 3758 additions and 8950 deletions

View File

@@ -0,0 +1,21 @@
## Release 202Y-MM-DD
**NB: this PR must be merged only by 'Create a merge commit'!**
### Checklist when preparing for release
- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b)
- [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers?
- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?
<!-- List everything that should be done **before** release, any issues / setting changes / etc -->
### Checklist after release
- [ ] Make sure instructions from PRs included in this release and labeled `manual_release_instructions` are executed (either by you or by people who wrote them).
- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files))
- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1)
- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time)
<!-- List everything that should be done **after** release, any admin UI configuration / Grafana dashboard / alert changes / setting changes / etc -->

View File

@@ -33,5 +33,3 @@ config-variables:
- NEON_PROD_AWS_ACCOUNT_ID
- AWS_ECR_REGION
- BENCHMARK_LARGE_OLTP_PROJECTID
- SLACK_ON_CALL_DEVPROD_STREAM
- SLACK_RUST_CHANNEL_ID

View File

@@ -1,16 +1,14 @@
import itertools
import json
import os
import sys
source_tag = os.getenv("SOURCE_TAG")
target_tag = os.getenv("TARGET_TAG")
branch = os.getenv("BRANCH")
dev_acr = os.getenv("DEV_ACR")
prod_acr = os.getenv("PROD_ACR")
dev_aws = os.getenv("DEV_AWS")
prod_aws = os.getenv("PROD_AWS")
aws_region = os.getenv("AWS_REGION")
build_tag = os.environ["BUILD_TAG"]
branch = os.environ["BRANCH"]
dev_acr = os.environ["DEV_ACR"]
prod_acr = os.environ["PROD_ACR"]
dev_aws = os.environ["DEV_AWS"]
prod_aws = os.environ["PROD_AWS"]
aws_region = os.environ["AWS_REGION"]
components = {
"neon": ["neon"],
@@ -41,23 +39,24 @@ registries = {
outputs: dict[str, dict[str, list[str]]] = {}
target_tags = [target_tag, "latest"] if branch == "main" else [target_tag]
target_stages = (
["dev", "prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"]
)
target_tags = [build_tag, "latest"] if branch == "main" else [build_tag]
target_stages = ["dev", "prod"] if branch.startswith("release") else ["dev"]
for component_name, component_images in components.items():
for stage in target_stages:
outputs[f"{component_name}-{stage}"] = {
f"docker.io/neondatabase/{component_image}:{source_tag}": [
f"{registry}/{component_image}:{tag}"
for registry, tag in itertools.product(registries[stage], target_tags)
if not (registry == "docker.io/neondatabase" and tag == source_tag)
outputs[f"{component_name}-{stage}"] = dict(
[
(
f"docker.io/neondatabase/{component_image}:{build_tag}",
[
f"{combo[0]}/{component_image}:{combo[1]}"
for combo in itertools.product(registries[stage], target_tags)
],
)
for component_image in component_images
]
for component_image in component_images
}
)
with open(os.getenv("GITHUB_OUTPUT", "/dev/null"), "a") as f:
with open(os.environ["GITHUB_OUTPUT"], "a") as f:
for key, value in outputs.items():
f.write(f"{key}={json.dumps(value)}\n")
print(f"Image map for {key}:\n{json.dumps(value, indent=2)}\n\n", file=sys.stderr)

View File

@@ -1,110 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
DOCS_URL="https://docs.neon.build/overview/repositories/neon.html"
message() {
if [[ -n "${GITHUB_PR_NUMBER:-}" ]]; then
gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --edit-last --body "$1" \
|| gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --body "$1"
fi
echo "$1"
}
report_error() {
message "$1
For more details, see the documentation: ${DOCS_URL}"
exit 1
}
case "$RELEASE_BRANCH" in
"release") COMPONENT="Storage" ;;
"release-proxy") COMPONENT="Proxy" ;;
"release-compute") COMPONENT="Compute" ;;
*)
report_error "Unknown release branch: ${RELEASE_BRANCH}"
;;
esac
# Identify main and release branches
MAIN_BRANCH="origin/main"
REMOTE_RELEASE_BRANCH="origin/${RELEASE_BRANCH}"
# Find merge base
MERGE_BASE=$(git merge-base "${MAIN_BRANCH}" "${REMOTE_RELEASE_BRANCH}")
echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}"
# Get the HEAD commit (last commit in PR, expected to be the merge commit)
LAST_COMMIT=$(git rev-parse HEAD)
MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}")
EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$"
if ! [[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then
report_error "Merge commit message does not match expected pattern: '<component> release YYYY-MM-DD'
Expected component: ${COMPONENT}
Found: '${MERGE_COMMIT_MESSAGE}'"
fi
echo "✅ Merge commit message is correctly formatted: '${MERGE_COMMIT_MESSAGE}'"
LAST_COMMIT_PARENTS=$(git cat-file -p "${LAST_COMMIT}" | jq -sR '[capture("parent (?<parent>[0-9a-f]{40})"; "g") | .parent]')
if [[ "$(echo "${LAST_COMMIT_PARENTS}" | jq 'length')" -ne 2 ]]; then
report_error "Last commit must be a merge commit with exactly two parents"
fi
EXPECTED_RELEASE_HEAD=$(git rev-parse "${REMOTE_RELEASE_BRANCH}")
if echo "${LAST_COMMIT_PARENTS}" | jq -e --arg rel "${EXPECTED_RELEASE_HEAD}" 'index($rel) != null' > /dev/null; then
LINEAR_HEAD=$(echo "${LAST_COMMIT_PARENTS}" | jq -r '[.[] | select(. != $rel)][0]' --arg rel "${EXPECTED_RELEASE_HEAD}")
else
report_error "Last commit must merge the release branch (${RELEASE_BRANCH})"
fi
echo "✅ Last commit correctly merges the previous commit and the release branch"
echo "Top commit of linear history: ${LINEAR_HEAD}"
MERGE_COMMIT_TREE=$(git rev-parse "${LAST_COMMIT}^{tree}")
LINEAR_HEAD_TREE=$(git rev-parse "${LINEAR_HEAD}^{tree}")
if [[ "${MERGE_COMMIT_TREE}" != "${LINEAR_HEAD_TREE}" ]]; then
report_error "Tree of merge commit (${MERGE_COMMIT_TREE}) does not match tree of linear history head (${LINEAR_HEAD_TREE})
This indicates that the merge of ${RELEASE_BRANCH} into this branch was not performed using the merge strategy 'ours'"
fi
echo "✅ Merge commit tree matches the linear history head"
EXPECTED_PREVIOUS_COMMIT="${LINEAR_HEAD}"
# Now traverse down the history, ensuring each commit has exactly one parent
CURRENT_COMMIT="${EXPECTED_PREVIOUS_COMMIT}"
while [[ "${CURRENT_COMMIT}" != "${MERGE_BASE}" && "${CURRENT_COMMIT}" != "${EXPECTED_RELEASE_HEAD}" ]]; do
CURRENT_COMMIT_PARENTS=$(git cat-file -p "${CURRENT_COMMIT}" | jq -sR '[capture("parent (?<parent>[0-9a-f]{40})"; "g") | .parent]')
if [[ "$(echo "${CURRENT_COMMIT_PARENTS}" | jq 'length')" -ne 1 ]]; then
report_error "Commit ${CURRENT_COMMIT} must have exactly one parent"
fi
NEXT_COMMIT=$(echo "${CURRENT_COMMIT_PARENTS}" | jq -r '.[0]')
if [[ "${NEXT_COMMIT}" == "${MERGE_BASE}" ]]; then
echo "✅ Reached merge base (${MERGE_BASE})"
PR_BASE="${MERGE_BASE}"
elif [[ "${NEXT_COMMIT}" == "${EXPECTED_RELEASE_HEAD}" ]]; then
echo "✅ Reached release branch (${EXPECTED_RELEASE_HEAD})"
PR_BASE="${EXPECTED_RELEASE_HEAD}"
elif [[ -z "${NEXT_COMMIT}" ]]; then
report_error "Unexpected end of commit history before reaching merge base"
fi
# Move to the next commit in the chain
CURRENT_COMMIT="${NEXT_COMMIT}"
done
echo "✅ All commits are properly ordered and linear"
echo "✅ Release PR structure is valid"
echo
message "Commits that are part of this release:
$(git log --oneline "${PR_BASE}..${LINEAR_HEAD}")"

View File

@@ -17,12 +17,6 @@
({};
.[$entry.component] |= (if . == null or $entry.version > .version then $entry else . end))
# Ensure that each component exists, or fail
| (["storage", "compute", "proxy"] - (keys)) as $missing
| if ($missing | length) > 0 then
"Error: Found no release for \($missing | join(", "))!\n" | halt_error(1)
else . end
# Convert the resulting object into an array of formatted strings
| to_entries
| map("\(.key)=\(.value.full)")

View File

@@ -7,8 +7,8 @@ on:
description: 'Component name'
required: true
type: string
source-branch:
description: 'Source branch'
release-branch:
description: 'Release branch'
required: true
type: string
secrets:
@@ -30,25 +30,17 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.source-branch }}
fetch-depth: 0
ref: main
- name: Set variables
id: vars
env:
COMPONENT_NAME: ${{ inputs.component-name }}
RELEASE_BRANCH: >-
${{
false
|| inputs.component-name == 'Storage' && 'release'
|| inputs.component-name == 'Proxy' && 'release-proxy'
|| inputs.component-name == 'Compute' && 'release-compute'
}}
RELEASE_BRANCH: ${{ inputs.release-branch }}
run: |
today=$(date +'%Y-%m-%d')
echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT}
echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT}
echo "release-branch=${RELEASE_BRANCH}" | tee -a ${GITHUB_OUTPUT}
- name: Configure git
run: |
@@ -57,36 +49,31 @@ jobs:
- name: Create RC branch
env:
RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
TITLE: ${{ steps.vars.outputs.title }}
run: |
git switch -c "${RC_BRANCH}"
git checkout -b "${RC_BRANCH}"
# Manually create a merge commit on the current branch, keeping the
# tree and setting the parents to the current HEAD and the HEAD of the
# release branch. This commit is what we'll fast-forward the release
# branch to when merging the release branch.
# For details on why, look at
# https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs
current_tree=$(git rev-parse 'HEAD^{tree}')
release_head=$(git rev-parse "origin/${RELEASE_BRANCH}")
current_head=$(git rev-parse HEAD)
merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}")
# Fast-forward the current branch to the newly created merge_commit
git merge --ff-only ${merge_commit}
# create an empty commit to distinguish workflow runs
# from other possible releases from the same commit
git commit --allow-empty -m "${TITLE}"
git push origin "${RC_BRANCH}"
- name: Create a PR into ${{ steps.vars.outputs.release-branch }}
- name: Create a PR into ${{ inputs.release-branch }}
env:
GH_TOKEN: ${{ secrets.ci-access-token }}
RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
RELEASE_BRANCH: ${{ inputs.release-branch }}
TITLE: ${{ steps.vars.outputs.title }}
run: |
cat << EOF > body.md
## ${TITLE}
**Please merge this Pull Request using 'Create a merge commit' button**
EOF
gh pr create --title "${TITLE}" \
--body "" \
--body-file "body.md" \
--head "${RC_BRANCH}" \
--base "${RELEASE_BRANCH}"

View File

@@ -19,18 +19,11 @@ on:
description: "Tag of the last compute release"
value: ${{ jobs.tags.outputs.compute }}
run-kind:
description: "The kind of run we're currently in. Will be one of `push-main`, `storage-release`, `compute-release`, `proxy-release`, `storage-rc-pr`, `compute-rc-pr`, `proxy-rc-pr`, `pr`, or `workflow-dispatch`"
description: "The kind of run we're currently in. Will be one of `pr`, `push-main`, `storage-rc`, `storage-release`, `proxy-rc`, `proxy-release`, `compute-rc`, `compute-release` or `merge_queue`"
value: ${{ jobs.tags.outputs.run-kind }}
release-pr-run-id:
description: "Only available if `run-kind in [storage-release, proxy-release, compute-release]`. Contains the run ID of the `Build and Test` workflow, assuming one with the current commit can be found."
value: ${{ jobs.tags.outputs.release-pr-run-id }}
permissions: {}
defaults:
run:
shell: bash -euo pipefail {0}
jobs:
tags:
runs-on: ubuntu-22.04
@@ -40,7 +33,6 @@ jobs:
proxy: ${{ steps.previous-releases.outputs.proxy }}
storage: ${{ steps.previous-releases.outputs.storage }}
run-kind: ${{ steps.run-kind.outputs.run-kind }}
release-pr-run-id: ${{ steps.release-pr-run-id.outputs.release-pr-run-id }}
permissions:
contents: read
steps:
@@ -63,7 +55,6 @@ jobs:
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-compute') && 'compute-rc-pr'
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr'
|| (inputs.github-event-name == 'pull_request') && 'pr'
|| (inputs.github-event-name == 'workflow_dispatch') && 'workflow-dispatch'
|| 'unknown'
}}
run: |
@@ -91,16 +82,9 @@ jobs:
echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
;;
pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr)
BUILD_AND_TEST_RUN_ID=$(gh api --paginate \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=${CURRENT_SHA}&branch=${CURRENT_BRANCH}" \
| jq '[.workflow_runs[] | select(.name == "Build and Test")][0].id // ("Error: No matching workflow run found." | halt_error(1))')
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
;;
workflow-dispatch)
echo "tag=$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT
;;
*)
echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!"
exit 1
@@ -117,13 +101,3 @@ jobs:
"/repos/${GITHUB_REPOSITORY}/releases" \
| jq -f .github/scripts/previous-releases.jq -r \
| tee -a "${GITHUB_OUTPUT}"
- name: Get the release PR run ID
id: release-pr-run-id
if: ${{ contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }}
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" | halt_error(1))')
echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT

View File

@@ -476,7 +476,7 @@ jobs:
(
!github.event.pull_request.draft
|| contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft')
|| needs.meta.outputs.run-kind == 'push-main'
|| contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind)
) && !failure() && !cancelled()
}}
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ]
@@ -487,7 +487,7 @@ jobs:
neon-image-arch:
needs: [ check-permissions, build-build-tools-image, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
strategy:
matrix:
arch: [ x64, arm64 ]
@@ -537,7 +537,7 @@ jobs:
neon-image:
needs: [ neon-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ubuntu-22.04
permissions:
id-token: write # aws-actions/configure-aws-credentials
@@ -559,7 +559,7 @@ jobs:
compute-node-image-arch:
needs: [ check-permissions, build-build-tools-image, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -651,7 +651,7 @@ jobs:
compute-node-image:
needs: [ compute-node-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -694,7 +694,7 @@ jobs:
vm-compute-node-image-arch:
needs: [ check-permissions, meta, compute-node-image ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
strategy:
fail-fast: false
@@ -747,7 +747,7 @@ jobs:
vm-compute-node-image:
needs: [ vm-compute-node-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ubuntu-22.04
strategy:
matrix:
@@ -773,12 +773,7 @@ jobs:
test-images:
needs: [ check-permissions, meta, neon-image, compute-node-image ]
# Depends on jobs that can get skipped
if: >-
${{
!failure()
&& !cancelled()
&& contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)
}}
if: "!failure() && !cancelled()"
strategy:
fail-fast: false
matrix:
@@ -805,7 +800,7 @@ jobs:
# Ensure that we don't have bad versions.
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
run: |
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
@@ -826,19 +821,19 @@ jobs:
env:
TAG: >-
${{
needs.meta.outputs.run-kind == 'compute-rc-pr'
contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.previous-storage-release
|| needs.meta.outputs.build-tag
}}
COMPUTE_TAG: >-
${{
contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.previous-compute-release
|| needs.meta.outputs.build-tag
}}
TEST_EXTENSIONS_TAG: >-
${{
contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
&& 'latest'
|| needs.meta.outputs.build-tag
}}
@@ -890,13 +885,7 @@ jobs:
id: generate
run: python3 .github/scripts/generate_image_maps.py
env:
SOURCE_TAG: >-
${{
contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.release-pr-run-id
|| needs.meta.outputs.build-tag
}}
TARGET_TAG: ${{ needs.meta.outputs.build-tag }}
BUILD_TAG: "${{ needs.meta.outputs.build-tag }}"
BRANCH: "${{ github.ref_name }}"
DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}"
PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}"
@@ -906,7 +895,7 @@ jobs:
push-neon-image-dev:
needs: [ meta, generate-image-maps, neon-image ]
if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
uses: ./.github/workflows/_push-to-container-registry.yml
permissions:
id-token: write # Required for aws/azure login
@@ -924,7 +913,7 @@ jobs:
push-compute-image-dev:
needs: [ meta, generate-image-maps, vm-compute-node-image ]
if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
uses: ./.github/workflows/_push-to-container-registry.yml
permissions:
id-token: write # Required for aws/azure login
@@ -978,55 +967,16 @@ jobs:
acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
secrets: inherit
push-neon-test-extensions-image-ghcr:
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
# This is a bit of a special case so we're not using a generated image map.
add-latest-tag-to-neon-extensions-test-image:
if: github.ref_name == 'main'
needs: [ meta, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [
"ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}"
],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [
"ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}"
]
}
secrets: inherit
add-latest-tag-to-neon-test-extensions-image:
if: ${{ needs.meta.outputs.run-kind == 'push-main' }}
needs: [ meta, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [
"docker.io/neondatabase/neon-test-extensions-v16:latest",
"ghcr.io/neondatabase/neon-test-extensions-v16:latest"
],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [
"docker.io/neondatabase/neon-test-extensions-v17:latest",
"ghcr.io/neondatabase/neon-test-extensions-v17:latest"
]
}
secrets: inherit
add-release-tag-to-neon-test-extensions-image:
if: ${{ needs.meta.outputs.run-kind == 'compute-release' }}
needs: [ meta, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.release-pr-run-id }}": [
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}",
"ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}"
],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.release-pr-run-id }}": [
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}",
"ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}"
]
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"]
}
secrets: inherit
@@ -1225,7 +1175,7 @@ jobs:
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=false \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}} \
@@ -1233,7 +1183,7 @@ jobs:
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
-f deployStorage=true \
-f deployStorageBroker=false \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}}
@@ -1281,11 +1231,11 @@ jobs:
payload: |
channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }}
text: |
🔴 <!subteam^S06CJ87UMNY|@oncall-storage>: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
promote-compatibility-data:
needs: [ meta, deploy ]
needs: [ deploy ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -1295,6 +1245,37 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR
id: fetch-last-release-pr-info
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
branch_name_and_pr_number=$(gh pr list \
--repo "${GITHUB_REPOSITORY}" \
--base release \
--state merged \
--limit 10 \
--json mergeCommit,headRefName,number \
--jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }")
branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name')
pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number')
run_id=$(gh run list \
--repo "${GITHUB_REPOSITORY}" \
--workflow build_and_test.yml \
--branch "${branch_name}" \
--json databaseId \
--limit 1 \
--jq '.[].databaseId')
last_commit_sha=$(gh pr view "${pr_number}" \
--repo "${GITHUB_REPOSITORY}" \
--json commits \
--jq '.commits[-1].oid')
echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}
- uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
@@ -1305,8 +1286,8 @@ jobs:
env:
BUCKET: neon-github-public-dev
AWS_REGION: eu-central-1
COMMIT_SHA: ${{ github.sha }}
RUN_ID: ${{ needs.meta.outputs.release-pr-run-id }}
COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }}
RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }}
run: |
old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}"
new_prefix="artifacts/latest"
@@ -1395,5 +1376,5 @@ jobs:
|| needs.files-changed.result == 'skipped'
|| (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|| (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind))
|| (needs.test-images.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|| needs.test-images.result == 'skipped'
|| (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))

View File

@@ -7,7 +7,7 @@ on:
required: false
type: string
schedule:
- cron: '0 10 * * *'
- cron: '0 0 * * *'
jobs:
cargo-deny:
@@ -50,9 +50,8 @@ jobs:
method: chat.postMessage
token: ${{ secrets.SLACK_BOT_TOKEN }}
payload: |
channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
channel: ${{ vars.SLACK_CICD_CHANNEL_ID }}
text: |
Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
Fixing the problem should be fairly straight forward from the logs. If not, <#${{ vars.SLACK_RUST_CHANNEL_ID }}> is there to help.
Pinging <!subteam^S0838JPSH32|@oncall-devprod>.
Pinging @oncall-devprod.

View File

@@ -1,36 +0,0 @@
name: Fast forward merge
on:
pull_request:
types: [labeled]
branches:
- release
- release-proxy
- release-compute
jobs:
fast-forward:
if: ${{ github.event.label.name == 'fast-forward' }}
runs-on: ubuntu-22.04
steps:
- name: Remove fast-forward label to PR
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
gh pr edit ${{ github.event.pull_request.number }} --repo "${GITHUB_REPOSITORY}" --remove-label "fast-forward"
- name: Fast forwarding
uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979
# See https://docs.github.com/en/graphql/reference/enums#mergestatestatus
if: ${{ github.event.pull_request.mergeable_state == 'clean' }}
with:
merge: true
comment: on-error
github_token: ${{ secrets.CI_ACCESS_TOKEN }}
- name: Comment if mergeable_state is not clean
if: ${{ github.event.pull_request.mergeable_state != 'clean' }}
run: |
gh pr comment ${{ github.event.pull_request.number }} \
--repo "${GITHUB_REPOSITORY}" \
--body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`."

View File

@@ -2,8 +2,8 @@ name: large oltp benchmark
on:
# uncomment to run on push for debugging your PR
#push:
# branches: [ bodobolero/synthetic_oltp_workload ]
push:
branches: [ bodobolero/synthetic_oltp_workload ]
schedule:
# * is a special character in YAML so you have to quote this string
@@ -12,7 +12,7 @@ on:
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 15 * * 0,2,4' # run on Sunday, Tuesday, Thursday at 3 PM UTC
- cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
workflow_dispatch: # adds ability to run this manually
defaults:
@@ -22,7 +22,7 @@ defaults:
concurrency:
# Allow only one workflow globally because we need dedicated resources which only exist once
group: large-oltp-bench-workflow
cancel-in-progress: false
cancel-in-progress: true
jobs:
oltp:
@@ -31,9 +31,9 @@ jobs:
matrix:
include:
- target: new_branch
custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
- target: reuse_branch
custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
permissions:
contents: write
@@ -46,6 +46,7 @@ jobs:
PG_VERSION: 16 # pre-determined by pre-determined project
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
PLATFORM: ${{ matrix.target }}
runs-on: [ self-hosted, us-east-2, x64 ]
@@ -56,10 +57,8 @@ jobs:
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
# Increase timeout to 2 days, default timeout is 6h - database maintenance can take a long time
# (normally 1h pgbench, 3h vacuum analyze 3.5h re-index) x 2 = 15h, leave some buffer for regressions
# in one run vacuum didn't finish within 12 hours
timeout-minutes: 2880
# Increase timeout to 8h, default timeout is 6h
timeout-minutes: 480
steps:
- uses: actions/checkout@v4
@@ -90,45 +89,29 @@ jobs:
- name: Set up Connection String
id: set-up-connstr
run: |
case "${{ matrix.target }}" in
new_branch)
CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
;;
reuse_branch)
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
;;
*)
echo >&2 "Unknown target=${{ matrix.target }}"
exit 1
;;
esac
case "${{ matrix.target }}" in
new_branch)
CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
;;
reuse_branch)
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
;;
*)
echo >&2 "Unknown target=${{ matrix.target }}"
exit 1
;;
esac
CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}"
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT
- name: Delete rows from prior runs in reuse branch
if: ${{ matrix.target == 'reuse_branch' }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }}
PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config
PSQL: /tmp/neon/pg_install/v16/bin/psql
PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib
run: |
echo "$(date '+%Y-%m-%d %H:%M:%S') - Deleting rows in table webhook.incoming_webhooks from prior runs"
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
${PSQL} "${BENCHMARK_CONNSTR}" -c "SET statement_timeout = 0; DELETE FROM webhook.incoming_webhooks WHERE created_at > '2025-02-27 23:59:59+00';"
echo "$(date '+%Y-%m-%d %H:%M:%S') - Finished deleting rows in table webhook.incoming_webhooks from prior runs"
- name: Benchmark pgbench with custom-scripts
- name: Benchmark pgbench with custom-scripts
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: true
extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_pgbench
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
pg_version: ${{ env.PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
@@ -136,21 +119,6 @@ jobs:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Benchmark database maintenance
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: true
extra_params: -m remote_cluster --timeout 172800 -k test_perf_oltp_large_tenant_maintenance
pg_version: ${{ env.PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Delete Neon Branch for large tenant
if: ${{ always() && matrix.target == 'new_branch' }}
uses: ./.github/actions/neon-branch-delete
@@ -159,13 +127,6 @@ jobs:
branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Configure AWS credentials # again because prior steps could have exceeded 5 hours
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}

View File

@@ -1,24 +0,0 @@
name: Lint Release PR
on:
pull_request:
branches:
- release
- release-proxy
- release-compute
jobs:
lint-release-pr:
runs-on: ubuntu-22.04
steps:
- name: Checkout PR branch
uses: actions/checkout@v4
with:
fetch-depth: 0 # Fetch full history for git operations
ref: ${{ github.event.pull_request.head.ref }}
- name: Run lint script
env:
RELEASE_BRANCH: ${{ github.base_ref }}
run: |
./.github/scripts/lint-release-pr.sh

View File

@@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# ┌───────────── hour (0 - 23)
# │ ┌───────────── day of the month (1 - 31)
# │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 */3 * * *' # Runs every 3 hours
# ┌───────────── minute (0 - 59)
# ┌───────────── hour (0 - 23)
# │ ┌───────────── day of the month (1 - 31)
# │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
workflow_dispatch: # Allows manual triggering of the workflow
inputs:
commit_hash:

View File

@@ -8,6 +8,8 @@ on:
- .github/workflows/build-build-tools-image.yml
- .github/workflows/pre-merge-checks.yml
merge_group:
branches:
- main
defaults:
run:
@@ -17,17 +19,15 @@ defaults:
permissions: {}
jobs:
meta:
get-changed-files:
runs-on: ubuntu-22.04
outputs:
python-changed: ${{ steps.python-src.outputs.any_changed }}
rust-changed: ${{ steps.rust-src.outputs.any_changed }}
branch: ${{ steps.group-metadata.outputs.branch }}
pr-number: ${{ steps.group-metadata.outputs.pr-number }}
steps:
- uses: actions/checkout@v4
- uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
id: python-src
with:
files: |
@@ -38,7 +38,7 @@ jobs:
poetry.lock
pyproject.toml
- uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
id: rust-src
with:
files: |
@@ -58,20 +58,12 @@ jobs:
echo "${PYTHON_CHANGED_FILES}"
echo "${RUST_CHANGED_FILES}"
- name: Merge group metadata
if: ${{ github.event_name == 'merge_group' }}
id: group-metadata
env:
MERGE_QUEUE_REF: ${{ github.event.merge_group.head_ref }}
run: |
echo $MERGE_QUEUE_REF | jq -Rr 'capture("refs/heads/gh-readonly-queue/(?<branch>.*)/pr-(?<pr_number>[0-9]+)-[0-9a-f]{40}") | ["branch=" + .branch, "pr-number=" + .pr_number] | .[]' | tee -a "${GITHUB_OUTPUT}"
build-build-tools-image:
if: |
false
|| needs.meta.outputs.python-changed == 'true'
|| needs.meta.outputs.rust-changed == 'true'
needs: [ meta ]
|| needs.get-changed-files.outputs.python-changed == 'true'
|| needs.get-changed-files.outputs.rust-changed == 'true'
needs: [ get-changed-files ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
# Build only one combination to save time
@@ -80,8 +72,8 @@ jobs:
secrets: inherit
check-codestyle-python:
if: needs.meta.outputs.python-changed == 'true'
needs: [ meta, build-build-tools-image ]
if: needs.get-changed-files.outputs.python-changed == 'true'
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-python.yml
with:
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
@@ -89,8 +81,8 @@ jobs:
secrets: inherit
check-codestyle-rust:
if: needs.meta.outputs.rust-changed == 'true'
needs: [ meta, build-build-tools-image ]
if: needs.get-changed-files.outputs.rust-changed == 'true'
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-rust.yml
with:
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
@@ -109,7 +101,7 @@ jobs:
statuses: write # for `github.repos.createCommitStatus(...)`
contents: write
needs:
- meta
- get-changed-files
- check-codestyle-python
- check-codestyle-rust
runs-on: ubuntu-22.04
@@ -137,20 +129,7 @@ jobs:
run: exit 1
if: |
false
|| (github.event_name == 'merge_group' && needs.meta.outputs.branch != 'main')
|| (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.python-changed == 'true')
|| (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.rust-changed == 'true')
|| (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true')
|| (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true')
|| contains(needs.*.result, 'failure')
|| contains(needs.*.result, 'cancelled')
- name: Add fast-forward label to PR to trigger fast-forward merge
if: >-
${{
always()
&& github.event_name == 'merge_group'
&& contains(fromJson('["release", "release-proxy", "release-compute"]'), needs.meta.outputs.branch)
}}
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: >-
gh pr edit ${{ needs.meta.outputs.pr-number }} --repo "${GITHUB_REPOSITORY}" --add-label "fast-forward"

View File

@@ -38,7 +38,7 @@ jobs:
uses: ./.github/workflows/_create-release-pr.yml
with:
component-name: 'Storage'
source-branch: ${{ github.ref_name }}
release-branch: 'release'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
uses: ./.github/workflows/_create-release-pr.yml
with:
component-name: 'Proxy'
source-branch: ${{ github.ref_name }}
release-branch: 'release-proxy'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -64,6 +64,6 @@ jobs:
uses: ./.github/workflows/_create-release-pr.yml
with:
component-name: 'Compute'
source-branch: ${{ github.ref_name }}
release-branch: 'release-compute'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}

View File

@@ -1,8 +1,8 @@
# Autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling
# DevProd & PerfCorr
/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness
# DevProd
/.github/ @neondatabase/developer-productivity
# Compute
/pgxn/ @neondatabase/compute

429
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -106,13 +106,13 @@ hostname = "0.4"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
http-body-util = "0.1.2"
humantime = "2.2"
humantime = "2.1"
humantime-serde = "1.1.1"
hyper0 = { package = "hyper", version = "0.14" }
hyper = "1.4"
hyper-util = "0.1"
tokio-tungstenite = "0.21.0"
indexmap = { version = "2", features = ["serde"] }
indexmap = "2"
indoc = "2"
ipnet = "2.10.0"
itertools = "0.10"
@@ -126,9 +126,7 @@ measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.9"
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
# Do not update to >= 7.0.0, at least. The update will have a significant impact
# on compute startup metrics (start_postgres_ms), >= 25% degradation.
notify = "6.0.0"
notify = "8.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
@@ -141,7 +139,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "protobuf", "protobuf-codec"] }
procfs = "0.16"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
prost = "0.13"
@@ -157,7 +155,6 @@ rpds = "0.13"
rustc-hash = "1.1.0"
rustls = { version = "0.23.16", default-features = false }
rustls-pemfile = "2"
rustls-pki-types = "1.11"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
@@ -221,7 +218,7 @@ zerocopy = { version = "0.7", features = ["derive"] }
json-structural-diff = { version = "0.2.0" }
## TODO replace this with tracing
env_logger = "0.11"
env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed

View File

@@ -1735,8 +1735,6 @@ RUN set -e \
libevent-dev \
libtool \
pkg-config \
libcurl4-openssl-dev \
libssl-dev \
&& apt clean && rm -rf /var/lib/apt/lists/*
# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
@@ -1745,7 +1743,7 @@ RUN set -e \
&& git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
&& cd pgbouncer \
&& ./autogen.sh \
&& ./configure --prefix=/usr/local/pgbouncer \
&& ./configure --prefix=/usr/local/pgbouncer --without-openssl \
&& make -j $(nproc) dist_man_MANS= \
&& make install dist_man_MANS=
@@ -1982,10 +1980,12 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
# rsyslog config permissions
# directory for rsyslogd pid file
RUN mkdir /var/run/rsyslogd && \
chown -R postgres:postgres /var/run/rsyslogd && \
chown -R postgres:postgres /etc/rsyslog.d/
RUN chown postgres:postgres /etc/rsyslog.conf && \
touch /etc/compute_rsyslog.conf && \
chown -R postgres:postgres /etc/compute_rsyslog.conf && \
# directory for rsyslogd pid file
mkdir /var/run/rsyslogd && \
chown -R postgres:postgres /var/run/rsyslogd
ENV LANG=en_US.utf8

View File

@@ -1,5 +1 @@
SELECT sum(pg_database_size(datname)) AS total
FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2;
SELECT sum(pg_database_size(datname)) AS total FROM pg_database;

View File

@@ -1,20 +1,10 @@
-- We export stats for 10 non-system databases. Without this limit it is too
-- easy to abuse the system by creating lots of databases.
SELECT pg_database_size(datname) AS db_size,
deadlocks,
tup_inserted AS inserted,
tup_updated AS updated,
tup_deleted AS deleted,
datname
SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted,
tup_updated AS updated, tup_deleted AS deleted, datname
FROM pg_stat_database
WHERE datname IN (
SELECT datname FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2
AND datname <> 'postgres'
AND NOT datistemplate
ORDER BY oid
LIMIT 10
WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10
);

View File

@@ -0,0 +1,265 @@
commit 00aa659afc9c7336ab81036edec3017168aabf40
Author: Heikki Linnakangas <heikki@neon.tech>
Date: Tue Nov 12 16:59:19 2024 +0200
Temporarily disable test that depends on timezone
diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out
index 23ef5fa..9e60deb 100644
--- a/ext-src/pg_anon-src/tests/expected/generalization.out
+++ b/ext-src/pg_anon-src/tests/expected/generalization.out
@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century');
["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST")
(1 row)
-SELECT anon.generalize_tstzrange('19041107','millennium');
- generalize_tstzrange
------------------------------------------------------------------
- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST")
-(1 row)
-
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
generalize_daterange
diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql
index b868344..b4fc977 100644
--- a/ext-src/pg_anon-src/tests/sql/generalization.sql
+++ b/ext-src/pg_anon-src/tests/sql/generalization.sql
@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month');
SELECT anon.generalize_tstzrange('19041107','year');
SELECT anon.generalize_tstzrange('19041107','decade');
SELECT anon.generalize_tstzrange('19041107','century');
-SELECT anon.generalize_tstzrange('19041107','millennium');
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
Author: Alexey Masterov <alexeymasterov@neon.tech>
Date: Fri May 31 06:34:26 2024 +0000
These alternative expected files were added to consider the neon features
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
new file mode 100644
index 0000000..2539cfd
--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
@@ -0,0 +1,101 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE: installing required extension "pgcrypto"
+SELECT anon.init();
+ init
+------
+ t
+(1 row)
+
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE mallory_the_masked_user;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.init();
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.anonymize_table('t1');
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ERROR: Only supersusers can start the dynamic masking engine.
+CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT * FROM mask.t1;
+ i | t
+---+---
+ 1 |
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ SELECT * FROM public.t1;
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR: Only supersusers can stop the dynamic masking engine.
+CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking
+----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR: permission denied
+DETAIL: The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
new file mode 100644
index 0000000..8b090fe
--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
@@ -0,0 +1,104 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE: installing required extension "pgcrypto"
+SELECT anon.init();
+ init
+------
+ t
+(1 row)
+
+CREATE ROLE oscar_the_owner;
+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE oscar_the_owner;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.init();
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+SELECT anon.anonymize_table('t1');
+ anonymize_table
+-----------------
+ t
+(1 row)
+
+SELECT * FROM t1;
+ i | t
+---+---
+ 1 |
+(1 row)
+
+UPDATE t1 SET t='test' WHERE i=1;
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+SELECT * FROM t1;
+ i | t
+---+------
+ 1 | test
+(1 row)
+
+--SELECT * FROM mask.t1;
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR: permission denied for schema mask
+CONTEXT: SQL statement "DROP VIEW mask.t1;"
+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
+SQL statement "SELECT anon.mask_drop_view(oid)
+ FROM pg_catalog.pg_class
+ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
+ AND relkind IN ('r','p','f')"
+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking
+----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR: permission denied
+DETAIL: The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;

View File

@@ -39,17 +39,6 @@ commands:
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
# Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also.
# We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to
# use a different path for the socket. The symlink actually points to our custom path.
- name: rsyslogd-socket-symlink
user: root
sysvInitAction: sysinit
shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log"
- name: rsyslogd
user: postgres
sysvInitAction: respawn
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
@@ -80,15 +69,6 @@ files:
}
memory {}
}
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
# compute_ctl will rewrite this file with the actual configuration, if needed.
- filename: compute_rsyslog.conf
content: |
# Syslock.Name specifies a non-default pipe location that is writeable for the postgres user.
module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging
*.* /dev/null
$IncludeConfig /etc/rsyslog.d/*.conf
build: |
# Build cgroup-tools
#
@@ -152,12 +132,6 @@ merge: |
RUN set -e \
&& chmod 0644 /etc/cgconfig.conf
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/

View File

@@ -39,17 +39,6 @@ commands:
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
# Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also.
# We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to
# use a different path for the socket. The symlink actually points to our custom path.
- name: rsyslogd-socket-symlink
user: root
sysvInitAction: sysinit
shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log"
- name: rsyslogd
user: postgres
sysvInitAction: respawn
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
@@ -80,15 +69,6 @@ files:
}
memory {}
}
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
# compute_ctl will rewrite this file with the actual configuration, if needed.
- filename: compute_rsyslog.conf
content: |
# Syslock.Name specifies a non-default pipe location that is writeable for the postgres user.
module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging
*.* /dev/null
$IncludeConfig /etc/rsyslog.d/*.conf
build: |
# Build cgroup-tools
#
@@ -148,11 +128,6 @@ merge: |
RUN set -e \
&& chmod 0644 /etc/cgconfig.conf
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/

View File

@@ -26,7 +26,6 @@ fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
indexmap.workspace = true
jsonwebtoken.workspace = true
metrics.workspace = true
nix.workspace = true
@@ -35,19 +34,16 @@ num_cpus.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
opentelemetry_sdk.workspace = true
p256 = { version = "0.13", features = ["pem"] }
postgres.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["json"] }
ring = "0.17"
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
spki = { version = "0.7.3", features = ["std"] }
tar.workspace = true
tower.workspace = true
tower-http.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-util.workspace = true
@@ -61,7 +57,6 @@ thiserror.workspace = true
url.workspace = true
uuid.workspace = true
walkdir.workspace = true
x509-cert = { version = "0.2.5" }
postgres_initdb.workspace = true
compute_api.workspace = true

View File

@@ -592,6 +592,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
utils::logging::init(
utils::logging::LogFormat::Json,
utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
utils::logging::OtelEnablement::Disabled,
utils::logging::Output::Stdout,
)?;

View File

@@ -37,14 +37,10 @@ use crate::logger::startup_context_from_env;
use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
use crate::monitor::launch_monitor;
use crate::pg_helpers::*;
use crate::rsyslog::{
PostgresLogsRsyslogConfig, configure_audit_rsyslog, configure_postgres_logs_export,
launch_pgaudit_gc,
};
use crate::rsyslog::configure_and_start_rsyslog;
use crate::spec::*;
use crate::swap::resize_swap;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
use crate::tls::watch_cert_for_changes;
use crate::{config, extension_server, local_proxy};
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
@@ -116,7 +112,6 @@ pub struct ComputeNode {
// key: ext_archive_name, value: started download time, download_completed?
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
pub compute_ctl_config: ComputeCtlConfig,
}
// store some metrics about download size that might impact startup time
@@ -140,6 +135,8 @@ pub struct ComputeState {
/// passed by the control plane with a /configure HTTP request.
pub pspec: Option<ParsedSpec>,
pub compute_ctl_config: ComputeCtlConfig,
/// If the spec is passed by a /configure request, 'startup_span' is the
/// /configure request's tracing span. The main thread enters it when it
/// processes the compute startup, so that the compute startup is considered
@@ -163,6 +160,7 @@ impl ComputeState {
last_active: None,
error: None,
pspec: None,
compute_ctl_config: ComputeCtlConfig::default(),
startup_span: None,
metrics: ComputeMetrics::default(),
}
@@ -299,6 +297,79 @@ struct StartVmMonitorResult {
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String {
let roles = spec
.cluster
.roles
.iter()
.map(|r| escape_literal(&r.name))
.collect::<Vec<_>>();
let dbs = spec
.cluster
.databases
.iter()
.map(|db| escape_literal(&db.name))
.collect::<Vec<_>>();
let roles_decl = if roles.is_empty() {
String::from("roles text[] := NULL;")
} else {
format!(
r#"
roles text[] := ARRAY(SELECT rolname
FROM pg_catalog.pg_roles
WHERE rolname IN ({}));"#,
roles.join(", ")
)
};
let database_decl = if dbs.is_empty() {
String::from("dbs text[] := NULL;")
} else {
format!(
r#"
dbs text[] := ARRAY(SELECT datname
FROM pg_catalog.pg_database
WHERE datname IN ({}));"#,
dbs.join(", ")
)
};
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
let query = format!(
r#"
DO $$
DECLARE
r text;
{}
{}
BEGIN
IF NOT EXISTS (
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
IF array_length(roles, 1) IS NOT NULL THEN
EXECUTE format('GRANT neon_superuser TO %s',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
FOREACH r IN ARRAY roles LOOP
EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
END LOOP;
END IF;
IF array_length(dbs, 1) IS NOT NULL THEN
EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
END IF;
END IF;
END
$$;"#,
roles_decl, database_decl,
);
query
}
impl ComputeNode {
pub fn new(
params: ComputeNodeParams,
@@ -316,6 +387,7 @@ impl ComputeNode {
let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?;
new_state.pspec = Some(pspec);
}
new_state.compute_ctl_config = compute_ctl_config;
Ok(ComputeNode {
params,
@@ -324,7 +396,6 @@ impl ComputeNode {
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_download_progress: RwLock::new(HashMap::new()),
compute_ctl_config,
})
}
@@ -347,7 +418,7 @@ impl ComputeNode {
// requests while configuration is still in progress.
crate::http::server::Server::External {
port: this.params.external_http_port,
config: this.compute_ctl_config.clone(),
jwks: this.state.lock().unwrap().compute_ctl_config.jwks.clone(),
compute_id: this.params.compute_id.clone(),
}
.launch(&this);
@@ -526,16 +597,6 @@ impl ComputeNode {
// Collect all the tasks that must finish here
let mut pre_tasks = tokio::task::JoinSet::new();
// Make sure TLS certificates are properly loaded and in the right place.
if self.compute_ctl_config.tls.is_some() {
let this = self.clone();
pre_tasks.spawn(async move {
this.watch_cert_for_changes().await;
Ok::<(), anyhow::Error>(())
});
}
// If there are any remote extensions in shared_preload_libraries, start downloading them
if pspec.spec.remote_extensions.is_some() {
let (this, spec) = (self.clone(), pspec.spec.clone());
@@ -591,13 +652,11 @@ impl ComputeNode {
if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
info!("tuning pgbouncer");
let pgbouncer_settings = pgbouncer_settings.clone();
let tls_config = self.compute_ctl_config.tls.clone();
// Spawn a background task to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = pgbouncer_settings.clone();
let _handle = tokio::spawn(async move {
let res = tune_pgbouncer(pgbouncer_settings, tls_config).await;
let res = tune_pgbouncer(pgbouncer_settings).await;
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
// Continue with the startup anyway
@@ -620,7 +679,7 @@ impl ComputeNode {
});
}
// Configure and start rsyslog for HIPAA if necessary
// Configure and start rsyslog if necessary
if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_endpoint.is_empty() {
@@ -628,22 +687,13 @@ impl ComputeNode {
}
let log_directory_path = Path::new(&self.params.pgdata).join("log");
let log_directory_path = log_directory_path.to_string_lossy().to_string();
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
}
// Configure and start rsyslog for Postgres logs export
if self.has_feature(ComputeFeature::PostgresLogsExport) {
if let Some(ref project_id) = pspec.spec.cluster.cluster_id {
let host = PostgresLogsRsyslogConfig::default_host(project_id);
let conf = PostgresLogsRsyslogConfig::new(Some(&host));
configure_postgres_logs_export(conf)?;
} else {
warn!("not configuring rsyslog for Postgres logs export: project ID is missing")
}
// TODO: make this more robust
// now rsyslog starts once and there is no monitoring or restart if it fails
configure_and_start_rsyslog(
log_directory_path.to_str().unwrap(),
"hipaa",
&remote_endpoint,
)?;
}
// Launch remaining service threads
@@ -668,9 +718,9 @@ impl ComputeNode {
if pspec.spec.mode == ComputeMode::Primary {
self.configure_as_primary(&compute_state)?;
let conf = self.get_tokio_conn_conf(None);
tokio::task::spawn(async {
let res = get_installed_extensions(conf).await;
let conf = self.get_conn_conf(None);
tokio::task::spawn_blocking(|| {
let res = get_installed_extensions(conf);
match res {
Ok(extensions) => {
info!(
@@ -1128,10 +1178,9 @@ impl ComputeNode {
// Remove/create an empty pgdata directory and put configuration there.
self.create_pgdata()?;
config::write_postgres_conf(
pgdata_path,
&pgdata_path.join("postgresql.conf"),
&pspec.spec,
self.params.internal_http_port,
&self.compute_ctl_config.tls,
)?;
// Syncing safekeepers is only safe with primary nodes: if a primary
@@ -1513,13 +1562,11 @@ impl ComputeNode {
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
info!("tuning pgbouncer");
let pgbouncer_settings = pgbouncer_settings.clone();
let tls_config = self.compute_ctl_config.tls.clone();
// Spawn a background task to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = pgbouncer_settings.clone();
tokio::spawn(async move {
let res = tune_pgbouncer(pgbouncer_settings, tls_config).await;
let res = tune_pgbouncer(pgbouncer_settings).await;
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
}
@@ -1531,8 +1578,7 @@ impl ComputeNode {
// Spawn a background task to do the configuration,
// so that we don't block the main thread that starts Postgres.
let mut local_proxy = local_proxy.clone();
local_proxy.tls = self.compute_ctl_config.tls.clone();
let local_proxy = local_proxy.clone();
tokio::spawn(async move {
if let Err(err) = local_proxy::configure(&local_proxy) {
error!("error while configuring local_proxy: {err:?}");
@@ -1542,12 +1588,8 @@ impl ComputeNode {
// Write new config
let pgdata_path = Path::new(&self.params.pgdata);
config::write_postgres_conf(
pgdata_path,
&spec,
self.params.internal_http_port,
&self.compute_ctl_config.tls,
)?;
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
config::write_postgres_conf(&postgresql_conf_path, &spec, self.params.internal_http_port)?;
if !spec.skip_pg_catalog_updates {
let max_concurrent_connections = spec.reconfigure_concurrency;
@@ -1618,56 +1660,6 @@ impl ComputeNode {
Ok(())
}
pub async fn watch_cert_for_changes(self: Arc<Self>) {
// update status on cert renewal
if let Some(tls_config) = &self.compute_ctl_config.tls {
let tls_config = tls_config.clone();
// wait until the cert exists.
let mut cert_watch = watch_cert_for_changes(tls_config.cert_path.clone()).await;
tokio::task::spawn_blocking(move || {
let handle = tokio::runtime::Handle::current();
'cert_update: loop {
// let postgres/pgbouncer/local_proxy know the new cert/key exists.
// we need to wait until it's configurable first.
let mut state = self.state.lock().unwrap();
'status_update: loop {
match state.status {
// let's update the state to config pending
ComputeStatus::ConfigurationPending | ComputeStatus::Running => {
state.set_status(
ComputeStatus::ConfigurationPending,
&self.state_changed,
);
break 'status_update;
}
// exit loop
ComputeStatus::Failed
| ComputeStatus::TerminationPending
| ComputeStatus::Terminated => break 'cert_update,
// wait
ComputeStatus::Init
| ComputeStatus::Configuration
| ComputeStatus::Empty => {
state = self.state_changed.wait(state).unwrap();
}
}
}
drop(state);
// wait for a new certificate update
if handle.block_on(cert_watch.changed()).is_err() {
break;
}
}
});
}
}
/// Update the `last_active` in the shared state, but ensure that it's a more recent one.
pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
let mut state = self.state.lock().unwrap();

View File

@@ -6,13 +6,11 @@ use std::io::Write;
use std::io::prelude::*;
use std::path::Path;
use compute_api::responses::TlsConfig;
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption};
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
use crate::pg_helpers::{
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
};
use crate::tls::{self, SERVER_CRT, SERVER_KEY};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -40,12 +38,10 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
/// Create or completely rewrite configuration file specified by `path`
pub fn write_postgres_conf(
pgdata_path: &Path,
path: &Path,
spec: &ComputeSpec,
extension_server_port: u16,
tls_config: &Option<TlsConfig>,
) -> Result<()> {
let path = pgdata_path.join("postgresql.conf");
// File::create() destroys the file content if it exists.
let mut file = File::create(path)?;
@@ -90,20 +86,6 @@ pub fn write_postgres_conf(
)?;
}
// tls
if let Some(tls_config) = tls_config {
writeln!(file, "ssl = on")?;
// postgres requires the keyfile to be in a secure file,
// currently too complicated to ensure that at the VM level,
// so we just copy them to another file instead. :shrug:
tls::update_key_path_blocking(pgdata_path, tls_config);
// these are the default, but good to be explicit.
writeln!(file, "ssl_cert_file = '{}'", SERVER_CRT)?;
writeln!(file, "ssl_key_file = '{}'", SERVER_KEY)?;
}
// Locales
if cfg!(target_os = "macos") {
writeln!(file, "lc_messages='C'")?;
@@ -167,8 +149,7 @@ pub fn write_postgres_conf(
writeln!(file, "# Managed by compute_ctl audit settings: begin")?;
// This log level is very verbose
// but this is necessary for HIPAA compliance.
// Exclude 'misc' category, because it doesn't contain anythig relevant.
writeln!(file, "pgaudit.log='all, -misc'")?;
writeln!(file, "pgaudit.log='all'")?;
writeln!(file, "pgaudit.log_parameter=on")?;
// Disable logging of catalog queries
// The catalog doesn't contain sensitive data, so we don't need to audit it.
@@ -216,12 +197,6 @@ pub fn write_postgres_conf(
writeln!(file, "neon.disable_logical_replication_subscribers=false")?;
}
// We need Postgres to send logs to rsyslog so that we can forward them
// further to customers' log aggregation systems.
if spec.features.contains(&ComputeFeature::PostgresLogsExport) {
writeln!(file, "log_destination='stderr,syslog'")?;
}
// This is essential to keep this line at the end of the file,
// because it is intended to override any settings above.
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;

View File

@@ -1,10 +0,0 @@
# Program name comes from postgres' syslog_facility configuration: https://www.postgresql.org/docs/current/runtime-config-logging.html#GUC-SYSLOG-IDENT
# Default value is 'postgres'.
if $programname == 'postgres' then {{
# Forward Postgres logs to telemetry otel collector
action(type="omfwd" target="{logs_export_target}" port="{logs_export_port}" protocol="tcp"
template="RSYSLOG_SyslogProtocol23Format"
action.resumeRetryCount="3"
queue.type="linkedList" queue.size="1000")
stop
}}

View File

@@ -4,8 +4,7 @@ module(load="imfile")
# Input configuration for log files in the specified directory
# Replace {log_directory} with the directory containing the log files
input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
# the directory to store rsyslog state files
global(workDirectory="/var/log/rsyslog")
global(workDirectory="/var/log")
# Forward logs to remote syslog server
*.* @@{remote_endpoint}
*.* @@{remote_endpoint}

View File

@@ -202,24 +202,8 @@ pub async fn download_extension(
// move contents of the libdir / sharedir in unzipped archive to the correct local paths
for paths in [sharedir_paths, libdir_paths] {
let (zip_dir, real_dir) = paths;
let dir = match std::fs::read_dir(&zip_dir) {
Ok(dir) => dir,
Err(e) => match e.kind() {
// In the event of a SQL-only extension, there would be nothing
// to move from the lib/ directory, so note that in the log and
// move on.
std::io::ErrorKind::NotFound => {
info!("nothing to move from {}", zip_dir);
continue;
}
_ => return Err(anyhow::anyhow!(e)),
},
};
info!("mv {zip_dir:?}/* {real_dir:?}");
for file in dir {
for file in std::fs::read_dir(zip_dir)? {
let old_file = file?.path();
let new_file =
Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);

View File

@@ -1,2 +1 @@
pub(in crate::http) mod authorize;
pub(in crate::http) mod request_id;

View File

@@ -1,16 +0,0 @@
use axum::{extract::Request, middleware::Next, response::Response};
use uuid::Uuid;
use crate::http::headers::X_REQUEST_ID;
/// This middleware function allows compute_ctl to generate its own request ID
/// if one isn't supplied. The control plane will always send one as a UUID. The
/// neon Postgres extension on the other hand does not send one.
pub async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
let headers = request.headers_mut();
if !headers.contains_key(X_REQUEST_ID) {
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
}
next.run(request).await
}

View File

@@ -306,36 +306,6 @@ paths:
schema:
$ref: "#/components/schemas/GenericError"
/configure_telemetry:
post:
tags:
- Configure
summary: Configure rsyslog
description: |
This API endpoint configures rsyslog to forward Postgres logs
to a specified otel collector.
operationId: configureTelemetry
requestBody:
required: true
content:
application/json:
schema:
type: object
properties:
logs_export_host:
type: string
description: |
Hostname and the port of the otel collector. Leave empty to disable logs forwarding.
Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526
responses:
204:
description: "Telemetry configured successfully"
500:
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:
JWT:

View File

@@ -1,11 +1,9 @@
use std::sync::Arc;
use axum::body::Body;
use axum::extract::State;
use axum::response::Response;
use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
use compute_api::spec::ComputeFeature;
use http::StatusCode;
use tokio::task;
use tracing::info;
@@ -13,7 +11,6 @@ use tracing::info;
use crate::compute::{ComputeNode, ParsedSpec};
use crate::http::JsonResponse;
use crate::http::extract::Json;
use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export};
// Accept spec in JSON format and request compute configuration. If anything
// goes wrong after we set the compute status to `ConfigurationPending` and
@@ -95,25 +92,3 @@ pub(in crate::http) async fn configure(
JsonResponse::success(StatusCode::OK, body)
}
pub(in crate::http) async fn configure_telemetry(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigureTelemetryRequest>,
) -> Response {
if !compute.has_feature(ComputeFeature::PostgresLogsExport) {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"Postgres logs export feature is not enabled".to_string(),
);
}
let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref());
if let Err(err) = configure_postgres_logs_export(conf) {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string());
}
Response::builder()
.status(StatusCode::NO_CONTENT)
.body(Body::from(""))
.unwrap()
}

View File

@@ -5,19 +5,20 @@ use std::time::Duration;
use anyhow::Result;
use axum::Router;
use axum::middleware::{self};
use axum::response::IntoResponse;
use axum::extract::Request;
use axum::middleware::{self, Next};
use axum::response::{IntoResponse, Response};
use axum::routing::{get, post};
use compute_api::responses::ComputeCtlConfig;
use http::StatusCode;
use jsonwebtoken::jwk::JwkSet;
use tokio::net::TcpListener;
use tower::ServiceBuilder;
use tower_http::{
auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer,
};
use tracing::{Span, error, info};
use uuid::Uuid;
use super::middleware::request_id::maybe_add_request_id_header;
use super::{
headers::X_REQUEST_ID,
middleware::authorize::Authorize,
@@ -41,7 +42,7 @@ pub enum Server {
},
External {
port: u16,
config: ComputeCtlConfig,
jwks: JwkSet,
compute_id: String,
},
}
@@ -79,7 +80,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
router
}
Server::External {
config, compute_id, ..
jwks, compute_id, ..
} => {
let unauthenticated_router =
Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
@@ -87,7 +88,6 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/configure_telemetry", post(configure::configure_telemetry))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))
@@ -96,7 +96,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
.route("/terminate", post(terminate::terminate))
.layer(AsyncRequireAuthorizationLayer::new(Authorize::new(
compute_id.clone(),
config.jwks.clone(),
jwks.clone(),
)));
router
@@ -219,3 +219,15 @@ impl Server {
tokio::spawn(self.serve(state));
}
}
/// This middleware function allows compute_ctl to generate its own request ID
/// if one isn't supplied. The control plane will always send one as a UUID. The
/// neon Postgres extension on the other hand does not send one.
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
let headers = request.headers_mut();
if headers.get(X_REQUEST_ID).is_none() {
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
}
next.run(request).await
}

View File

@@ -2,7 +2,7 @@ use std::collections::HashMap;
use anyhow::Result;
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use tokio_postgres::{Client, Config, NoTls};
use postgres::{Client, NoTls};
use crate::metrics::INSTALLED_EXTENSIONS;
@@ -10,7 +10,7 @@ use crate::metrics::INSTALLED_EXTENSIONS;
/// and to make database listing query here more explicit.
///
/// Limit the number of databases to 500 to avoid excessive load.
async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
// `pg_database.datconnlimit = -2` means that the database is in the
// invalid state
let databases = client
@@ -20,8 +20,7 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
AND datconnlimit <> - 2
LIMIT 500",
&[],
)
.await?
)?
.iter()
.map(|row| {
let db: String = row.get("datname");
@@ -37,36 +36,20 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
/// Same extension can be installed in multiple databases with different versions,
/// so we report a separate metric (number of databases where it is installed)
/// for each extension version.
pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> {
pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
conf.application_name("compute_ctl:get_installed_extensions");
let databases: Vec<String> = {
let (mut client, connection) = conf.connect(NoTls).await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
list_dbs(&mut client).await?
};
let mut client = conf.connect(NoTls)?;
let databases: Vec<String> = list_dbs(&mut client)?;
let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new();
for db in databases.iter() {
conf.dbname(db);
let (client, connection) = conf.connect(NoTls).await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
let extensions: Vec<(String, String, i32)> = client
let mut db_client = conf.connect(NoTls)?;
let extensions: Vec<(String, String, i32)> = db_client
.query(
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
&[],
)
.await?
)?
.iter()
.map(|row| {
(

View File

@@ -26,4 +26,3 @@ pub mod spec;
mod spec_apply;
pub mod swap;
pub mod sync_sk;
pub mod tls;

View File

@@ -1,8 +1,6 @@
use metrics::core::{AtomicF64, Collector, GenericGauge};
use metrics::core::Collector;
use metrics::proto::MetricFamily;
use metrics::{
IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec,
};
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
use once_cell::sync::Lazy;
pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
@@ -61,20 +59,10 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(||
.expect("failed to define a metric")
});
// Size of audit log directory in bytes
pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy<GenericGauge<AtomicF64>> = Lazy::new(|| {
register_gauge!(
"compute_audit_log_dir_size",
"Size of audit log directory in bytes",
)
.expect("failed to define a metric")
});
pub fn collect() -> Vec<MetricFamily> {
let mut metrics = INSTALLED_EXTENSIONS.collect();
metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
metrics.extend(DB_MIGRATION_FAILED.collect());
metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
metrics
}

View File

@@ -10,10 +10,8 @@ use std::str::FromStr;
use std::time::{Duration, Instant};
use anyhow::{Result, bail};
use compute_api::responses::TlsConfig;
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
use futures::StreamExt;
use indexmap::IndexMap;
use ini::Ini;
use notify::{RecursiveMode, Watcher};
use postgres::config::Config;
@@ -188,40 +186,15 @@ impl DatabaseExt for Database {
/// Postgres SQL queries and DATABASE_URL.
pub trait Escaping {
fn pg_quote(&self) -> String;
fn pg_quote_dollar(&self) -> (String, String);
}
impl Escaping for PgIdent {
/// This is intended to mimic Postgres quote_ident(), but for simplicity it
/// always quotes provided string with `""` and escapes every `"`.
/// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
/// N.B. it's not useful for escaping identifiers that are used inside WHERE
/// clause, use `escape_literal()` instead.
fn pg_quote(&self) -> String {
format!("\"{}\"", self.replace('"', "\"\""))
}
/// This helper is intended to be used for dollar-escaping strings for usage
/// inside PL/pgSQL procedures. In addition to dollar-escaping the string,
/// it also returns a tag that is intended to be used inside the outer
/// PL/pgSQL procedure. If you do not need an outer tag, just discard it.
/// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`,
/// <https://github.com/postgres/postgres/blob/8b49392b270b4ac0b9f5c210e2a503546841e832/src/backend/utils/adt/ruleutils.c#L2924>
fn pg_quote_dollar(&self) -> (String, String) {
let mut tag: String = "x".to_string();
let mut outer_tag = "xx".to_string();
// Find the first suitable tag that is not present in the string.
// Postgres' max role/DB name length is 63 bytes, so even in the
// worst case it won't take long.
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
tag += "x";
outer_tag = tag.clone() + "x";
}
let escaped = format!("${tag}${self}${tag}$");
(escaped, outer_tag)
let result = format!("\"{}\"", self.replace('"', "\"\""));
result
}
}
@@ -253,13 +226,10 @@ pub async fn get_existing_dbs_async(
// invalid state. See:
// https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
let rowstream = client
// We use a subquery instead of a fancy `datdba::regrole::text AS owner`,
// because the latter automatically wraps the result in double quotes,
// if the role name contains special characters.
.query_raw::<str, &String, &[String; 0]>(
"SELECT
datname AS name,
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
datdba::regrole::text AS owner,
NOT datallowconn AS restrict_conn,
datconnlimit = - 2 AS invalid
FROM
@@ -408,7 +378,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
/// Update pgbouncer.ini with provided options
fn update_pgbouncer_ini(
pgbouncer_config: IndexMap<String, String>,
pgbouncer_config: HashMap<String, String>,
pgbouncer_ini_path: &str,
) -> Result<()> {
let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
@@ -429,10 +399,7 @@ fn update_pgbouncer_ini(
/// Tune pgbouncer.
/// 1. Apply new config using pgbouncer admin console
/// 2. Add new values to pgbouncer.ini to preserve them after restart
pub async fn tune_pgbouncer(
mut pgbouncer_config: IndexMap<String, String>,
tls_config: Option<TlsConfig>,
) -> Result<()> {
pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
// for VMs use pgbouncer specific way to connect to
// pgbouncer admin console without password
@@ -478,21 +445,19 @@ pub async fn tune_pgbouncer(
}
};
if let Some(tls_config) = tls_config {
// pgbouncer starts in a half-ok state if it cannot find these files.
// It will default to client_tls_sslmode=deny, which causes proxy to error.
// There is a small window at startup where these files don't yet exist in the VM.
// Best to wait until it exists.
loop {
if let Ok(true) = tokio::fs::try_exists(&tls_config.key_path).await {
break;
}
tokio::time::sleep(Duration::from_millis(500)).await
}
// Apply new config
for (option_name, value) in pgbouncer_config.iter() {
let query = format!("SET {}={}", option_name, value);
// keep this log line for debugging purposes
info!("Applying pgbouncer setting change: {}", query);
pgbouncer_config.insert("client_tls_cert_file".to_string(), tls_config.cert_path);
pgbouncer_config.insert("client_tls_key_file".to_string(), tls_config.key_path);
pgbouncer_config.insert("client_tls_sslmode".to_string(), "allow".to_string());
if let Err(err) = client.simple_query(&query).await {
// Don't fail on error, just print it into log
error!(
"Failed to apply pgbouncer setting change: {}, {}",
query, err
);
};
}
// save values to pgbouncer.ini
@@ -508,13 +473,6 @@ pub async fn tune_pgbouncer(
};
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
info!("Applying pgbouncer setting change");
if let Err(err) = client.simple_query("RELOAD").await {
// Don't fail on error, just print it into log
error!("Failed to apply pgbouncer setting change, {err}",);
};
Ok(())
}

View File

@@ -1,14 +1,8 @@
use std::fs;
use std::io::ErrorKind;
use std::path::Path;
use std::process::Command;
use std::time::Duration;
use std::{fs::OpenOptions, io::Write};
use anyhow::{Context, Result, anyhow};
use tracing::{error, info, instrument, warn};
const POSTGRES_LOGS_CONF_PATH: &str = "/etc/rsyslog.d/postgres_logs.conf";
use anyhow::{Context, Result};
use tracing::info;
fn get_rsyslog_pid() -> Option<String> {
let output = Command::new("pgrep")
@@ -27,34 +21,40 @@ fn get_rsyslog_pid() -> Option<String> {
}
}
// Restart rsyslogd to apply the new configuration.
// This is necessary, because there is no other way to reload the rsyslog configuration.
//
// Rsyslogd shouldn't lose any messages, because of the restart,
// because it tracks the last read position in the log files
// and will continue reading from that position.
// TODO: test it properly
//
fn restart_rsyslog() -> Result<()> {
let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
info!("rsyslogd is running with pid: {}, restart it", old_pid);
// Start rsyslogd with the specified configuration file
// If it is already running, do nothing.
fn start_rsyslog(rsyslog_conf_path: &str) -> Result<()> {
let pid = get_rsyslog_pid();
if let Some(pid) = pid {
info!("rsyslogd is already running with pid: {}", pid);
return Ok(());
}
// kill it to restart
let _ = Command::new("pkill")
.arg("rsyslogd")
let _ = Command::new("/usr/sbin/rsyslogd")
.arg("-f")
.arg(rsyslog_conf_path)
.arg("-i")
.arg("/var/run/rsyslogd/rsyslogd.pid")
.output()
.context("Failed to stop rsyslogd")?;
.context("Failed to start rsyslogd")?;
// Check that rsyslogd is running
if let Some(pid) = get_rsyslog_pid() {
info!("rsyslogd started successfully with pid: {}", pid);
} else {
return Err(anyhow::anyhow!("Failed to start rsyslogd"));
}
Ok(())
}
pub fn configure_audit_rsyslog(
log_directory: String,
pub fn configure_and_start_rsyslog(
log_directory: &str,
tag: &str,
remote_endpoint: &str,
) -> Result<()> {
let config_content: String = format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
include_str!("config_template/compute_rsyslog_template.conf"),
log_directory = log_directory,
tag = tag,
remote_endpoint = remote_endpoint
@@ -62,7 +62,7 @@ pub fn configure_audit_rsyslog(
info!("rsyslog config_content: {}", config_content);
let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf";
let rsyslog_conf_path = "/etc/compute_rsyslog.conf";
let mut file = OpenOptions::new()
.create(true)
.write(true)
@@ -71,206 +71,10 @@ pub fn configure_audit_rsyslog(
file.write_all(config_content.as_bytes())?;
info!(
"rsyslog configuration file {} added successfully. Starting rsyslogd",
rsyslog_conf_path
);
info!("rsyslog configuration added successfully. Starting rsyslogd");
// start the service, using the configuration
restart_rsyslog()?;
start_rsyslog(rsyslog_conf_path)?;
Ok(())
}
/// Configuration for enabling Postgres logs forwarding from rsyslogd
pub struct PostgresLogsRsyslogConfig<'a> {
pub host: Option<&'a str>,
}
impl<'a> PostgresLogsRsyslogConfig<'a> {
pub fn new(host: Option<&'a str>) -> Self {
Self { host }
}
pub fn build(&self) -> Result<String> {
match self.host {
Some(host) => {
if let Some((target, port)) = host.split_once(":") {
Ok(format!(
include_str!(
"config_template/compute_rsyslog_postgres_export_template.conf"
),
logs_export_target = target,
logs_export_port = port,
))
} else {
Err(anyhow!("Invalid host format for Postgres logs export"))
}
}
None => Ok("".to_string()),
}
}
fn current_config() -> Result<String> {
let config_content = match std::fs::read_to_string(POSTGRES_LOGS_CONF_PATH) {
Ok(c) => c,
Err(err) if err.kind() == ErrorKind::NotFound => String::new(),
Err(err) => return Err(err.into()),
};
Ok(config_content)
}
/// Returns the default host for otel collector that receives Postgres logs
pub fn default_host(project_id: &str) -> String {
format!(
"config-{}-collector.neon-telemetry.svc.cluster.local:10514",
project_id
)
}
}
pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> {
let new_config = conf.build()?;
let current_config = PostgresLogsRsyslogConfig::current_config()?;
if new_config == current_config {
info!("postgres logs rsyslog configuration is up-to-date");
return Ok(());
}
// When new config is empty we can simply remove the configuration file.
if new_config.is_empty() {
info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH);
match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) {
Ok(_) => {}
Err(err) if err.kind() == ErrorKind::NotFound => {}
Err(err) => return Err(err.into()),
}
restart_rsyslog()?;
return Ok(());
}
info!(
"configuring rsyslog for postgres logs export to: {:?}",
conf.host
);
let mut file = OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(POSTGRES_LOGS_CONF_PATH)?;
file.write_all(new_config.as_bytes())?;
info!(
"rsyslog configuration file {} added successfully. Starting rsyslogd",
POSTGRES_LOGS_CONF_PATH
);
restart_rsyslog()?;
Ok(())
}
#[instrument(skip_all)]
async fn pgaudit_gc_main_loop(log_directory: String) -> Result<()> {
info!("running pgaudit GC main loop");
loop {
// Check log_directory for old pgaudit logs and delete them.
// New log files are checked every 5 minutes, as set in pgaudit.log_rotation_age
// Find files that were not modified in the last 15 minutes and delete them.
// This should be enough time for rsyslog to process the logs and for us to catch the alerts.
//
// In case of a very high load, we might need to adjust this value and pgaudit.log_rotation_age.
//
// TODO: add some smarter logic to delete the files that are fully streamed according to rsyslog
// imfile-state files, but for now just do a simple GC to avoid filling up the disk.
let _ = Command::new("find")
.arg(&log_directory)
.arg("-name")
.arg("audit*.log")
.arg("-mmin")
.arg("+15")
.arg("-delete")
.output()?;
// also collect the metric for the size of the log directory
async fn get_log_files_size(path: &Path) -> Result<u64> {
let mut total_size = 0;
for entry in fs::read_dir(path)? {
let entry = entry?;
let entry_path = entry.path();
if entry_path.is_file() && entry_path.to_string_lossy().ends_with("log") {
total_size += entry.metadata()?.len();
}
}
Ok(total_size)
}
let log_directory_size = get_log_files_size(Path::new(&log_directory))
.await
.unwrap_or_else(|e| {
warn!("Failed to get log directory size: {}", e);
0
});
crate::metrics::AUDIT_LOG_DIR_SIZE.set(log_directory_size as f64);
tokio::time::sleep(Duration::from_secs(60)).await;
}
}
// launch pgaudit GC thread to clean up the old pgaudit logs stored in the log_directory
pub fn launch_pgaudit_gc(log_directory: String) {
tokio::spawn(async move {
if let Err(e) = pgaudit_gc_main_loop(log_directory).await {
error!("pgaudit GC main loop failed: {}", e);
}
});
}
#[cfg(test)]
mod tests {
use crate::rsyslog::PostgresLogsRsyslogConfig;
#[test]
fn test_postgres_logs_config() {
{
// Verify empty config
let conf = PostgresLogsRsyslogConfig::new(None);
let res = conf.build();
assert!(res.is_ok());
let conf_str = res.unwrap();
assert_eq!(&conf_str, "");
}
{
// Verify config
let conf = PostgresLogsRsyslogConfig::new(Some("collector.cvc.local:514"));
let res = conf.build();
assert!(res.is_ok());
let conf_str = res.unwrap();
assert!(conf_str.contains("omfwd"));
assert!(conf_str.contains(r#"target="collector.cvc.local""#));
assert!(conf_str.contains(r#"port="514""#));
}
{
// Verify invalid config
let conf = PostgresLogsRsyslogConfig::new(Some("invalid"));
let res = conf.build();
assert!(res.is_err());
}
{
// Verify config with default host
let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123");
let conf = PostgresLogsRsyslogConfig::new(Some(&host));
let res = conf.build();
assert!(res.is_ok());
let conf_str = res.unwrap();
assert!(conf_str.contains(r#"shy-breeze-123"#));
assert!(conf_str.contains(r#"port="10514""#));
}
}
}

View File

@@ -8,12 +8,13 @@ use compute_api::responses::{
use compute_api::spec::ComputeSpec;
use reqwest::StatusCode;
use tokio_postgres::Client;
use tracing::{error, info, instrument};
use tracing::{error, info, instrument, warn};
use crate::config;
use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS};
use crate::migration::MigrationRunner;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
// Do control plane request and return response if any. In case of error it
// returns a bool flag indicating whether it makes sense to retry the request
@@ -211,3 +212,122 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
Ok(())
}
/// Connect to the database as superuser and pre-create anon extension
/// if it is present in shared_preload_libraries
#[instrument(skip_all)]
pub async fn handle_extension_anon(
spec: &ComputeSpec,
db_owner: &str,
db_client: &mut Client,
grants_only: bool,
) -> Result<()> {
info!("handle extension anon");
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
if libs.contains("anon") {
if !grants_only {
// check if extension is already initialized using anon.is_initialized()
let query = "SELECT anon.is_initialized()";
match db_client.query(query, &[]).await {
Ok(rows) => {
if !rows.is_empty() {
let is_initialized: bool = rows[0].get(0);
if is_initialized {
info!("anon extension is already initialized");
return Ok(());
}
}
}
Err(e) => {
warn!(
"anon extension is_installed check failed with expected error: {}",
e
);
}
};
// Create anon extension if this compute needs it
// Users cannot create it themselves, because superuser is required.
let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE";
info!("creating anon extension with query: {}", query);
match db_client.query(query, &[]).await {
Ok(_) => {}
Err(e) => {
error!("anon extension creation failed with error: {}", e);
return Ok(());
}
}
// check that extension is installed
query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
let rows = db_client.query(query, &[]).await?;
if rows.is_empty() {
error!("anon extension is not installed");
return Ok(());
}
// Initialize anon extension
// This also requires superuser privileges, so users cannot do it themselves.
query = "SELECT anon.init()";
match db_client.query(query, &[]).await {
Ok(_) => {}
Err(e) => {
error!("anon.init() failed with error: {}", e);
return Ok(());
}
}
}
// check that extension is installed, if not bail early
let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
match db_client.query(query, &[]).await {
Ok(rows) => {
if rows.is_empty() {
error!("anon extension is not installed");
return Ok(());
}
}
Err(e) => {
error!("anon extension check failed with error: {}", e);
return Ok(());
}
};
let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query).await?;
// Grant permissions to db_owner to use anon extension functions
let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query).await?;
// This is needed, because some functions are defined as SECURITY DEFINER.
// In Postgres SECURITY DEFINER functions are executed with the privileges
// of the owner.
// In anon extension this it is needed to access some GUCs, which are only accessible to
// superuser. But we've patched postgres to allow db_owner to access them as well.
// So we need to change owner of these functions to db_owner.
let query = format!("
SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};'
from pg_proc p
join pg_namespace nsp ON p.pronamespace = nsp.oid
where nsp.nspname = 'anon';", db_owner);
info!("change anon extension functions owner to db owner");
db_client.simple_query(&query).await?;
// affects views as well
let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query).await?;
let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query).await?;
}
}
Ok(())
}

View File

@@ -6,27 +6,26 @@ use std::sync::Arc;
use anyhow::{Context, Result};
use compute_api::responses::ComputeStatus;
use compute_api::spec::{ComputeAudit, ComputeSpec, Database, PgIdent, Role};
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role};
use futures::future::join_all;
use tokio::sync::RwLock;
use tokio_postgres::Client;
use tokio_postgres::error::SqlState;
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
use crate::compute::{ComputeNode, ComputeState};
use crate::compute::{ComputeNode, ComputeState, construct_superuser_query};
use crate::pg_helpers::{
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async,
get_existing_roles_async,
};
use crate::spec_apply::ApplySpecPhase::{
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser,
CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon,
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
RunInEachDatabase,
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension,
CreatePgauditlogtofileExtension, CreateSchemaNeon, CreateSuperUser, DisablePostgresDBPgAudit,
DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, HandleNeonExtension,
HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
};
use crate::spec_apply::PerDatabasePhase::{
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions,
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension,
};
impl ComputeNode {
@@ -188,7 +187,7 @@ impl ComputeNode {
}
for phase in [
CreateNeonSuperuser,
CreateSuperUser,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -238,6 +237,7 @@ impl ComputeNode {
let mut phases = vec![
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
];
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
@@ -457,6 +457,7 @@ impl Debug for DB {
pub enum PerDatabasePhase {
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
/// This is a shared phase, used for both i) dropping dangling LR subscriptions
/// before dropping the DB, and ii) dropping all subscriptions after creating
/// a fresh branch.
@@ -467,7 +468,7 @@ pub enum PerDatabasePhase {
#[derive(Clone, Debug)]
pub enum ApplySpecPhase {
CreateNeonSuperuser,
CreateSuperUser,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -594,10 +595,14 @@ async fn get_operations<'a>(
apply_spec_phase: &'a ApplySpecPhase,
) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
match apply_spec_phase {
ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation {
query: include_str!("sql/create_neon_superuser.sql").to_string(),
comment: None,
}))),
ApplySpecPhase::CreateSuperUser => {
let query = construct_superuser_query(spec);
Ok(Box::new(once(Operation {
query,
comment: None,
})))
}
ApplySpecPhase::DropInvalidDatabases => {
let mut ctx = ctx.write().await;
let databases = &mut ctx.dbs;
@@ -731,15 +736,14 @@ async fn get_operations<'a>(
// We do not check whether the DB exists or not,
// Postgres will take care of it for us
"delete_db" => {
let (db_name, outer_tag) = op.name.pg_quote_dollar();
// In Postgres we can't drop a database if it is a template.
// So we need to unset the template flag first, but it could
// be a retry, so we could've already dropped the database.
// Check that database exists first to make it idempotent.
let unset_template_query: String = format!(
include_str!("sql/unset_template_for_drop_dbs.sql"),
datname = db_name,
outer_tag = outer_tag,
datname_str = escape_literal(&op.name),
datname = &op.name.pg_quote()
);
// Use FORCE to drop database even if there are active connections.
@@ -846,8 +850,6 @@ async fn get_operations<'a>(
comment: None,
},
Operation {
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
query: format!(
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
db.name.pg_quote()
@@ -907,11 +909,9 @@ async fn get_operations<'a>(
PerDatabasePhase::DropLogicalSubscriptions => {
match &db {
DB::UserDB(db) => {
let (db_name, outer_tag) = db.name.pg_quote_dollar();
let drop_subscription_query: String = format!(
include_str!("sql/drop_subscriptions.sql"),
datname_str = db_name,
outer_tag = outer_tag,
datname_str = escape_literal(&db.name),
);
let operations = vec![Operation {
@@ -950,7 +950,6 @@ async fn get_operations<'a>(
DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
DB::UserDB(db) => db.owner.pg_quote(),
};
let (escaped_role, outer_tag) = op.name.pg_quote_dollar();
Some(vec![
// This will reassign all dependent objects to the db owner
@@ -965,9 +964,7 @@ async fn get_operations<'a>(
Operation {
query: format!(
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
role_name = escaped_role,
outer_tag = outer_tag,
role_name = quoted,
),
comment: None,
},
@@ -992,14 +989,12 @@ async fn get_operations<'a>(
DB::SystemDB => return Ok(Box::new(empty())),
DB::UserDB(db) => db,
};
let (db_owner, outer_tag) = db.owner.pg_quote_dollar();
let operations = vec![
Operation {
query: format!(
include_str!("sql/set_public_schema_owner.sql"),
db_owner = db_owner,
outer_tag = outer_tag,
db_owner = db.owner.pg_quote()
),
comment: None,
},
@@ -1010,6 +1005,98 @@ async fn get_operations<'a>(
]
.into_iter();
Ok(Box::new(operations))
}
// TODO: remove this completely https://github.com/neondatabase/cloud/issues/22663
PerDatabasePhase::HandleAnonExtension => {
// Only install Anon into user databases
let db = match &db {
DB::SystemDB => return Ok(Box::new(empty())),
DB::UserDB(db) => db,
};
// Never install Anon when it's not enabled as feature
if !spec.features.contains(&ComputeFeature::AnonExtension) {
return Ok(Box::new(empty()));
}
// Only install Anon when it's added in preload libraries
let opt_libs = spec.cluster.settings.find("shared_preload_libraries");
let libs = match opt_libs {
Some(libs) => libs,
None => return Ok(Box::new(empty())),
};
if !libs.contains("anon") {
return Ok(Box::new(empty()));
}
let db_owner = db.owner.pg_quote();
let operations = vec![
// Create anon extension if this compute needs it
// Users cannot create it themselves, because superuser is required.
Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS anon CASCADE"),
comment: Some(String::from("creating anon extension")),
},
// Initialize anon extension
// This also requires superuser privileges, so users cannot do it themselves.
Operation {
query: String::from("SELECT anon.init()"),
comment: Some(String::from("initializing anon extension data")),
},
Operation {
query: format!("GRANT ALL ON SCHEMA anon TO {}", db_owner),
comment: Some(String::from(
"granting anon extension schema permissions",
)),
},
Operation {
query: format!(
"GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}",
db_owner
),
comment: Some(String::from(
"granting anon extension schema functions permissions",
)),
},
// We need this, because some functions are defined as SECURITY DEFINER.
// In Postgres SECURITY DEFINER functions are executed with the privileges
// of the owner.
// In anon extension this it is needed to access some GUCs, which are only accessible to
// superuser. But we've patched postgres to allow db_owner to access them as well.
// So we need to change owner of these functions to db_owner.
Operation {
query: format!(
include_str!("sql/anon_ext_fn_reassign.sql"),
db_owner = db_owner,
),
comment: Some(String::from(
"change anon extension functions owner to database_owner",
)),
},
Operation {
query: format!(
"GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}",
db_owner,
),
comment: Some(String::from(
"granting anon extension tables permissions",
)),
},
Operation {
query: format!(
"GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}",
db_owner,
),
comment: Some(String::from(
"granting anon extension sequences permissions",
)),
},
]
.into_iter();
Ok(Box::new(operations))
}
}

View File

@@ -1,8 +0,0 @@
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
END IF;
END
$$;

View File

@@ -1,4 +1,4 @@
DO ${outer_tag}$
DO $$
DECLARE
subname TEXT;
BEGIN
@@ -9,4 +9,4 @@ BEGIN
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
${outer_tag}$;
$$;

View File

@@ -1,7 +1,8 @@
DO ${outer_tag}$
SET SESSION ROLE neon_superuser;
DO $$
DECLARE
schema TEXT;
grantor TEXT;
revoke_query TEXT;
BEGIN
FOR schema IN
@@ -14,25 +15,14 @@ BEGIN
-- ii) it's easy to add more schemas to the list if needed.
WHERE schema_name IN ('public')
LOOP
FOR grantor IN EXECUTE
format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
quote_literal({role_name})
)
LOOP
EXECUTE format('SET LOCAL ROLE %I', grantor);
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
schema
);
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
schema,
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
{role_name},
grantor
);
EXECUTE revoke_query;
END LOOP;
EXECUTE revoke_query;
END LOOP;
END;
${outer_tag}$;
$$;
RESET ROLE;

View File

@@ -1,4 +1,5 @@
DO ${outer_tag}$
DO
$$
DECLARE
schema_owner TEXT;
BEGIN
@@ -15,8 +16,8 @@ DO ${outer_tag}$
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
THEN
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
ALTER SCHEMA public OWNER TO {db_owner};
END IF;
END IF;
END
${outer_tag}$;
$$;

View File

@@ -1,12 +1,12 @@
DO ${outer_tag}$
DO $$
BEGIN
IF EXISTS(
SELECT 1
FROM pg_catalog.pg_database
WHERE datname = {datname}
WHERE datname = {datname_str}
)
THEN
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
ALTER DATABASE {datname} is_template false;
END IF;
END
${outer_tag}$;
$$;

View File

@@ -1,118 +0,0 @@
use std::{io::Write, os::unix::fs::OpenOptionsExt, path::Path, time::Duration};
use anyhow::{Context, Result, bail};
use compute_api::responses::TlsConfig;
use ring::digest;
use spki::ObjectIdentifier;
use spki::der::{Decode, PemReader};
use x509_cert::Certificate;
#[derive(Clone, Copy)]
pub struct CertDigest(digest::Digest);
pub async fn watch_cert_for_changes(cert_path: String) -> tokio::sync::watch::Receiver<CertDigest> {
let mut digest = compute_digest(&cert_path).await;
let (tx, rx) = tokio::sync::watch::channel(digest);
tokio::spawn(async move {
while !tx.is_closed() {
let new_digest = compute_digest(&cert_path).await;
if digest.0.as_ref() != new_digest.0.as_ref() {
digest = new_digest;
_ = tx.send(digest);
}
tokio::time::sleep(Duration::from_secs(60)).await
}
});
rx
}
async fn compute_digest(cert_path: &str) -> CertDigest {
loop {
match try_compute_digest(cert_path).await {
Ok(d) => break d,
Err(e) => {
tracing::error!("could not read cert file {e:?}");
tokio::time::sleep(Duration::from_secs(1)).await
}
}
}
}
async fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
let data = tokio::fs::read(cert_path).await?;
// sha256 is extremely collision resistent. can safely assume the digest to be unique
Ok(CertDigest(digest::digest(&digest::SHA256, &data)))
}
pub const SERVER_CRT: &str = "server.crt";
pub const SERVER_KEY: &str = "server.key";
pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) {
loop {
match try_update_key_path_blocking(pg_data, tls_config) {
Ok(()) => break,
Err(e) => {
tracing::error!("could not create key file {e:?}");
std::thread::sleep(Duration::from_secs(1))
}
}
}
}
// Postgres requires the keypath be "secure". This means
// 1. Owned by the postgres user.
// 2. Have permission 600.
fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Result<()> {
let key = std::fs::read_to_string(&tls_config.key_path)?;
let crt = std::fs::read_to_string(&tls_config.cert_path)?;
// to mitigate a race condition during renewal.
verify_key_cert(&key, &crt)?;
let mut key_file = std::fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.mode(0o600)
.open(pg_data.join(SERVER_KEY))?;
let mut crt_file = std::fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.mode(0o600)
.open(pg_data.join(SERVER_CRT))?;
key_file.write_all(key.as_bytes())?;
crt_file.write_all(crt.as_bytes())?;
Ok(())
}
fn verify_key_cert(key: &str, cert: &str) -> Result<()> {
const ECDSA_WITH_SHA256: ObjectIdentifier = ObjectIdentifier::new_unwrap("1.2.840.10045.4.3.2");
let cert = Certificate::decode(&mut PemReader::new(cert.as_bytes()).context("pem reader")?)
.context("decode cert")?;
match cert.signature_algorithm.oid {
ECDSA_WITH_SHA256 => {
let key = p256::SecretKey::from_sec1_pem(key).context("parse key")?;
let a = key.public_key().to_sec1_bytes();
let b = cert
.tbs_certificate
.subject_public_key_info
.subject_public_key
.raw_bytes();
if *a != *b {
bail!("private key file does not match certificate")
}
}
_ => bail!("unknown TLS key type"),
}
Ok(())
}

View File

@@ -61,24 +61,6 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
}
#[test]
fn ident_pg_quote_dollar() {
let test_cases = vec![
("name", ("$x$name$x$", "xx")),
("name$", ("$x$name$$x$", "xx")),
("name$$", ("$x$name$$$x$", "xx")),
("name$$$", ("$x$name$$$$x$", "xx")),
("name$$$$", ("$x$name$$$$$x$", "xx")),
("name$x$", ("$xx$name$x$$xx$", "xxx")),
];
for (input, expected) in test_cases {
let (escaped, tag) = PgIdent::from(input).pg_quote_dollar();
assert_eq!(escaped, expected.0);
assert_eq!(tag, expected.1);
}
}
#[test]
fn generic_options_search() {
let generic_options: GenericOptions = Some(vec![

View File

@@ -36,9 +36,7 @@ use pageserver_api::config::{
use pageserver_api::controller_api::{
NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest,
};
use pageserver_api::models::{
ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo};
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
@@ -965,7 +963,6 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
id: pageserver_id,
listen_pg_addr: format!("127.0.0.1:{pg_port}"),
listen_http_addr: format!("127.0.0.1:{http_port}"),
listen_https_addr: None,
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
other: Default::default(),
@@ -979,8 +976,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
neon_distrib_dir: None,
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
storage_controller: None,
control_plane_hooks_api: None,
generate_local_ssl_certs: false,
control_plane_compute_hook_api: None,
}
};
@@ -1131,16 +1127,12 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
let tenant_id = get_tenant_id(args.tenant_id, env)?;
let tenant_conf: HashMap<_, _> =
args.config.iter().flat_map(|c| c.split_once(':')).collect();
let config = PageServerNode::parse_config(tenant_conf)?;
let req = TenantConfigRequest { tenant_id, config };
let storage_controller = StorageController::from_env(env);
storage_controller
.set_tenant_config(&req)
pageserver
.tenant_config(tenant_id, tenant_conf)
.await
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
println!("tenant {tenant_id} successfully configured via storcon");
println!("tenant {tenant_id} successfully configured on the pageserver");
}
}
Ok(())

View File

@@ -72,19 +72,15 @@ pub struct LocalEnv {
// be propagated into each pageserver's configuration.
pub control_plane_api: Url,
// Control plane upcall APIs for storage controller. If set, this will be propagated into the
// Control plane upcall API for storage controller. If set, this will be propagated into the
// storage controller's configuration.
pub control_plane_hooks_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
/// Flag to generate SSL certificates for components that need it.
/// Also generates root CA certificate that is used to sign all other certificates.
pub generate_local_ssl_certs: bool,
}
/// On-disk state stored in `.neon/config`.
@@ -104,13 +100,8 @@ pub struct OnDiskConfig {
pub pageservers: Vec<PageServerConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
// Note: skip serializing because in compat tests old storage controller fails
// to load new config file. May be removed after this field is in release branch.
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub generate_local_ssl_certs: bool,
}
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
@@ -137,8 +128,7 @@ pub struct NeonLocalInitConf {
pub pageservers: Vec<NeonLocalInitPageserverConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub generate_local_ssl_certs: bool,
pub control_plane_compute_hook_api: Option<Option<Url>>,
}
/// Broker config for cluster internal communication.
@@ -149,7 +139,7 @@ pub struct NeonBroker {
pub listen_addr: SocketAddr,
}
/// A part of storage controller's config the neon_local knows about.
/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct NeonStorageControllerConf {
@@ -175,12 +165,6 @@ pub struct NeonStorageControllerConf {
#[serde(with = "humantime_serde")]
pub long_reconcile_threshold: Option<Duration>,
pub use_https_pageserver_api: bool,
pub timelines_onto_safekeepers: bool,
pub use_https_safekeeper_api: bool,
}
impl NeonStorageControllerConf {
@@ -204,9 +188,6 @@ impl Default for NeonStorageControllerConf {
max_secondary_lag_bytes: None,
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
long_reconcile_threshold: None,
use_https_pageserver_api: false,
timelines_onto_safekeepers: false,
use_https_safekeeper_api: false,
}
}
}
@@ -236,7 +217,6 @@ pub struct PageServerConf {
pub id: NodeId,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub no_sync: bool,
@@ -248,7 +228,6 @@ impl Default for PageServerConf {
id: NodeId(0),
listen_pg_addr: String::new(),
listen_http_addr: String::new(),
listen_https_addr: None,
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
no_sync: false,
@@ -264,7 +243,6 @@ pub struct NeonLocalInitPageserverConf {
pub id: NodeId,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
@@ -279,7 +257,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
id,
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -289,7 +266,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
id: *id,
listen_pg_addr: listen_pg_addr.clone(),
listen_http_addr: listen_http_addr.clone(),
listen_https_addr: listen_https_addr.clone(),
pg_auth_type: *pg_auth_type,
http_auth_type: *http_auth_type,
no_sync: *no_sync,
@@ -304,7 +280,6 @@ pub struct SafekeeperConf {
pub pg_port: u16,
pub pg_tenant_only_port: Option<u16>,
pub http_port: u16,
pub https_port: Option<u16>,
pub sync: bool,
pub remote_storage: Option<String>,
pub backup_threads: Option<u32>,
@@ -319,7 +294,6 @@ impl Default for SafekeeperConf {
pg_port: 0,
pg_tenant_only_port: None,
http_port: 0,
https_port: None,
sync: true,
remote_storage: None,
backup_threads: None,
@@ -436,41 +410,6 @@ impl LocalEnv {
}
}
pub fn ssl_ca_cert_path(&self) -> Option<PathBuf> {
if self.generate_local_ssl_certs {
Some(self.base_data_dir.join("rootCA.crt"))
} else {
None
}
}
pub fn ssl_ca_key_path(&self) -> Option<PathBuf> {
if self.generate_local_ssl_certs {
Some(self.base_data_dir.join("rootCA.key"))
} else {
None
}
}
pub fn generate_ssl_ca_cert(&self) -> anyhow::Result<()> {
let cert_path = self.ssl_ca_cert_path().unwrap();
let key_path = self.ssl_ca_key_path().unwrap();
if !fs::exists(cert_path.as_path())? {
generate_ssl_ca_cert(cert_path.as_path(), key_path.as_path())?;
}
Ok(())
}
pub fn generate_ssl_cert(&self, cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
self.generate_ssl_ca_cert()?;
generate_ssl_cert(
cert_path,
key_path,
self.ssl_ca_cert_path().unwrap().as_path(),
self.ssl_ca_key_path().unwrap().as_path(),
)
}
/// Inspect the base data directory and extract the instance id and instance directory path
/// for all storage controller instances
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
@@ -578,10 +517,8 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api,
control_plane_hooks_api,
control_plane_compute_hook_api: _,
control_plane_compute_hook_api,
branch_name_mappings,
generate_local_ssl_certs,
} = on_disk_config;
LocalEnv {
base_data_dir: repopath.to_owned(),
@@ -594,9 +531,8 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api: control_plane_api.unwrap(),
control_plane_hooks_api,
control_plane_compute_hook_api,
branch_name_mappings,
generate_local_ssl_certs,
}
};
@@ -632,7 +568,6 @@ impl LocalEnv {
struct PageserverConfigTomlSubset {
listen_pg_addr: String,
listen_http_addr: String,
listen_https_addr: Option<String>,
pg_auth_type: AuthType,
http_auth_type: AuthType,
#[serde(default)]
@@ -657,7 +592,6 @@ impl LocalEnv {
let PageserverConfigTomlSubset {
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -675,7 +609,6 @@ impl LocalEnv {
},
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -701,10 +634,8 @@ impl LocalEnv {
pageservers: vec![], // it's skip_serializing anyway
safekeepers: self.safekeepers.clone(),
control_plane_api: Some(self.control_plane_api.clone()),
control_plane_hooks_api: self.control_plane_hooks_api.clone(),
control_plane_compute_hook_api: None,
control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
branch_name_mappings: self.branch_name_mappings.clone(),
generate_local_ssl_certs: self.generate_local_ssl_certs,
},
)
}
@@ -786,8 +717,7 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api,
generate_local_ssl_certs,
control_plane_hooks_api,
control_plane_compute_hook_api,
} = conf;
// Find postgres binaries.
@@ -834,24 +764,16 @@ impl LocalEnv {
pageservers: pageservers.iter().map(Into::into).collect(),
safekeepers,
control_plane_api: control_plane_api.unwrap(),
control_plane_hooks_api,
control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
branch_name_mappings: Default::default(),
generate_local_ssl_certs,
};
if generate_local_ssl_certs {
env.generate_ssl_ca_cert()?;
}
// create endpoints dir
fs::create_dir_all(env.endpoints_path())?;
// create safekeeper dirs
for safekeeper in &env.safekeepers {
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
SafekeeperNode::from_env(&env, safekeeper)
.initialize()
.context("safekeeper init failed")?;
}
// initialize pageserver state
@@ -929,80 +851,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
}
Ok(())
}
fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
// openssl req -x509 -newkey rsa:2048 -nodes -subj "/CN=Neon Local CA" -days 36500 \
// -out rootCA.crt -keyout rootCA.key
let keygen_output = Command::new("openssl")
.args([
"req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
])
.args(["-subj", "/CN=Neon Local CA"])
.args(["-out", cert_path.to_str().unwrap()])
.args(["-keyout", key_path.to_str().unwrap()])
.output()
.context("failed to generate CA certificate")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
Ok(())
}
fn generate_ssl_cert(
cert_path: &Path,
key_path: &Path,
ca_cert_path: &Path,
ca_key_path: &Path,
) -> anyhow::Result<()> {
// Generate Certificate Signing Request (CSR).
let mut csr_path = cert_path.to_path_buf();
csr_path.set_extension(".csr");
// openssl req -new -nodes -newkey rsa:2048 -keyout server.key -out server.csr \
// -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
let keygen_output = Command::new("openssl")
.args(["req", "-new", "-nodes"])
.args(["-newkey", "rsa:2048"])
.args(["-subj", "/CN=localhost"])
.args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
.args(["-keyout", key_path.to_str().unwrap()])
.args(["-out", csr_path.to_str().unwrap()])
.output()
.context("failed to generate CSR")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
// Sign CSR with CA key.
//
// openssl x509 -req -in server.csr -CA rootCA.crt -CAkey rootCA.key -CAcreateserial \
// -out server.crt -days 36500 -copy_extensions copyall
let keygen_output = Command::new("openssl")
.args(["x509", "-req"])
.args(["-in", csr_path.to_str().unwrap()])
.args(["-CA", ca_cert_path.to_str().unwrap()])
.args(["-CAkey", ca_key_path.to_str().unwrap()])
.arg("-CAcreateserial")
.args(["-out", cert_path.to_str().unwrap()])
.args(["-days", "36500"])
.args(["-copy_extensions", "copyall"])
.output()
.context("failed to sign CSR")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
// Remove CSR file as it's not needed anymore.
fs::remove_file(csr_path)?;
Ok(())
}

View File

@@ -21,7 +21,6 @@ use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
use postgres_connection::{PgConnectionConfig, parse_host_port};
use reqwest::Certificate;
use utils::auth::{Claims, Scope};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -50,29 +49,12 @@ impl PageServerNode {
let (host, port) =
parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let port = port.unwrap_or(5432);
let ssl_ca_cert = env.ssl_ca_cert_path().map(|ssl_ca_file| {
let buf = std::fs::read(ssl_ca_file).expect("SSL root CA file should exist");
Certificate::from_pem(&buf).expect("CA certificate should be valid")
});
let endpoint = if env.storage_controller.use_https_pageserver_api {
format!(
"https://{}",
conf.listen_https_addr.as_ref().expect(
"listen https address should be specified if use_https_pageserver_api is on"
)
)
} else {
format!("http://{}", conf.listen_http_addr)
};
Self {
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
conf: conf.clone(),
env: env.clone(),
http_client: mgmt_api::Client::new(
endpoint,
format!("http://{}", conf.listen_http_addr),
{
match conf.http_auth_type {
AuthType::Trust => None,
@@ -83,9 +65,7 @@ impl PageServerNode {
}
}
.as_deref(),
ssl_ca_cert,
)
.expect("Client constructs with no errors"),
),
}
}
@@ -240,13 +220,6 @@ impl PageServerNode {
.context("write identity toml")?;
drop(identity_toml);
if self.env.generate_local_ssl_certs {
self.env.generate_ssl_cert(
datadir.join("server.crt").as_path(),
datadir.join("server.key").as_path(),
)?;
}
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
// Write metadata file, used by pageserver on startup to register itself with
@@ -257,15 +230,6 @@ impl PageServerNode {
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
let http_port = http_port.unwrap_or(9898);
let https_port = match self.conf.listen_https_addr.as_ref() {
Some(https_addr) => {
let (_https_host, https_port) =
parse_host_port(https_addr).expect("Unable to parse listen_https_addr");
Some(https_port.unwrap_or(9899))
}
None => None,
};
// Intentionally hand-craft JSON: this acts as an implicit format compat test
// in case the pageserver-side structure is edited, and reflects the real life
// situation: the metadata is written by some other script.
@@ -276,7 +240,6 @@ impl PageServerNode {
postgres_port: self.pg_connection_config.port(),
http_host: "localhost".to_string(),
http_port,
https_port,
other: HashMap::from([(
"availability_zone_id".to_string(),
serde_json::json!(az_id),

View File

@@ -111,18 +111,6 @@ impl SafekeeperNode {
.expect("non-Unicode path")
}
/// Initializes a safekeeper node by creating all necessary files,
/// e.g. SSL certificates.
pub fn initialize(&self) -> anyhow::Result<()> {
if self.env.generate_local_ssl_certs {
self.env.generate_ssl_cert(
&self.datadir_path().join("server.crt"),
&self.datadir_path().join("server.key"),
)?;
}
Ok(())
}
pub async fn start(
&self,
extra_opts: &[String],
@@ -208,16 +196,6 @@ impl SafekeeperNode {
]);
}
if let Some(https_port) = self.conf.https_port {
args.extend([
"--listen-https".to_owned(),
format!("{}:{}", self.listen_addr, https_port),
]);
}
if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() {
args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
}
args.extend_from_slice(extra_opts);
background_process::start_process(

View File

@@ -12,10 +12,13 @@ use hyper0::Uri;
use nix::unistd::Pid;
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
TenantCreateResponse, TenantLocateResponse,
TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest,
TenantShardMigrateResponse,
};
use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
use pageserver_api::shard::TenantShardId;
use pageserver_api::models::{
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use reqwest::Method;
@@ -534,18 +537,6 @@ impl StorageController {
args.push("--start-as-candidate".to_string());
}
if self.config.use_https_pageserver_api {
args.push("--use-https-pageserver-api".to_string());
}
if self.config.use_https_safekeeper_api {
args.push("--use-https-safekeeper-api".to_string());
}
if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() {
args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
}
if let Some(private_key) = &self.private_key {
let claims = Claims::new(None, Scope::PageServerApi);
let jwt_token =
@@ -562,8 +553,10 @@ impl StorageController {
args.push(format!("--public-key=\"{public_key}\""));
}
if let Some(control_plane_hooks_api) = &self.env.control_plane_hooks_api {
args.push(format!("--control-plane-url={control_plane_hooks_api}"));
if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api {
args.push(format!(
"--compute-hook-url={control_plane_compute_hook_api}"
));
}
if let Some(split_threshold) = self.config.split_threshold.as_ref() {
@@ -586,10 +579,6 @@ impl StorageController {
self.env.base_data_dir.display()
));
if self.config.timelines_onto_safekeepers {
args.push("--timelines-onto-safekeepers".to_string());
}
background_process::start_process(
COMMAND,
&instance_dir,
@@ -836,6 +825,41 @@ impl StorageController {
.await
}
#[instrument(skip(self))]
pub async fn tenant_migrate(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
) -> anyhow::Result<TenantShardMigrateResponse> {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest {
node_id,
migration_config: None,
}),
)
.await
}
#[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
pub async fn tenant_split(
&self,
tenant_id: TenantId,
new_shard_count: u8,
new_stripe_size: Option<ShardStripeSize>,
) -> anyhow::Result<TenantShardSplitResponse> {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/shard_split"),
Some(TenantShardSplitRequest {
new_shard_count,
new_stripe_size,
}),
)
.await
}
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
@@ -880,9 +904,4 @@ impl StorageController {
)
.await
}
pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> anyhow::Result<()> {
self.dispatch(Method::PUT, "v1/tenant/config".to_string(), Some(req))
.await
}
}

View File

@@ -1,21 +1,20 @@
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
use clap::{Parser, Subcommand};
use futures::StreamExt;
use pageserver_api::controller_api::{
AvailabilityZone, MigrationConfig, NodeAvailabilityWrapper, NodeConfigureRequest,
NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse,
PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
TenantShardMigrateRequest, TenantShardMigrateResponse,
AvailabilityZone, NodeAvailabilityWrapper, NodeConfigureRequest, NodeDescribeResponse,
NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse, PlacementPolicy,
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest,
TenantDescribeResponse, TenantPolicyRequest, TenantShardMigrateRequest,
TenantShardMigrateResponse,
};
use pageserver_api::models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest,
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, ShardParameters,
TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest,
TenantShardSplitResponse,
};
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
@@ -113,15 +112,6 @@ enum Command {
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
#[arg(long, default_value_t = true, action = clap::ArgAction::Set)]
prewarm: bool,
#[arg(long, default_value_t = false, action = clap::ArgAction::Set)]
override_scheduler: bool,
},
/// Watch the location of a tenant shard evolve, e.g. while expecting it to migrate
TenantShardWatch {
#[arg(long)]
tenant_shard_id: TenantShardId,
},
/// Migrate the secondary location for a tenant shard to a specific pageserver.
TenantShardMigrateSecondary {
@@ -158,6 +148,12 @@ enum Command {
#[arg(long)]
tenant_id: TenantId,
},
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
/// mode so that it can warm up content on a pageserver.
TenantWarmup {
#[arg(long)]
tenant_id: TenantId,
},
TenantSetPreferredAz {
#[arg(long)]
tenant_id: TenantId,
@@ -273,10 +269,6 @@ struct Cli {
/// a token with both scopes to use with this tool.
jwt: Option<String>,
#[arg(long)]
/// Trusted root CA certificate to use in https APIs.
ssl_ca_file: Option<PathBuf>,
#[command(subcommand)]
command: Command,
}
@@ -387,17 +379,9 @@ async fn main() -> anyhow::Result<()> {
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
let ssl_ca_cert = match &cli.ssl_ca_file {
Some(ssl_ca_file) => {
let buf = tokio::fs::read(ssl_ca_file).await?;
Some(reqwest::Certificate::from_pem(&buf)?)
}
None => None,
};
let mut trimmed = cli.api.to_string();
trimmed.pop();
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref(), ssl_ca_cert)?;
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
match cli.command {
Command::NodeRegister {
@@ -635,43 +619,19 @@ async fn main() -> anyhow::Result<()> {
Command::TenantShardMigrate {
tenant_shard_id,
node,
prewarm,
override_scheduler,
} => {
let migration_config = MigrationConfig {
prewarm,
override_scheduler,
..Default::default()
};
let req = TenantShardMigrateRequest {
node_id: node,
origin_node_id: None,
migration_config,
migration_config: None,
};
match storcon_client
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(req),
)
.await
{
Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) => {
anyhow::bail!(
"Migration to {node} rejected, may require `--force` ({}) ",
msg
);
}
Err(e) => return Err(e.into()),
Ok(_) => {}
}
watch_tenant_shard(storcon_client, tenant_shard_id, Some(node)).await?;
}
Command::TenantShardWatch { tenant_shard_id } => {
watch_tenant_shard(storcon_client, tenant_shard_id, None).await?;
.await?;
}
Command::TenantShardMigrateSecondary {
tenant_shard_id,
@@ -679,8 +639,7 @@ async fn main() -> anyhow::Result<()> {
} => {
let req = TenantShardMigrateRequest {
node_id: node,
origin_node_id: None,
migration_config: MigrationConfig::default(),
migration_config: None,
};
storcon_client
@@ -865,6 +824,94 @@ async fn main() -> anyhow::Result<()> {
)
.await?;
}
Command::TenantWarmup { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await;
match describe_response {
Ok(describe) => {
if matches!(describe.policy, PlacementPolicy::Secondary) {
// Fine: it's already known to controller in secondary mode: calling
// again to put it into secondary mode won't cause problems.
} else {
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
}
}
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
// Fine: this tenant isn't know to the storage controller yet.
}
Err(e) => {
// Unexpected API error
return Err(e.into());
}
}
vps_client
.location_config(
TenantShardId::unsharded(tenant_id),
pageserver_api::models::LocationConfig {
mode: pageserver_api::models::LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: 0,
shard_count: 0,
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
tenant_conf: TenantConfig::default(),
},
None,
true,
)
.await?;
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let secondary_ps_id = describe_response
.shards
.first()
.unwrap()
.node_secondary
.first()
.unwrap();
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
loop {
let (status, progress) = vps_client
.tenant_secondary_download(
TenantShardId::unsharded(tenant_id),
Some(Duration::from_secs(10)),
)
.await?;
println!(
"Progress: {}/{} layers, {}/{} bytes",
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
match status {
StatusCode::OK => {
println!("Download complete");
break;
}
StatusCode::ACCEPTED => {
// Loop
}
_ => {
anyhow::bail!("Unexpected download status: {status}");
}
}
}
}
Command::TenantDrop { tenant_id, unclean } => {
if !unclean {
anyhow::bail!(
@@ -1058,8 +1105,7 @@ async fn main() -> anyhow::Result<()> {
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest {
node_id: mv.to,
origin_node_id: Some(mv.from),
migration_config: MigrationConfig::default(),
migration_config: None,
}),
)
.await
@@ -1238,68 +1284,3 @@ async fn main() -> anyhow::Result<()> {
Ok(())
}
static WATCH_INTERVAL: Duration = Duration::from_secs(5);
async fn watch_tenant_shard(
storcon_client: Client,
tenant_shard_id: TenantShardId,
until_migrated_to: Option<NodeId>,
) -> anyhow::Result<()> {
if let Some(until_migrated_to) = until_migrated_to {
println!(
"Waiting for tenant shard {} to be migrated to node {}",
tenant_shard_id, until_migrated_to
);
}
loop {
let desc = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{}", tenant_shard_id.tenant_id),
None,
)
.await?;
// Output the current state of the tenant shard
let shard = desc
.shards
.iter()
.find(|s| s.tenant_shard_id == tenant_shard_id)
.ok_or(anyhow::anyhow!("Tenant shard not found"))?;
let summary = format!(
"attached: {} secondary: {} {}",
shard
.node_attached
.map(|n| format!("{}", n))
.unwrap_or("none".to_string()),
shard
.node_secondary
.iter()
.map(|n| n.to_string())
.collect::<Vec<_>>()
.join(","),
if shard.is_reconciling {
"(reconciler active)"
} else {
"(reconciler idle)"
}
);
println!("{}", summary);
// Maybe drop out if we finished migration
if let Some(until_migrated_to) = until_migrated_to {
if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling {
println!(
"Tenant shard {} is now on node {}",
tenant_shard_id, until_migrated_to
);
break;
}
}
tokio::time::sleep(WATCH_INTERVAL).await;
}
Ok(())
}

View File

@@ -27,10 +27,6 @@ yanked = "warn"
id = "RUSTSEC-2023-0071"
reason = "the marvin attack only affects private key decryption, not public key signature verification"
[[advisories.ignore]]
id = "RUSTSEC-2024-0436"
reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact."
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html

View File

@@ -1,7 +1,3 @@
# Neon RFCs
## Overview
This directory contains Request for Comments documents, or RFCs, for
features or concepts that have been proposed. Alternative names:
technical design doc, ERD, one-pager
@@ -63,10 +59,37 @@ RFC lifecycle:
### RFC template
Use template with `YYYY-MM-DD-copy-me.md` as a starting point. Timestamp prefix helps to avoid awkward 'id' collisions.
```sh
cp docs/rfcs/YYYY-MM-DD-copy-me.md docs/rfcs/$(date +"%Y-%m-%d")-<name>.md
```
Note, a lot of the sections are marked as if relevant. They are included into the template as a reminder and to help inspiration.
```
# Name
Created on ..
Implemented on ..
## Summary
## Motivation
## Non Goals (if relevant)
## Impacted components (e.g. pageserver, safekeeper, console, etc)
## Proposed implementation
### Reliability, failure modes and corner cases (if relevant)
### Interaction/Sequence diagram (if relevant)
### Scalability (if relevant)
### Security implications (if relevant)
### Unresolved questions (if relevant)
## Alternative implementation (if relevant)
## Pros/cons of proposed approaches (if relevant)
## Definition of Done (if relevant)
```

View File

@@ -1,30 +0,0 @@
# Name
Created on YYYY-MM-DD
Implemented on _TBD_
## Summary
## Motivation
## Non Goals (if relevant)
## Impacted components (e.g. pageserver, safekeeper, console, etc)
## Proposed implementation
### Reliability, failure modes and corner cases (if relevant)
### Interaction/Sequence diagram (if relevant)
### Scalability (if relevant)
### Security implications (if relevant)
### Unresolved questions (if relevant)
## Alternative implementation (if relevant)
## Pros/cons of proposed approaches (if relevant)
## Definition of Done (if relevant)

View File

@@ -101,25 +101,15 @@ changes such as a pageserver node becoming unavailable, or the tenant's shard co
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.
The hook is configured using the storage controller's `--control-plane-url` CLI option, from which the hook URL is computed.
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
Currently, there is two hooks, each computed by appending the name to the provided control plane URL prefix:
- `notify-attach`, called whenever attachment for pageservers changes
- `notify-safekeepers`, called whenever attachment for safekeepers changes
If the hooks require JWT auth, the token may be provided with `--control-plane-jwt-token`.
The hooks will be invoked with a `PUT` request.
In the Neon cloud service, these hooks are implemented by Neon's internal cloud control plane. In `neon_local` systems,
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hooks. This is not complicated.
### `notify-attach` body
The `notify-attach` request body follows the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
```
struct ComputeHookNotifyRequestShard {
@@ -138,15 +128,15 @@ When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.pageserver_connstring` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
shards identified by `NodeId` must be converted to the address+port of the node.
- if stripe_size is not None, set `neon.shard_stripe_size` to this value
- if stripe_size is not None, set `neon.stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds..
Example body:
### Example notification body
```
{
@@ -158,34 +148,3 @@ Example body:
],
}
```
### `notify-safekeepers` body
The `notify-safekeepers` request body forllows the format of the `SafekeepersNotifyRequest` structure, provided below for convenience.
```
pub struct SafekeeperInfo {
pub id: NodeId,
pub hostname: String,
}
pub struct SafekeepersNotifyRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub generation: u32,
pub safekeepers: Vec<SafekeeperInfo>,
}
```
When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.safekeeper_connstrings` to an array of postgres connection strings to safekeepers according to the `safekeepers` list. The
safekeepers identified by `NodeId` must be converted to the address+port of the respective safekeeper.
The hostname is provided for debugging purposes, so we reserve changes to how we pass it.
- set `neon.safekeepers_generation` to the provided `generation` value.
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds..

View File

@@ -7,7 +7,6 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
chrono.workspace = true
indexmap.workspace = true
jsonwebtoken.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -30,9 +30,3 @@ pub struct SetRoleGrantsRequest {
pub privileges: Vec<Privilege>,
pub role: PgIdent,
}
/// Request of the /configure_telemetry API
#[derive(Debug, Deserialize, Serialize)]
pub struct ConfigureTelemetryRequest {
pub logs_export_host: Option<String>,
}

View File

@@ -139,7 +139,6 @@ pub struct ComputeCtlConfig {
/// Set of JSON web keys that the compute can use to authenticate
/// communication from the control plane.
pub jwks: JwkSet,
pub tls: Option<TlsConfig>,
}
impl Default for ComputeCtlConfig {
@@ -148,17 +147,10 @@ impl Default for ComputeCtlConfig {
jwks: JwkSet {
keys: Vec::default(),
},
tls: None,
}
}
}
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct TlsConfig {
pub key_path: String,
pub cert_path: String,
}
/// Response of the `/computes/{compute_id}/spec` control-plane API.
#[derive(Deserialize, Debug)]
pub struct ControlPlaneSpecResponse {

View File

@@ -5,15 +5,12 @@
//! and connect it to the storage nodes.
use std::collections::HashMap;
use indexmap::IndexMap;
use regex::Regex;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use crate::responses::TlsConfig;
/// String type alias representing Postgres identifier and
/// intended to be used for DB / role names.
pub type PgIdent = String;
@@ -128,7 +125,7 @@ pub struct ComputeSpec {
// information about available remote extensions
pub remote_extensions: Option<RemoteExtSpec>,
pub pgbouncer_settings: Option<IndexMap<String, String>>,
pub pgbouncer_settings: Option<HashMap<String, String>>,
// Stripe size for pageserver sharding, in pages
#[serde(default)]
@@ -179,8 +176,8 @@ pub enum ComputeFeature {
/// track short-lived connections as user activity.
ActivityMonitorExperimental,
/// Allow to configure rsyslog for Postgres logs export
PostgresLogsExport,
/// Pre-install and initialize anon extension for every database in the cluster
AnonExtension,
/// This is a special feature flag that is used to represent unknown feature flags.
/// Basically all unknown to enum flags are represented as this one. See unit test
@@ -360,9 +357,6 @@ pub struct LocalProxySpec {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub jwks: Option<Vec<JwksSettings>>,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub tls: Option<TlsConfig>,
}
#[derive(Clone, Debug, Deserialize, Serialize)]

View File

@@ -208,6 +208,7 @@
],
"remote_extensions": {
"library_index": {
"anon": "anon",
"postgis-3": "postgis",
"libpgrouting-3.4": "postgis",
"postgis_raster-3": "postgis",
@@ -216,6 +217,12 @@
"address_standardizer-3": "postgis"
},
"extension_data": {
"anon": {
"archive_path": "5834329303/v15/extensions/anon.tar.zst",
"control_data": {
"anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n"
}
},
"postgis": {
"archive_path": "5834329303/v15/extensions/postgis.tar.zst",
"control_data": {
@@ -231,6 +238,7 @@
}
},
"custom_extensions": [
"anon"
],
"public_extensions": [
"postgis"

View File

@@ -158,6 +158,7 @@ mod reliable_copy_test {
utils::logging::init(
utils::logging::LogFormat::Test,
utils::logging::TracingErrorLayerEnablement::Disabled,
utils::logging::OtelEnablement::Disabled,
utils::logging::Output::Stdout,
)
.expect("logging init failed");

View File

@@ -7,9 +7,7 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
bytes.workspace = true
camino.workspace = true
fail.workspace = true
futures.workspace = true
hyper0.workspace = true
itertools.workspace = true
jemalloc_pprof.workspace = true
@@ -17,14 +15,12 @@ once_cell.workspace = true
pprof.workspace = true
regex.workspace = true
routerify.workspace = true
rustls-pemfile.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_path_to_error.workspace = true
thiserror.workspace = true
tracing.workspace = true
tokio.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
url.workspace = true
uuid.workspace = true

View File

@@ -399,10 +399,12 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
// Return the report in the requested format.
match format {
Format::Pprof => {
let body = report
let mut body = Vec::new();
report
.pprof()
.map_err(|err| ApiError::InternalServerError(err.into()))?
.encode_to_vec();
.write_to_vec(&mut body)
.map_err(|err| ApiError::InternalServerError(err.into()))?;
Response::builder()
.status(200)

View File

@@ -3,11 +3,9 @@ pub mod error;
pub mod failpoints;
pub mod json;
pub mod request;
pub mod server;
pub mod tls_certs;
extern crate hyper0 as hyper;
/// Current fast way to apply simple http routing in various Neon binaries.
/// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
pub use routerify::{RequestServiceBuilder, RouterBuilder, RouterService, ext::RequestExt};
pub use routerify::{RouterBuilder, RouterService, ext::RequestExt};

View File

@@ -1,155 +0,0 @@
use std::{error::Error, sync::Arc};
use futures::StreamExt;
use futures::stream::FuturesUnordered;
use hyper0::Body;
use hyper0::server::conn::Http;
use routerify::{RequestService, RequestServiceBuilder};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
use tokio_util::sync::CancellationToken;
use tracing::{error, info};
use crate::error::ApiError;
/// A simple HTTP server over hyper library.
/// You may want to use it instead of [`hyper0::server::Server`] because:
/// 1. hyper0's Server was removed from hyper v1.
/// It's recommended to replace hyepr0's Server with a manual loop, which is done here.
/// 2. hyper0's Server doesn't support TLS out of the box, and there is no way
/// to support it efficiently with the Accept trait that hyper0's Server uses.
/// That's one of the reasons why it was removed from v1.
/// <https://github.com/hyperium/hyper/blob/115339d3df50f20c8717680aa35f48858e9a6205/docs/ROADMAP.md#higher-level-client-and-server-problems>
pub struct Server {
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
listener: tokio::net::TcpListener,
tls_acceptor: Option<TlsAcceptor>,
}
impl Server {
pub fn new(
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
listener: std::net::TcpListener,
tls_acceptor: Option<TlsAcceptor>,
) -> anyhow::Result<Self> {
// Note: caller of from_std is responsible for setting nonblocking mode.
listener.set_nonblocking(true)?;
let listener = tokio::net::TcpListener::from_std(listener)?;
Ok(Self {
request_service,
listener,
tls_acceptor,
})
}
pub async fn serve(self, cancel: CancellationToken) -> anyhow::Result<()> {
fn suppress_io_error(err: &std::io::Error) -> bool {
use std::io::ErrorKind::*;
matches!(err.kind(), ConnectionReset | ConnectionAborted | BrokenPipe)
}
fn suppress_hyper_error(err: &hyper0::Error) -> bool {
if err.is_incomplete_message() || err.is_closed() || err.is_timeout() {
return true;
}
if let Some(inner) = err.source() {
if let Some(io) = inner.downcast_ref::<std::io::Error>() {
return suppress_io_error(io);
}
}
false
}
let mut connections = FuturesUnordered::new();
loop {
tokio::select! {
stream = self.listener.accept() => {
let (tcp_stream, remote_addr) = match stream {
Ok(stream) => stream,
Err(err) => {
if !suppress_io_error(&err) {
info!("Failed to accept TCP connection: {err:#}");
}
continue;
}
};
let service = self.request_service.build(remote_addr);
let tls_acceptor = self.tls_acceptor.clone();
let cancel = cancel.clone();
connections.push(tokio::spawn(
async move {
match tls_acceptor {
Some(tls_acceptor) => {
// Handle HTTPS connection.
let tls_stream = tokio::select! {
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
_ = cancel.cancelled() => return,
};
let tls_stream = match tls_stream {
Ok(tls_stream) => tls_stream,
Err(err) => {
if !suppress_io_error(&err) {
info!("Failed to accept TLS connection: {err:#}");
}
return;
}
};
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
if !suppress_hyper_error(&err) {
info!("Failed to serve HTTPS connection: {err:#}");
}
}
}
None => {
// Handle HTTP connection.
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
if !suppress_hyper_error(&err) {
info!("Failed to serve HTTP connection: {err:#}");
}
}
}
};
}));
}
Some(conn) = connections.next() => {
if let Err(err) = conn {
error!("Connection panicked: {err:#}");
}
}
_ = cancel.cancelled() => {
// Wait for graceful shutdown of all connections.
while let Some(conn) = connections.next().await {
if let Err(err) = conn {
error!("Connection panicked: {err:#}");
}
}
break;
}
}
}
Ok(())
}
/// Serves HTTP connection with graceful shutdown.
async fn serve_connection<I>(
io: I,
service: RequestService<Body, ApiError>,
cancel: CancellationToken,
) -> Result<(), hyper0::Error>
where
I: AsyncRead + AsyncWrite + Unpin + Send + 'static,
{
let mut conn = Http::new().serve_connection(io, service).with_upgrades();
tokio::select! {
res = &mut conn => res,
_ = cancel.cancelled() => {
Pin::new(&mut conn).graceful_shutdown();
// Note: connection should still be awaited for graceful shutdown to complete.
conn.await
}
}
}
}

View File

@@ -1,21 +0,0 @@
use camino::Utf8Path;
use tokio_rustls::rustls::pki_types::{CertificateDer, PrivateKeyDer};
pub fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
let file = std::fs::File::open(filename)?;
let mut reader = std::io::BufReader::new(file);
Ok(rustls_pemfile::certs(&mut reader).collect::<Result<Vec<_>, _>>()?)
}
pub fn load_private_key(filename: &Utf8Path) -> anyhow::Result<PrivateKeyDer<'static>> {
let file = std::fs::File::open(filename)?;
let mut reader = std::io::BufReader::new(file);
let key = rustls_pemfile::private_key(&mut reader)?;
key.ok_or(anyhow::anyhow!(
"no private key found in {}",
filename.as_str(),
))
}

View File

@@ -34,6 +34,7 @@ postgres_backend.workspace = true
nix = {workspace = true, optional = true}
reqwest.workspace = true
rand.workspace = true
tracing-utils.workspace = true
[dev-dependencies]
bincode.workspace = true

View File

@@ -35,7 +35,6 @@ pub struct NodeMetadata {
pub postgres_port: u16,
pub http_host: String,
pub http_port: u16,
pub https_port: Option<u16>,
// Deployment tools may write fields to the metadata file beyond what we
// use in this type: this type intentionally only names fields that require.
@@ -58,9 +57,6 @@ pub struct ConfigToml {
// types mapped 1:1 into the runtime PageServerConfig type
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub ssl_key_file: Utf8PathBuf,
pub ssl_cert_file: Utf8PathBuf,
pub availability_zone: Option<String>,
#[serde(with = "humantime_serde")]
pub wait_lsn_timeout: Duration,
@@ -131,6 +127,7 @@ pub struct ConfigToml {
pub load_previous_heatmap: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub generate_unarchival_heatmap: Option<bool>,
pub tracing: Option<Tracing>,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -188,6 +185,58 @@ pub enum GetVectoredConcurrentIo {
SidecarTask,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct Ratio {
pub numerator: usize,
pub denominator: usize,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct OtelExporterConfig {
pub endpoint: String,
pub protocol: OtelExporterProtocol,
#[serde(with = "humantime_serde")]
pub timeout: Duration,
}
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum OtelExporterProtocol {
Grpc,
HttpBinary,
HttpJson,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "kebab-case")]
pub struct Tracing {
pub sampling_ratio: Ratio,
pub export_config: OtelExporterConfig,
}
impl From<&OtelExporterConfig> for tracing_utils::ExportConfig {
fn from(val: &OtelExporterConfig) -> Self {
tracing_utils::ExportConfig {
endpoint: Some(val.endpoint.clone()),
protocol: val.protocol.into(),
timeout: val.timeout,
}
}
}
impl From<OtelExporterProtocol> for tracing_utils::Protocol {
fn from(val: OtelExporterProtocol) -> Self {
match val {
OtelExporterProtocol::Grpc => tracing_utils::Protocol::Grpc,
OtelExporterProtocol::HttpJson => tracing_utils::Protocol::HttpJson,
OtelExporterProtocol::HttpBinary => tracing_utils::Protocol::HttpBinary,
}
}
}
pub mod statvfs {
pub mod mock {
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -272,16 +321,15 @@ pub struct TenantConfigToml {
/// size exceeds `compaction_upper_limit * checkpoint_distance`.
pub compaction_upper_limit: usize,
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
/// If true, compact down L0 across all tenant timelines before doing regular compaction. L0
/// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true.
/// If true, compact down L0 across all tenant timelines before doing regular compaction.
pub compaction_l0_first: bool,
/// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only
/// has an effect if `compaction_l0_first` is true. Defaults to true.
/// has an effect if `compaction_l0_first` is `true`.
pub compaction_l0_semaphore: bool,
/// Level0 delta layer threshold at which to delay layer flushes such that they take 2x as long,
/// and block on layer flushes during ephemeral layer rolls, for compaction backpressure. This
/// helps compaction keep up with WAL ingestion, and avoids read amplification blowing up.
/// Should be >compaction_threshold. 0 to disable. Defaults to 3x compaction_threshold.
/// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure,
/// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer
/// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification
/// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default.
pub l0_flush_delay_threshold: Option<usize>,
/// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold
/// to avoid deadlock. 0 to disable. Disabled by default.
@@ -289,8 +337,6 @@ pub struct TenantConfigToml {
/// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next
/// layer. This is a temporary backpressure mechanism which should be removed once
/// l0_flush_{delay,stall}_threshold is fully enabled.
///
/// TODO: this is no longer enabled, remove it when the config option is no longer set.
pub l0_flush_wait_upload: bool,
// Determines how much history is retained, to allow
// branching and read replicas at an older point in time.
@@ -428,9 +474,6 @@ pub mod defaults {
pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
utils::postgres_client::PostgresClientProtocol::Vanilla;
pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
}
impl Default for ConfigToml {
@@ -440,9 +483,6 @@ impl Default for ConfigToml {
Self {
listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
listen_https_addr: (None),
ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE),
ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE),
availability_zone: (None),
wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
.expect("cannot parse default wait lsn timeout")),
@@ -542,6 +582,7 @@ impl Default for ConfigToml {
validate_wal_contiguity: None,
load_previous_heatmap: None,
generate_unarchival_heatmap: None,
tracing: None,
}
}
}
@@ -570,15 +611,13 @@ pub mod tenant_conf_defaults {
// be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So
// with this config, we can get a maximum peak compaction usage of 9 GB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20;
// Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid
// read amp.
pub const DEFAULT_COMPACTION_L0_FIRST: bool = true;
pub const DEFAULT_COMPACTION_L0_FIRST: bool = false;
pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true;
pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
crate::models::CompactionAlgorithm::Legacy;
pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false;
pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true;
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
@@ -589,8 +628,9 @@ pub mod tenant_conf_defaults {
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
// If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
// layer creation will end immediately. Set to 0 to disable.
pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3;
// layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we
// want to enable this feature.
pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0;
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";

View File

@@ -16,30 +16,6 @@ fn test_node_metadata_v1_backward_compatibilty() {
postgres_port: 23,
http_host: "localhost".to_string(),
http_port: 42,
https_port: None,
other: HashMap::new(),
}
)
}
#[test]
fn test_node_metadata_v2_backward_compatibilty() {
let v2 = serde_json::to_vec(&serde_json::json!({
"host": "localhost",
"port": 23,
"http_host": "localhost",
"http_port": 42,
"https_port": 123,
}));
assert_eq!(
serde_json::from_slice::<NodeMetadata>(&v2.unwrap()).unwrap(),
NodeMetadata {
postgres_host: "localhost".to_string(),
postgres_port: 23,
http_host: "localhost".to_string(),
http_port: 42,
https_port: Some(123),
other: HashMap::new(),
}
)

View File

@@ -182,66 +182,20 @@ pub struct TenantDescribeResponseShard {
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
/// Optionally, callers may specify the node they are migrating _from_, and the server will
/// reject the request if the shard is no longer attached there: this enables writing safer
/// clients that don't risk fighting with some other movement of the shard.
#[serde(default)]
pub origin_node_id: Option<NodeId>,
#[serde(default)]
pub migration_config: MigrationConfig,
pub migration_config: Option<MigrationConfig>,
}
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Serialize, Deserialize, Debug)]
pub struct MigrationConfig {
/// If true, the migration will be executed even if it is to a location with a sub-optimal scheduling
/// score: this is usually not what you want, and if you use this then you'll also need to set the
/// tenant's scheduling policy to Essential or Pause to avoid the optimiser reverting your migration.
///
/// Default: false
#[serde(default)]
pub override_scheduler: bool,
/// If true, the migration will be done gracefully by creating a secondary location first and
/// waiting for it to warm up before cutting over. If false, if there is no existing secondary
/// location at the destination, the tenant will be migrated immediately. If the tenant's data
/// can't be downloaded within [`Self::secondary_warmup_timeout`], then the migration will go
/// ahead but run with a cold cache that can severely reduce performance until it warms up.
///
/// When doing a graceful migration, the migration API returns as soon as it is started.
///
/// Default: true
#[serde(default = "default_prewarm")]
pub prewarm: bool,
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
/// overall for secondary warmup before cutting over
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_warmup_timeout: Option<Duration>,
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
/// within each secondary download poll call to pageserver.
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_download_request_timeout: Option<Duration>,
}
fn default_prewarm() -> bool {
true
}
impl Default for MigrationConfig {
fn default() -> Self {
Self {
override_scheduler: false,
prewarm: default_prewarm(),
secondary_warmup_timeout: None,
secondary_download_request_timeout: None,
}
}
}
#[derive(Serialize, Clone, Debug)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
@@ -489,7 +443,6 @@ pub struct SafekeeperDescribeResponse {
pub host: String,
pub port: i32,
pub http_port: i32,
pub https_port: Option<i32>,
pub availability_zone_id: String,
pub scheduling_policy: SkSchedulingPolicy,
}
@@ -534,43 +487,4 @@ mod test {
err
);
}
/// Check that a minimal migrate request with no config results in the expected default settings
#[test]
fn test_migrate_request_decode_defaults() {
let json = r#"{
"node_id": 123
}"#;
let request: TenantShardMigrateRequest = serde_json::from_str(json).unwrap();
assert_eq!(request.node_id, NodeId(123));
assert_eq!(request.origin_node_id, None);
assert!(!request.migration_config.override_scheduler);
assert!(request.migration_config.prewarm);
assert_eq!(request.migration_config.secondary_warmup_timeout, None);
assert_eq!(
request.migration_config.secondary_download_request_timeout,
None
);
}
/// Check that a partially specified migration config results in the expected default settings
#[test]
fn test_migration_config_decode_defaults() {
// Specify just one field of the config
let json = r#"{
}"#;
let config: MigrationConfig = serde_json::from_str(json).unwrap();
// Check each field's expected default value
assert!(!config.override_scheduler);
assert!(config.prewarm);
assert_eq!(config.secondary_warmup_timeout, None);
assert_eq!(config.secondary_download_request_timeout, None);
assert_eq!(config.secondary_warmup_timeout, None);
// Consistency check that the Default impl agrees with our serde defaults
assert_eq!(MigrationConfig::default(), config);
}
}

View File

@@ -176,39 +176,6 @@ impl LsnLease {
}
}
/// Controls the detach ancestor behavior.
/// - When set to `NoAncestorAndReparent`, we will only detach a branch if its ancestor is a root branch. It will automatically reparent any children of the ancestor before and at the branch point.
/// - When set to `MultiLevelAndNoReparent`, we will detach a branch from multiple levels of ancestors, and no reparenting will happen at all.
#[derive(Debug, Clone, Copy, Default)]
pub enum DetachBehavior {
#[default]
NoAncestorAndReparent,
MultiLevelAndNoReparent,
}
impl std::str::FromStr for DetachBehavior {
type Err = &'static str;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"no_ancestor_and_reparent" => Ok(DetachBehavior::NoAncestorAndReparent),
"multi_level_and_no_reparent" => Ok(DetachBehavior::MultiLevelAndNoReparent),
"v1" => Ok(DetachBehavior::NoAncestorAndReparent),
"v2" => Ok(DetachBehavior::MultiLevelAndNoReparent),
_ => Err("cannot parse detach behavior"),
}
}
}
impl std::fmt::Display for DetachBehavior {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
DetachBehavior::NoAncestorAndReparent => write!(f, "no_ancestor_and_reparent"),
DetachBehavior::MultiLevelAndNoReparent => write!(f, "multi_level_and_no_reparent"),
}
}
}
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
///
/// XXX: We used to have more variants here, but now it's just one, which makes this rather
@@ -307,31 +274,6 @@ pub struct TimelineCreateRequest {
pub mode: TimelineCreateRequestMode,
}
/// Storage controller specific extensions to [`TimelineInfo`].
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateResponseStorcon {
#[serde(flatten)]
pub timeline_info: TimelineInfo,
pub safekeepers: Option<SafekeepersInfo>,
}
/// Safekeepers as returned in timeline creation request to storcon or pushed to
/// cplane in the post migration hook.
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeepersInfo {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub generation: u32,
pub safekeepers: Vec<SafekeeperInfo>,
}
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperInfo {
pub id: NodeId,
pub hostname: String,
}
#[derive(Serialize, Deserialize, Clone)]
#[serde(untagged)]
pub enum TimelineCreateRequestMode {
@@ -1204,15 +1146,6 @@ pub struct TimelineArchivalConfigRequest {
pub state: TimelineArchivalState,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct TimelinePatchIndexPartRequest {
pub rel_size_migration: Option<RelSizeMigration>,
pub gc_compaction_last_completed_lsn: Option<Lsn>,
pub applied_gc_cutoff_lsn: Option<Lsn>,
#[serde(default)]
pub force_index_update: bool,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelinesInfoAndOffloaded {
pub timelines: Vec<TimelineInfo>,
@@ -1258,10 +1191,9 @@ pub struct TimelineInfo {
pub last_record_lsn: Lsn,
pub prev_record_lsn: Option<Lsn>,
/// Legacy field, retained for one version to enable old storage controller to
/// decode (it was a mandatory field).
#[serde(default, rename = "latest_gc_cutoff_lsn")]
pub _unused: Lsn,
/// Legacy field for compat with control plane. Synonym of `min_readable_lsn`.
/// TODO: remove once control plane no longer reads it.
pub latest_gc_cutoff_lsn: Lsn,
/// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
/// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
@@ -1510,14 +1442,8 @@ pub struct TenantScanRemoteStorageResponse {
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "snake_case")]
pub enum TenantSorting {
/// Total size of layers on local disk for all timelines in a shard.
ResidentSize,
/// The logical size of the largest timeline within a _tenant_ (not shard). Only tracked on
/// shard 0, contains the sum across all shards.
MaxLogicalSize,
/// The logical size of the largest timeline within a _tenant_ (not shard), divided by number of
/// shards. Only tracked on shard 0, and estimates the per-shard logical size.
MaxLogicalSizePerShard,
}
impl Default for TenantSorting {
@@ -1547,20 +1473,14 @@ pub struct TopTenantShardsRequest {
pub struct TopTenantShardItem {
pub id: TenantShardId,
/// Total size of layers on local disk for all timelines in this shard.
/// Total size of layers on local disk for all timelines in this tenant
pub resident_size: u64,
/// Total size of layers in remote storage for all timelines in this shard.
/// Total size of layers in remote storage for all timelines in this tenant
pub physical_size: u64,
/// The largest logical size of a timeline within this _tenant_ (not shard). This is only
/// tracked on shard 0, and contains the sum of the logical size across all shards.
/// The largest logical size of a timeline within this tenant
pub max_logical_size: u64,
/// The largest logical size of a timeline within this _tenant_ (not shard) divided by number of
/// shards. This is only tracked on shard 0, and is only an estimate as we divide it evenly by
/// shard count, rounded up.
pub max_logical_size_per_shard: u64,
}
#[derive(Serialize, Deserialize, Debug, Default)]

View File

@@ -112,16 +112,6 @@ impl ShardIdentity {
}
}
/// An unsharded identity with the given stripe size (if non-zero). This is typically used to
/// carry over a stripe size for an unsharded tenant from persistent storage.
pub fn unsharded_with_stripe_size(stripe_size: ShardStripeSize) -> Self {
let mut shard_identity = Self::unsharded();
if stripe_size.0 > 0 {
shard_identity.stripe_size = stripe_size;
}
shard_identity
}
/// A broken instance of this type is only used for `TenantState::Broken` tenants,
/// which are constructed in code paths that don't have access to proper configuration.
///

View File

@@ -396,14 +396,6 @@ pub mod waldecoder {
self.lsn + self.inputbuf.remaining() as u64
}
/// Returns the LSN up to which the WAL decoder has processed.
///
/// If [`Self::poll_decode`] returned a record, then this will return
/// the end LSN of said record.
pub fn lsn(&self) -> Lsn {
self.lsn
}
pub fn feed_bytes(&mut self, buf: &[u8]) {
self.inputbuf.extend_from_slice(buf);
}

View File

@@ -135,8 +135,8 @@ impl Type {
pub enum Kind {
/// A simple type like `VARCHAR` or `INTEGER`.
Simple,
/// An enumerated type.
Enum,
/// An enumerated type along with its variants.
Enum(Vec<String>),
/// A pseudo-type.
Pseudo,
/// An array type along with the type of its elements.
@@ -146,9 +146,9 @@ pub enum Kind {
/// A multirange type along with the type of its elements.
Multirange(Type),
/// A domain type along with its underlying type.
Domain(Oid),
/// A composite type.
Composite(Oid),
Domain(Type),
/// A composite type along with information about its fields.
Composite(Vec<Field>),
}
/// Information about a field of a composite type.

View File

@@ -19,10 +19,10 @@ use crate::config::{Host, SslMode};
use crate::connection::{Request, RequestMessages};
use crate::query::RowStream;
use crate::simple_query::SimpleQueryStream;
use crate::types::{Oid, Type};
use crate::types::{Oid, ToSql, Type};
use crate::{
CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Statement, Transaction,
TransactionBuilder, query, simple_query,
CancelToken, Error, ReadyForQueryStatus, Row, SimpleQueryMessage, Statement, Transaction,
TransactionBuilder, query, simple_query, slice_iter,
};
pub struct Responses {
@@ -54,18 +54,26 @@ impl Responses {
/// A cache of type info and prepared statements for fetching type info
/// (corresponding to the queries in the [crate::prepare] module).
#[derive(Default)]
pub(crate) struct CachedTypeInfo {
struct CachedTypeInfo {
/// A statement for basic information for a type from its
/// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its
/// fallback).
pub(crate) typeinfo: Option<Statement>,
typeinfo: Option<Statement>,
/// A statement for getting information for a composite type from its OID.
/// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY).
typeinfo_composite: Option<Statement>,
/// A statement for getting information for a composite type from its OID.
/// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or
/// its fallback).
typeinfo_enum: Option<Statement>,
/// Cache of types already looked up.
pub(crate) types: HashMap<Oid, Type>,
types: HashMap<Oid, Type>,
}
pub struct InnerClient {
sender: mpsc::UnboundedSender<Request>,
cached_typeinfo: Mutex<CachedTypeInfo>,
/// A buffer to use when writing out postgres commands.
buffer: Mutex<BytesMut>,
@@ -83,6 +91,38 @@ impl InnerClient {
})
}
pub fn typeinfo(&self) -> Option<Statement> {
self.cached_typeinfo.lock().typeinfo.clone()
}
pub fn set_typeinfo(&self, statement: &Statement) {
self.cached_typeinfo.lock().typeinfo = Some(statement.clone());
}
pub fn typeinfo_composite(&self) -> Option<Statement> {
self.cached_typeinfo.lock().typeinfo_composite.clone()
}
pub fn set_typeinfo_composite(&self, statement: &Statement) {
self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone());
}
pub fn typeinfo_enum(&self) -> Option<Statement> {
self.cached_typeinfo.lock().typeinfo_enum.clone()
}
pub fn set_typeinfo_enum(&self, statement: &Statement) {
self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone());
}
pub fn type_(&self, oid: Oid) -> Option<Type> {
self.cached_typeinfo.lock().types.get(&oid).cloned()
}
pub fn set_type(&self, oid: Oid, type_: &Type) {
self.cached_typeinfo.lock().types.insert(oid, type_.clone());
}
/// Call the given function with a buffer to be used when writing out
/// postgres commands.
pub fn with_buf<F, R>(&self, f: F) -> R
@@ -102,6 +142,7 @@ pub struct SocketConfig {
pub host: Host,
pub port: u16,
pub connect_timeout: Option<Duration>,
// pub keepalive: Option<KeepaliveConfig>,
}
/// An asynchronous PostgreSQL client.
@@ -110,7 +151,6 @@ pub struct SocketConfig {
/// through this client object.
pub struct Client {
inner: Arc<InnerClient>,
cached_typeinfo: CachedTypeInfo,
socket_config: SocketConfig,
ssl_mode: SslMode,
@@ -129,9 +169,9 @@ impl Client {
Client {
inner: Arc::new(InnerClient {
sender,
cached_typeinfo: Default::default(),
buffer: Default::default(),
}),
cached_typeinfo: Default::default(),
socket_config,
ssl_mode,
@@ -149,6 +189,55 @@ impl Client {
&self.inner
}
/// Executes a statement, returning a vector of the resulting rows.
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
/// provided, 1-indexed.
///
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
/// with the `prepare` method.
///
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
pub async fn query(
&self,
statement: Statement,
params: &[&(dyn ToSql + Sync)],
) -> Result<Vec<Row>, Error> {
self.query_raw(statement, slice_iter(params))
.await?
.try_collect()
.await
}
/// The maximally flexible version of [`query`].
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
/// provided, 1-indexed.
///
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
/// with the `prepare` method.
///
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
///
/// [`query`]: #method.query
pub async fn query_raw<'a, I>(
&self,
statement: Statement,
params: I,
) -> Result<RowStream, Error>
where
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
I::IntoIter: ExactSizeIterator,
{
query::query(&self.inner, statement, params).await
}
/// Pass text directly to the Postgres backend to allow it to sort out typing itself and
/// to save a roundtrip
pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
@@ -195,14 +284,6 @@ impl Client {
simple_query::batch_execute(self.inner(), query).await
}
pub async fn discard_all(&mut self) -> Result<ReadyForQueryStatus, Error> {
// clear the prepared statements that are about to be nuked from the postgres session
self.cached_typeinfo.typeinfo = None;
self.batch_execute("discard all").await
}
/// Begins a new database transaction.
///
/// The transaction will roll back by default - use the `commit` method to commit it.
@@ -266,8 +347,8 @@ impl Client {
}
/// Query for type information
pub(crate) async fn get_type_inner(&mut self, oid: Oid) -> Result<Type, Error> {
crate::prepare::get_type(&self.inner, &mut self.cached_typeinfo, oid).await
pub async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
crate::prepare::get_type(&self.inner, oid).await
}
/// Determines if the connection to the server has already closed.

View File

@@ -22,7 +22,7 @@ pub trait GenericClient: private::Sealed {
I::IntoIter: ExactSizeIterator + Sync + Send;
/// Query for type information
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error>;
async fn get_type(&self, oid: Oid) -> Result<Type, Error>;
}
impl private::Sealed for Client {}
@@ -38,8 +38,8 @@ impl GenericClient for Client {
}
/// Query for type information
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
self.get_type_inner(oid).await
async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
crate::prepare::get_type(self.inner(), oid).await
}
}
@@ -56,7 +56,7 @@ impl GenericClient for Transaction<'_> {
}
/// Query for type information
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
self.client_mut().get_type(oid).await
async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
self.client().get_type(oid).await
}
}

View File

@@ -9,10 +9,10 @@ use log::debug;
use postgres_protocol2::message::backend::Message;
use postgres_protocol2::message::frontend;
use crate::client::{CachedTypeInfo, InnerClient};
use crate::client::InnerClient;
use crate::codec::FrontendMessage;
use crate::connection::RequestMessages;
use crate::types::{Kind, Oid, Type};
use crate::types::{Field, Kind, Oid, Type};
use crate::{Column, Error, Statement, query, slice_iter};
pub(crate) const TYPEINFO_QUERY: &str = "\
@@ -23,7 +23,23 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
WHERE t.oid = $1
";
async fn prepare_typecheck(
const TYPEINFO_ENUM_QUERY: &str = "\
SELECT enumlabel
FROM pg_catalog.pg_enum
WHERE enumtypid = $1
ORDER BY enumsortorder
";
pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\
SELECT attname, atttypid
FROM pg_catalog.pg_attribute
WHERE attrelid = $1
AND NOT attisdropped
AND attnum > 0
ORDER BY attnum
";
pub async fn prepare(
client: &Arc<InnerClient>,
name: &'static str,
query: &str,
@@ -51,7 +67,7 @@ async fn prepare_typecheck(
let mut parameters = vec![];
let mut it = parameter_description.parameters();
while let Some(oid) = it.next().map_err(Error::parse)? {
let type_ = Type::from_oid(oid).ok_or_else(Error::unexpected_message)?;
let type_ = get_type(client, oid).await?;
parameters.push(type_);
}
@@ -59,7 +75,7 @@ async fn prepare_typecheck(
if let Some(row_description) = row_description {
let mut it = row_description.fields();
while let Some(field) = it.next().map_err(Error::parse)? {
let type_ = Type::from_oid(field.type_oid()).ok_or_else(Error::unexpected_message)?;
let type_ = get_type(client, field.type_oid()).await?;
let column = Column::new(field.name().to_string(), type_, field);
columns.push(column);
}
@@ -68,6 +84,15 @@ async fn prepare_typecheck(
Ok(Statement::new(client, name, parameters, columns))
}
fn prepare_rec<'a>(
client: &'a Arc<InnerClient>,
name: &'static str,
query: &'a str,
types: &'a [Type],
) -> Pin<Box<dyn Future<Output = Result<Statement, Error>> + 'a + Send>> {
Box::pin(prepare(client, name, query, types))
}
fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
if types.is_empty() {
debug!("preparing query {}: {}", name, query);
@@ -83,20 +108,16 @@ fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Resu
})
}
pub async fn get_type(
client: &Arc<InnerClient>,
typecache: &mut CachedTypeInfo,
oid: Oid,
) -> Result<Type, Error> {
pub async fn get_type(client: &Arc<InnerClient>, oid: Oid) -> Result<Type, Error> {
if let Some(type_) = Type::from_oid(oid) {
return Ok(type_);
}
if let Some(type_) = typecache.types.get(&oid) {
return Ok(type_.clone());
};
if let Some(type_) = client.type_(oid) {
return Ok(type_);
}
let stmt = typeinfo_statement(client, typecache).await?;
let stmt = typeinfo_statement(client).await?;
let rows = query::query(client, stmt, slice_iter(&[&oid])).await?;
pin_mut!(rows);
@@ -115,48 +136,100 @@ pub async fn get_type(
let relid: Oid = row.try_get(6)?;
let kind = if type_ == b'e' as i8 {
Kind::Enum
let variants = get_enum_variants(client, oid).await?;
Kind::Enum(variants)
} else if type_ == b'p' as i8 {
Kind::Pseudo
} else if basetype != 0 {
Kind::Domain(basetype)
let type_ = get_type_rec(client, basetype).await?;
Kind::Domain(type_)
} else if elem_oid != 0 {
let type_ = get_type_rec(client, typecache, elem_oid).await?;
let type_ = get_type_rec(client, elem_oid).await?;
Kind::Array(type_)
} else if relid != 0 {
Kind::Composite(relid)
let fields = get_composite_fields(client, relid).await?;
Kind::Composite(fields)
} else if let Some(rngsubtype) = rngsubtype {
let type_ = get_type_rec(client, typecache, rngsubtype).await?;
let type_ = get_type_rec(client, rngsubtype).await?;
Kind::Range(type_)
} else {
Kind::Simple
};
let type_ = Type::new(name, oid, kind, schema);
typecache.types.insert(oid, type_.clone());
client.set_type(oid, &type_);
Ok(type_)
}
fn get_type_rec<'a>(
client: &'a Arc<InnerClient>,
typecache: &'a mut CachedTypeInfo,
oid: Oid,
) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + 'a>> {
Box::pin(get_type(client, typecache, oid))
Box::pin(get_type(client, oid))
}
async fn typeinfo_statement(
client: &Arc<InnerClient>,
typecache: &mut CachedTypeInfo,
) -> Result<Statement, Error> {
if let Some(stmt) = &typecache.typeinfo {
return Ok(stmt.clone());
async fn typeinfo_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
if let Some(stmt) = client.typeinfo() {
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo";
let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY, &[]).await?;
let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?;
typecache.typeinfo = Some(stmt.clone());
client.set_typeinfo(&stmt);
Ok(stmt)
}
async fn get_enum_variants(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<String>, Error> {
let stmt = typeinfo_enum_statement(client).await?;
query::query(client, stmt, slice_iter(&[&oid]))
.await?
.and_then(|row| async move { row.try_get(0) })
.try_collect()
.await
}
async fn typeinfo_enum_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
if let Some(stmt) = client.typeinfo_enum() {
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo_enum";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?;
client.set_typeinfo_enum(&stmt);
Ok(stmt)
}
async fn get_composite_fields(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<Field>, Error> {
let stmt = typeinfo_composite_statement(client).await?;
let rows = query::query(client, stmt, slice_iter(&[&oid]))
.await?
.try_collect::<Vec<_>>()
.await?;
let mut fields = vec![];
for row in rows {
let name = row.try_get(0)?;
let oid = row.try_get(1)?;
let type_ = get_type_rec(client, oid).await?;
fields.push(Field::new(name, type_));
}
Ok(fields)
}
async fn typeinfo_composite_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
if let Some(stmt) = client.typeinfo_composite() {
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo_composite";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
client.set_typeinfo_composite(&stmt);
Ok(stmt)
}

View File

@@ -72,9 +72,4 @@ impl<'a> Transaction<'a> {
pub fn client(&self) -> &Client {
self.client
}
/// Returns a reference to the underlying `Client`.
pub fn client_mut(&mut self) -> &mut Client {
self.client
}
}

View File

@@ -208,6 +208,7 @@ pub(crate) fn ensure_logging_ready() {
utils::logging::init(
utils::logging::LogFormat::Test,
utils::logging::TracingErrorLayerEnablement::Disabled,
utils::logging::OtelEnablement::Disabled,
utils::logging::Output::Stdout,
)
.expect("logging init failed");

View File

@@ -131,14 +131,6 @@ impl Configuration {
}
}
pub fn new(members: MemberSet) -> Self {
Configuration {
generation: INITIAL_GENERATION,
members,
new_members: None,
}
}
/// Is `sk_id` member of the configuration?
pub fn contains(&self, sk_id: NodeId) -> bool {
self.members.contains(sk_id) || self.new_members.as_ref().is_some_and(|m| m.contains(sk_id))

View File

@@ -18,7 +18,7 @@ pub struct SafekeeperStatus {
pub id: NodeId,
}
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
@@ -221,11 +221,6 @@ pub struct TimelineMembershipSwitchResponse {
pub current_conf: Configuration,
}
#[derive(Clone, Copy, Serialize, Deserialize)]
pub struct TimelineDeleteResult {
pub dir_existed: bool,
}
fn lsn_invalid() -> Lsn {
Lsn::INVALID
}
@@ -288,7 +283,7 @@ pub struct SafekeeperUtilization {
}
/// pull_timeline request body.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize)]
pub struct PullTimelineRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,

View File

@@ -14,6 +14,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
pin-project-lite.workspace = true
[dev-dependencies]
tracing-subscriber.workspace = true # For examples in docs

View File

@@ -31,19 +31,17 @@
//! .init();
//! }
//! ```
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod http;
pub mod perf_span;
use opentelemetry::KeyValue;
use opentelemetry::trace::TracerProvider;
use opentelemetry_otlp::WithExportConfig;
pub use opentelemetry_otlp::{ExportConfig, Protocol};
use tracing::level_filters::LevelFilter;
use tracing::{Dispatch, Subscriber};
use tracing::Subscriber;
use tracing_subscriber::Layer;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::registry::LookupSpan;
/// Set up OpenTelemetry exporter, using configuration from environment variables.
@@ -135,9 +133,7 @@ fn init_tracing_internal<S>(service_name: String, export_config: ExportConfig) -
where
S: Subscriber + for<'span> LookupSpan<'span>,
{
// Sets up exporter from the provided [`ExportConfig`] parameter.
// If the endpoint is not specified, it is loaded from the
// OTEL_EXPORTER_OTLP_ENDPOINT environment variable.
// Sets up exporter from the OTEL_EXPORTER_* environment variables.
let exporter = opentelemetry_otlp::SpanExporter::builder()
.with_http()
.with_export_config(export_config)
@@ -170,51 +166,3 @@ where
pub fn shutdown_tracing() {
opentelemetry::global::shutdown_tracer_provider();
}
pub enum OtelEnablement {
Disabled,
Enabled {
service_name: String,
export_config: ExportConfig,
runtime: &'static tokio::runtime::Runtime,
},
}
pub struct OtelGuard {
pub dispatch: Dispatch,
}
impl Drop for OtelGuard {
fn drop(&mut self) {
shutdown_tracing();
}
}
/// Initializes OTEL infrastructure for performance tracing according to the provided configuration
///
/// Performance tracing is handled by a different [`tracing::Subscriber`]. This functions returns
/// an [`OtelGuard`] containing a [`tracing::Dispatch`] associated with a newly created subscriber.
/// Applications should use this dispatch for their performance traces.
///
/// The lifetime of the guard should match taht of the application. On drop, it tears down the
/// OTEL infra.
pub fn init_performance_tracing(otel_enablement: OtelEnablement) -> Option<OtelGuard> {
let otel_subscriber = match otel_enablement {
OtelEnablement::Disabled => None,
OtelEnablement::Enabled {
service_name,
export_config,
runtime,
} => {
let otel_layer = runtime
.block_on(init_tracing(&service_name, export_config))
.with_filter(LevelFilter::INFO);
let otel_subscriber = tracing_subscriber::registry().with(otel_layer);
let otel_dispatch = Dispatch::new(otel_subscriber);
Some(otel_dispatch)
}
};
otel_subscriber.map(|dispatch| OtelGuard { dispatch })
}

View File

@@ -0,0 +1,149 @@
//! Crutch module to work around tracing infrastructure deficiencies
//!
//! We wish to collect granular request spans without impacting performance
//! by much. Ideally, we should have zero overhead for a sampling rate of 0.
//!
//! The approach taken by the pageserver crate is to use a completely different
//! span hierarchy for the performance spans. Spans are explicitly stored in
//! the request context and use a different [`tracing::Subscriber`] in order
//! to avoid expensive filtering.
//!
//! [`tracing::Span`] instances record their [`tracing::Dispatch`] and, implcitly,
//! their [`tracing::Subscriber`] at creation time. However, upon exiting the span,
//! the global default [`tracing::Dispatch`] is used. This is problematic if one
//! wishes to juggle different subscribers.
//!
//! In order to work around this, this module provides a [`PerfSpan`] type which
//! wraps a [`Span`] and sets the default subscriber when exiting the span. This
//! achieves the correct routing.
//!
//! There's also a modified version of [`tracing::Instrument`] which works with
//! [`PerfSpan`].
use core::{
future::Future,
marker::Sized,
mem::ManuallyDrop,
pin::Pin,
task::{Context, Poll},
};
use pin_project_lite::pin_project;
use tracing::{Dispatch, field, span::Span};
#[derive(Debug, Clone)]
pub struct PerfSpan {
inner: ManuallyDrop<Span>,
dispatch: Dispatch,
}
#[must_use = "once a span has been entered, it should be exited"]
pub struct PerfSpanEntered<'a> {
span: &'a PerfSpan,
}
impl PerfSpan {
pub fn new(span: Span, dispatch: Dispatch) -> Self {
Self {
inner: ManuallyDrop::new(span),
dispatch,
}
}
pub fn record<Q: field::AsField + ?Sized, V: field::Value>(
&self,
field: &Q,
value: V,
) -> &Self {
self.inner.record(field, value);
self
}
pub fn enter(&self) -> PerfSpanEntered {
PerfSpanEntered { span: self }
}
pub fn inner(&self) -> &Span {
&self.inner
}
}
impl Drop for PerfSpan {
fn drop(&mut self) {
// Bring the desired dispatch into scope before explicitly calling
// the span destructor. This routes the span exit to the correct
// [`tracing::Subscriber`].
let _dispatch_guard = tracing::dispatcher::set_default(&self.dispatch);
// SAFETY: ManuallyDrop in Drop implementation
unsafe { ManuallyDrop::drop(&mut self.inner) }
}
}
impl Drop for PerfSpanEntered<'_> {
fn drop(&mut self) {
assert!(self.span.inner.id().is_some());
let _dispatch_guard = tracing::dispatcher::set_default(&self.span.dispatch);
self.span.dispatch.exit(&self.span.inner.id().unwrap());
}
}
pub trait PerfInstrument: Sized {
fn instrument(self, span: PerfSpan) -> PerfInstrumented<Self> {
PerfInstrumented {
inner: ManuallyDrop::new(self),
span,
}
}
}
pin_project! {
#[project = PerfInstrumentedProj]
#[derive(Debug, Clone)]
#[must_use = "futures do nothing unless you `.await` or poll them"]
pub struct PerfInstrumented<T> {
// `ManuallyDrop` is used here to to enter instrument `Drop` by entering
// `Span` and executing `ManuallyDrop::drop`.
#[pin]
inner: ManuallyDrop<T>,
span: PerfSpan,
}
impl<T> PinnedDrop for PerfInstrumented<T> {
fn drop(this: Pin<&mut Self>) {
let this = this.project();
let _enter = this.span.enter();
// SAFETY: 1. `Pin::get_unchecked_mut()` is safe, because this isn't
// different from wrapping `T` in `Option` and calling
// `Pin::set(&mut this.inner, None)`, except avoiding
// additional memory overhead.
// 2. `ManuallyDrop::drop()` is safe, because
// `PinnedDrop::drop()` is guaranteed to be called only
// once.
unsafe { ManuallyDrop::drop(this.inner.get_unchecked_mut()) }
}
}
}
impl<'a, T> PerfInstrumentedProj<'a, T> {
/// Get a mutable reference to the [`Span`] a pinned mutable reference to
/// the wrapped type.
fn span_and_inner_pin_mut(self) -> (&'a mut PerfSpan, Pin<&'a mut T>) {
// SAFETY: As long as `ManuallyDrop<T>` does not move, `T` won't move
// and `inner` is valid, because `ManuallyDrop::drop` is called
// only inside `Drop` of the `Instrumented`.
let inner = unsafe { self.inner.map_unchecked_mut(|v| &mut **v) };
(self.span, inner)
}
}
impl<T: Future> Future for PerfInstrumented<T> {
type Output = T::Output;
fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
let (span, inner) = self.project().span_and_inner_pin_mut();
let _enter = span.enter();
inner.poll(cx)
}
}
impl<T: Sized> PerfInstrument for T {}

View File

@@ -49,13 +49,7 @@ pub fn bench_log_slow(c: &mut Criterion) {
// performance too. Use a simple noop future that yields once, to avoid any scheduler fast
// paths for a ready future.
if enabled {
b.iter(|| {
runtime.block_on(log_slow(
"ready",
THRESHOLD,
std::pin::pin!(tokio::task::yield_now()),
))
});
b.iter(|| runtime.block_on(log_slow("ready", THRESHOLD, tokio::task::yield_now())));
} else {
b.iter(|| runtime.block_on(tokio::task::yield_now()));
}

View File

@@ -7,7 +7,9 @@ use metrics::{IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use strum_macros::{EnumString, VariantNames};
use tokio::time::Instant;
use tracing::Dispatch;
use tracing::info;
use tracing::level_filters::LevelFilter;
/// Logs a critical error, similarly to `tracing::error!`. This will:
///
@@ -125,6 +127,15 @@ pub enum TracingErrorLayerEnablement {
EnableWithRustLogFilter,
}
pub enum OtelEnablement {
Disabled,
Enabled {
service_name: String,
export_config: tracing_utils::ExportConfig,
runtime: &'static tokio::runtime::Runtime,
},
}
/// Where the logging should output to.
#[derive(Clone, Copy)]
pub enum Output {
@@ -132,11 +143,24 @@ pub enum Output {
Stderr,
}
pub struct OtelGuard {
pub dispatch: Dispatch,
}
impl Drop for OtelGuard {
fn drop(&mut self) {
tracing_utils::shutdown_tracing();
}
}
pub const PERF_TRACE_TARGET: &str = "P";
pub fn init(
log_format: LogFormat,
tracing_error_layer_enablement: TracingErrorLayerEnablement,
otel_enablement: OtelEnablement,
output: Output,
) -> anyhow::Result<()> {
) -> anyhow::Result<Option<OtelGuard>> {
// We fall back to printing all spans at info-level or above if
// the RUST_LOG environment variable is not set.
let rust_log_env_filter = || {
@@ -176,7 +200,26 @@ pub fn init(
TracingErrorLayerEnablement::Disabled => r.init(),
}
Ok(())
let otel_subscriber = match otel_enablement {
OtelEnablement::Disabled => None,
OtelEnablement::Enabled {
service_name,
export_config,
runtime,
} => {
let otel_layer = runtime
.block_on(tracing_utils::init_tracing(&service_name, export_config))
.with_filter(LevelFilter::INFO);
let otel_subscriber = tracing_subscriber::registry().with(otel_layer);
let otel_dispatch = Dispatch::new(otel_subscriber);
Some(otel_dispatch)
}
};
let otel_guard = otel_subscriber.map(|dispatch| OtelGuard { dispatch });
Ok(otel_guard)
}
/// Disable the default rust panic hook by using `set_hook`.
@@ -331,90 +374,37 @@ impl std::fmt::Debug for SecretString {
///
/// TODO: consider upgrading this to a warning, but currently it fires too often.
#[inline]
pub async fn log_slow<F, O>(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O
where
F: Future<Output = O>,
{
monitor_slow_future(
threshold,
threshold, // period = threshold
f,
|MonitorSlowFutureCallback {
ready,
is_slow,
elapsed_total,
elapsed_since_last_callback: _,
}| {
if !is_slow {
return;
}
if ready {
info!(
"slow {name} completed after {:.3}s",
elapsed_total.as_secs_f64()
);
} else {
info!(
"slow {name} still running after {:.3}s",
elapsed_total.as_secs_f64()
);
}
},
)
.await
}
pub async fn log_slow<O>(name: &str, threshold: Duration, f: impl Future<Output = O>) -> O {
// TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
// won't fit on the stack.
let mut f = Box::pin(f);
/// Poll future `fut` to completion, invoking callback `cb` at the given `threshold` and every
/// `period` afterwards, and also unconditionally when the future completes.
#[inline]
pub async fn monitor_slow_future<F, O>(
threshold: Duration,
period: Duration,
mut fut: std::pin::Pin<&mut F>,
mut cb: impl FnMut(MonitorSlowFutureCallback),
) -> O
where
F: Future<Output = O>,
{
let started = Instant::now();
let mut attempt = 1;
let mut last_cb = started;
loop {
// NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common
// case where the timeout doesn't fire.
let deadline = started + threshold + (attempt - 1) * period;
// TODO: still call the callback if the future panics? Copy how we do it for the page_service flush_in_progress counter.
let res = tokio::time::timeout_at(deadline, &mut fut).await;
let now = Instant::now();
let elapsed_total = now - started;
cb(MonitorSlowFutureCallback {
ready: res.is_ok(),
is_slow: elapsed_total >= threshold,
elapsed_total,
elapsed_since_last_callback: now - last_cb,
});
last_cb = now;
if let Ok(output) = res {
let deadline = started + attempt * threshold;
if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await {
// NB: we check if we exceeded the threshold even if the timeout never fired, because
// scheduling or execution delays may cause the future to succeed even if it exceeds the
// timeout. This costs an extra unconditional clock reading, but seems worth it to avoid
// false negatives.
let elapsed = started.elapsed();
if elapsed >= threshold {
info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64());
}
return output;
}
let elapsed = started.elapsed().as_secs_f64();
info!("slow {name} still running after {elapsed:.3}s",);
attempt += 1;
}
}
/// See [`monitor_slow_future`].
pub struct MonitorSlowFutureCallback {
/// Whether the future completed. If true, there will be no more callbacks.
pub ready: bool,
/// Whether the future is taking `>=` the specififed threshold duration to complete.
/// Monotonic: if true in one callback invocation, true in all subsequent onces.
pub is_slow: bool,
/// The time elapsed since the [`monitor_slow_future`] was first polled.
pub elapsed_total: Duration,
/// The time elapsed since the last callback invocation.
/// For the initial callback invocation, the time elapsed since the [`monitor_slow_future`] was first polled.
pub elapsed_since_last_callback: Duration,
}
#[cfg(test)]
mod tests {
use metrics::IntCounterVec;

View File

@@ -48,7 +48,6 @@ pprof.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true
rustls.workspace = true
scopeguard.workspace = true
send-future.workspace = true
serde.workspace = true
@@ -63,7 +62,6 @@ tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util"
tokio-epoll-uring.workspace = true
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
tokio-rustls.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = [ "serde" ] }
@@ -101,7 +99,6 @@ criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
indoc.workspace = true
uuid.workspace = true
[[bench]]
name = "bench_layer_map"
@@ -119,10 +116,6 @@ harness = false
name = "upload_queue"
harness = false
[[bench]]
name = "bench_metrics"
harness = false
[[bin]]
name = "test_helper_slow_client_reads"
required-features = [ "testing" ]

View File

@@ -1,366 +0,0 @@
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use utils::id::{TenantId, TimelineId};
//
// Demonstrates that repeat label values lookup is a multicore scalability bottleneck
// that is worth avoiding.
//
criterion_group!(
label_values,
label_values::bench_naive_usage,
label_values::bench_cache_label_values_lookup
);
mod label_values {
use super::*;
pub fn bench_naive_usage(c: &mut Criterion) {
let mut g = c.benchmark_group("label_values__naive_usage");
for ntimelines in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("ntimelines", ntimelines),
&ntimelines,
|b, ntimelines| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*ntimelines + 1);
let timelines = (0..*ntimelines)
.map(|_| {
(
TenantId::generate().to_string(),
"0000".to_string(),
TimelineId::generate().to_string(),
)
})
.collect::<Vec<_>>();
let metric_vec = metrics::UIntGaugeVec::new(
metrics::opts!("testmetric", "testhelp"),
&["tenant_id", "shard_id", "timeline_id"],
)
.unwrap();
std::thread::scope(|s| {
for (tenant_id, shard_id, timeline_id) in &timelines {
s.spawn(|| {
barrier.wait();
for _ in 0..iters {
metric_vec
.with_label_values(&[tenant_id, shard_id, timeline_id])
.inc();
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
pub fn bench_cache_label_values_lookup(c: &mut Criterion) {
let mut g = c.benchmark_group("label_values__cache_label_values_lookup");
for ntimelines in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("ntimelines", ntimelines),
&ntimelines,
|b, ntimelines| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*ntimelines + 1);
let timelines = (0..*ntimelines)
.map(|_| {
(
TenantId::generate().to_string(),
"0000".to_string(),
TimelineId::generate().to_string(),
)
})
.collect::<Vec<_>>();
let metric_vec = metrics::UIntGaugeVec::new(
metrics::opts!("testmetric", "testhelp"),
&["tenant_id", "shard_id", "timeline_id"],
)
.unwrap();
std::thread::scope(|s| {
for (tenant_id, shard_id, timeline_id) in &timelines {
s.spawn(|| {
let metric = metric_vec.with_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
barrier.wait();
for _ in 0..iters {
metric.inc();
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
}
//
// Demonstrates that even a single metric can be a scalability bottleneck
// if multiple threads in it concurrently but there's nothing we can do
// about it without changing the metrics framework to use e.g. sharded counte atomics.
//
criterion_group!(
single_metric_multicore_scalability,
single_metric_multicore_scalability::bench,
);
mod single_metric_multicore_scalability {
use super::*;
pub fn bench(c: &mut Criterion) {
let mut g = c.benchmark_group("single_metric_multicore_scalability");
for nthreads in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("nthreads", nthreads),
&nthreads,
|b, nthreads| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*nthreads + 1);
let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
std::thread::scope(|s| {
for _ in 0..*nthreads {
s.spawn(|| {
barrier.wait();
for _ in 0..iters {
metric.inc();
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
}
//
// Demonstrates that even if we cache label value, the propagation of such a cached metric value
// by Clone'ing it is a scalability bottleneck.
// The reason is that it's an Arc internally and thus there's contention on the reference count atomics.
//
// We can avoid that by having long-lived references per thread (= indirection).
//
criterion_group!(
propagation_of_cached_label_value,
propagation_of_cached_label_value::bench_naive,
propagation_of_cached_label_value::bench_long_lived_reference_per_thread,
);
mod propagation_of_cached_label_value {
use std::sync::Arc;
use super::*;
pub fn bench_naive(c: &mut Criterion) {
let mut g = c.benchmark_group("propagation_of_cached_label_value__naive");
for nthreads in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("nthreads", nthreads),
&nthreads,
|b, nthreads| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*nthreads + 1);
let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
std::thread::scope(|s| {
for _ in 0..*nthreads {
s.spawn(|| {
barrier.wait();
for _ in 0..iters {
// propagating the metric means we'd clone it into the child RequestContext
let propagated = metric.clone();
// simulate some work
criterion::black_box(propagated);
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
pub fn bench_long_lived_reference_per_thread(c: &mut Criterion) {
let mut g =
c.benchmark_group("propagation_of_cached_label_value__long_lived_reference_per_thread");
for nthreads in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("nthreads", nthreads),
&nthreads,
|b, nthreads| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*nthreads + 1);
let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
std::thread::scope(|s| {
for _ in 0..*nthreads {
s.spawn(|| {
// This is the technique.
let this_threads_metric_reference = Arc::new(metric.clone());
barrier.wait();
for _ in 0..iters {
// propagating the metric means we'd clone it into the child RequestContext
let propagated = Arc::clone(&this_threads_metric_reference);
// simulate some work (include the pointer chase!)
criterion::black_box(&*propagated);
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
}
}
criterion_main!(
label_values,
single_metric_multicore_scalability,
propagation_of_cached_label_value
);
/*
RUST_BACKTRACE=full cargo bench --bench bench_metrics -- --discard-baseline --noplot
Results on an im4gn.2xlarge instance
label_values__naive_usage/ntimelines/1 time: [178.71 ns 178.74 ns 178.76 ns]
label_values__naive_usage/ntimelines/4 time: [532.94 ns 539.59 ns 546.31 ns]
label_values__naive_usage/ntimelines/8 time: [1.1082 µs 1.1109 µs 1.1135 µs]
label_values__cache_label_values_lookup/ntimelines/1 time: [6.4116 ns 6.4119 ns 6.4123 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [6.3482 ns 6.3819 ns 6.4079 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [6.4213 ns 6.5279 ns 6.6293 ns]
single_metric_multicore_scalability/nthreads/1 time: [6.0102 ns 6.0104 ns 6.0106 ns]
single_metric_multicore_scalability/nthreads/4 time: [38.127 ns 38.275 ns 38.416 ns]
single_metric_multicore_scalability/nthreads/8 time: [73.698 ns 74.882 ns 75.864 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [14.424 ns 14.425 ns 14.426 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [100.71 ns 102.53 ns 104.35 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [211.50 ns 214.44 ns 216.87 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.135 ns 14.147 ns 14.160 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.243 ns 14.255 ns 14.268 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [14.470 ns 14.682 ns 14.895 ns]
Results on an i3en.3xlarge instance
label_values__naive_usage/ntimelines/1 time: [117.32 ns 117.53 ns 117.74 ns]
label_values__naive_usage/ntimelines/4 time: [736.58 ns 741.12 ns 745.61 ns]
label_values__naive_usage/ntimelines/8 time: [1.4513 µs 1.4596 µs 1.4665 µs]
label_values__cache_label_values_lookup/ntimelines/1 time: [8.0964 ns 8.0979 ns 8.0995 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [8.1620 ns 8.2912 ns 8.4491 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [14.148 ns 14.237 ns 14.324 ns]
single_metric_multicore_scalability/nthreads/1 time: [8.0993 ns 8.1013 ns 8.1046 ns]
single_metric_multicore_scalability/nthreads/4 time: [80.039 ns 80.672 ns 81.297 ns]
single_metric_multicore_scalability/nthreads/8 time: [153.58 ns 154.23 ns 154.90 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [13.924 ns 13.926 ns 13.928 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [143.66 ns 145.27 ns 146.59 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [296.51 ns 297.90 ns 299.30 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.013 ns 14.149 ns 14.308 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.311 ns 14.625 ns 14.984 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [25.981 ns 26.227 ns 26.476 ns]
Results on an Standard L16s v3 (16 vcpus, 128 GiB memory) Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
label_values__naive_usage/ntimelines/1 time: [101.63 ns 101.84 ns 102.06 ns]
label_values__naive_usage/ntimelines/4 time: [417.55 ns 424.73 ns 432.63 ns]
label_values__naive_usage/ntimelines/8 time: [874.91 ns 889.51 ns 904.25 ns]
label_values__cache_label_values_lookup/ntimelines/1 time: [5.7724 ns 5.7760 ns 5.7804 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [7.8878 ns 7.9401 ns 8.0034 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [7.2621 ns 7.6354 ns 8.0337 ns]
single_metric_multicore_scalability/nthreads/1 time: [5.7710 ns 5.7744 ns 5.7785 ns]
single_metric_multicore_scalability/nthreads/4 time: [66.629 ns 66.994 ns 67.336 ns]
single_metric_multicore_scalability/nthreads/8 time: [130.85 ns 131.98 ns 132.91 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [11.540 ns 11.546 ns 11.553 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [131.22 ns 131.90 ns 132.56 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [260.99 ns 262.75 ns 264.26 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [11.544 ns 11.550 ns 11.557 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [11.568 ns 11.642 ns 11.763 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [13.416 ns 14.121 ns 14.886 ns
Results on an M4 MAX MacBook Pro Total Number of Cores: 14 (10 performance and 4 efficiency)
label_values__naive_usage/ntimelines/1 time: [52.711 ns 53.026 ns 53.381 ns]
label_values__naive_usage/ntimelines/4 time: [323.99 ns 330.40 ns 337.53 ns]
label_values__naive_usage/ntimelines/8 time: [1.1615 µs 1.1998 µs 1.2399 µs]
label_values__cache_label_values_lookup/ntimelines/1 time: [1.6635 ns 1.6715 ns 1.6809 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [1.7786 ns 1.7876 ns 1.8028 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [1.8195 ns 1.8371 ns 1.8665 ns]
single_metric_multicore_scalability/nthreads/1 time: [1.7764 ns 1.7909 ns 1.8079 ns]
single_metric_multicore_scalability/nthreads/4 time: [33.875 ns 34.868 ns 35.923 ns]
single_metric_multicore_scalability/nthreads/8 time: [226.85 ns 235.30 ns 244.18 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [3.4337 ns 3.4491 ns 3.4660 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [69.486 ns 71.937 ns 74.472 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [434.87 ns 456.47 ns 477.84 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [3.3767 ns 3.3974 ns 3.4220 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [3.6105 ns 4.2355 ns 5.1463 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [4.0889 ns 4.9714 ns 6.0779 ns]
Results on a Hetzner AX102 AMD Ryzen 9 7950X3D 16-Core Processor
label_values__naive_usage/ntimelines/1 time: [64.510 ns 64.559 ns 64.610 ns]
label_values__naive_usage/ntimelines/4 time: [309.71 ns 326.09 ns 342.32 ns]
label_values__naive_usage/ntimelines/8 time: [776.92 ns 819.35 ns 856.93 ns]
label_values__cache_label_values_lookup/ntimelines/1 time: [1.2855 ns 1.2943 ns 1.3021 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [1.3865 ns 1.4139 ns 1.4441 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [1.5311 ns 1.5669 ns 1.6046 ns]
single_metric_multicore_scalability/nthreads/1 time: [1.1927 ns 1.1981 ns 1.2049 ns]
single_metric_multicore_scalability/nthreads/4 time: [24.346 ns 25.439 ns 26.634 ns]
single_metric_multicore_scalability/nthreads/8 time: [58.666 ns 60.137 ns 61.486 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [2.7067 ns 2.7238 ns 2.7402 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [62.723 ns 66.214 ns 69.787 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [164.24 ns 170.10 ns 175.68 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [2.2915 ns 2.2960 ns 2.3012 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [2.5726 ns 2.6158 ns 2.6624 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [2.7068 ns 2.8243 ns 2.9824 ns]
*/

View File

@@ -7,7 +7,7 @@ use http_utils::error::HttpErrorBody;
use pageserver_api::models::*;
use pageserver_api::shard::TenantShardId;
pub use reqwest::Body as ReqwestBody;
use reqwest::{Certificate, IntoUrl, Method, StatusCode, Url};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -38,9 +38,6 @@ pub enum Error {
#[error("Cancelled")]
Cancelled,
#[error("create client: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
CreateClient(reqwest::Error),
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -72,17 +69,8 @@ pub enum ForceAwaitLogicalSize {
}
impl Client {
pub fn new(
mgmt_api_endpoint: String,
jwt: Option<&str>,
ssl_ca_cert: Option<Certificate>,
) -> Result<Self> {
let mut http_client = reqwest::Client::builder();
if let Some(ssl_ca_cert) = ssl_ca_cert {
http_client = http_client.add_root_certificate(ssl_ca_cert);
}
let http_client = http_client.build().map_err(Error::CreateClient)?;
Ok(Self::from_client(http_client, mgmt_api_endpoint, jwt))
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
}
pub fn from_client(
@@ -113,10 +101,12 @@ impl Client {
debug_assert!(path.starts_with('/'));
let uri = format!("{}{}", self.mgmt_api_endpoint, path);
let mut req = self.client.request(Method::GET, uri);
if let Some(value) = &self.authorization_header {
req = req.header(reqwest::header::AUTHORIZATION, value);
}
let req = self.client.request(Method::GET, uri);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
req.send().await.map_err(Error::ReceiveBody)
}
@@ -458,21 +448,13 @@ impl Client {
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
behavior: Option<DetachBehavior>,
) -> Result<AncestorDetached> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
self.mgmt_api_endpoint
);
let mut uri = Url::parse(&uri)
.map_err(|e| Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")))?;
if let Some(behavior) = behavior {
uri.query_pairs_mut()
.append_pair("detach_behavior", &behavior.to_string());
}
self.request(Method::PUT, uri, ())
self.request(Method::PUT, &uri, ())
.await?
.json()
.await

View File

@@ -10,6 +10,7 @@ pub(crate) fn setup_logging() {
logging::init(
logging::LogFormat::Test,
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
utils::logging::OtelEnablement::Disabled,
logging::Output::Stdout,
)
.expect("Failed to init test logging");

View File

@@ -117,6 +117,7 @@ async fn main() -> anyhow::Result<()> {
logging::init(
LogFormat::Plain,
TracingErrorLayerEnablement::EnableWithRustLogFilter,
utils::logging::OtelEnablement::Disabled,
logging::Output::Stdout,
)?;

View File

@@ -15,7 +15,6 @@ hdrhistogram.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
rand.workspace = true
reqwest.workspace=true
serde.workspace = true
serde_json.workspace = true
tracing.workspace = true

View File

@@ -36,8 +36,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
None, // TODO: support ssl_ca_file for https APIs in pagebench.
)?);
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(

View File

@@ -77,8 +77,7 @@ async fn main_impl(
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
None, // TODO: support ssl_ca_file for https APIs in pagebench.
)?);
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(

Some files were not shown because too many files have changed in this diff Show More