Compare commits

..

5 Commits

Author SHA1 Message Date
Anastasia Lubennikova
c755aafe76 Restart already running rsyslog to update configuration 2025-03-05 16:55:57 +00:00
Anastasia Lubennikova
43afb9e2f4 spawn rsyslog from sysvInit 2025-03-05 16:50:31 +00:00
Anastasia Lubennikova
8d7fde3b07 Fix start rsyslog code to run for read endpoints too 2025-03-05 15:13:37 +00:00
Anastasia Lubennikova
7327e65af7 exclud pg_catalog schema from audit logging 2025-03-05 12:56:32 +00:00
Anastasia Lubennikova
56efce2249 When the ComputeAuditLogLevel is
set to 'Hipaa':

- setup and configure
pgaudit and pgauditlogtofile extensions
in compute_ctl.

- spin up a rsyslog server in the compute VM,
and configure it to send logs to the endpoint
specified in AUDIT_LOGGING_ENDPOINT env.

Change pgaudit.log default to log 'all'.
exclude postgres database from audit logging:
we consider it system database that doesn't contain any sensitive data.

- add pgaudit, pgauditlogtofile to shared_preload_libraries
if audit_log_level Hipaa is enabled

Move rsyslog config to compute_rsyslog_template.conf

Set pgaudit.log_rotation_age
2025-03-05 12:27:39 +00:00
251 changed files with 3014 additions and 9341 deletions

View File

@@ -0,0 +1,21 @@
## Release 202Y-MM-DD
**NB: this PR must be merged only by 'Create a merge commit'!**
### Checklist when preparing for release
- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b)
- [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers?
- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?
<!-- List everything that should be done **before** release, any issues / setting changes / etc -->
### Checklist after release
- [ ] Make sure instructions from PRs included in this release and labeled `manual_release_instructions` are executed (either by you or by people who wrote them).
- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files))
- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1)
- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time)
<!-- List everything that should be done **after** release, any admin UI configuration / Grafana dashboard / alert changes / setting changes / etc -->

View File

@@ -33,5 +33,3 @@ config-variables:
- NEON_PROD_AWS_ACCOUNT_ID
- AWS_ECR_REGION
- BENCHMARK_LARGE_OLTP_PROJECTID
- SLACK_ON_CALL_DEVPROD_STREAM
- SLACK_RUST_CHANNEL_ID

View File

@@ -1,16 +1,14 @@
import itertools
import json
import os
import sys
source_tag = os.getenv("SOURCE_TAG")
target_tag = os.getenv("TARGET_TAG")
branch = os.getenv("BRANCH")
dev_acr = os.getenv("DEV_ACR")
prod_acr = os.getenv("PROD_ACR")
dev_aws = os.getenv("DEV_AWS")
prod_aws = os.getenv("PROD_AWS")
aws_region = os.getenv("AWS_REGION")
build_tag = os.environ["BUILD_TAG"]
branch = os.environ["BRANCH"]
dev_acr = os.environ["DEV_ACR"]
prod_acr = os.environ["PROD_ACR"]
dev_aws = os.environ["DEV_AWS"]
prod_aws = os.environ["PROD_AWS"]
aws_region = os.environ["AWS_REGION"]
components = {
"neon": ["neon"],
@@ -41,23 +39,24 @@ registries = {
outputs: dict[str, dict[str, list[str]]] = {}
target_tags = [target_tag, "latest"] if branch == "main" else [target_tag]
target_stages = (
["dev", "prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"]
)
target_tags = [build_tag, "latest"] if branch == "main" else [build_tag]
target_stages = ["dev", "prod"] if branch.startswith("release") else ["dev"]
for component_name, component_images in components.items():
for stage in target_stages:
outputs[f"{component_name}-{stage}"] = {
f"docker.io/neondatabase/{component_image}:{source_tag}": [
f"{registry}/{component_image}:{tag}"
for registry, tag in itertools.product(registries[stage], target_tags)
if not (registry == "docker.io/neondatabase" and tag == source_tag)
outputs[f"{component_name}-{stage}"] = dict(
[
(
f"docker.io/neondatabase/{component_image}:{build_tag}",
[
f"{combo[0]}/{component_image}:{combo[1]}"
for combo in itertools.product(registries[stage], target_tags)
],
)
for component_image in component_images
]
for component_image in component_images
}
)
with open(os.getenv("GITHUB_OUTPUT", "/dev/null"), "a") as f:
with open(os.environ["GITHUB_OUTPUT"], "a") as f:
for key, value in outputs.items():
f.write(f"{key}={json.dumps(value)}\n")
print(f"Image map for {key}:\n{json.dumps(value, indent=2)}\n\n", file=sys.stderr)

View File

@@ -1,110 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
DOCS_URL="https://docs.neon.build/overview/repositories/neon.html"
message() {
if [[ -n "${GITHUB_PR_NUMBER:-}" ]]; then
gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --edit-last --body "$1" \
|| gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --body "$1"
fi
echo "$1"
}
report_error() {
message "$1
For more details, see the documentation: ${DOCS_URL}"
exit 1
}
case "$RELEASE_BRANCH" in
"release") COMPONENT="Storage" ;;
"release-proxy") COMPONENT="Proxy" ;;
"release-compute") COMPONENT="Compute" ;;
*)
report_error "Unknown release branch: ${RELEASE_BRANCH}"
;;
esac
# Identify main and release branches
MAIN_BRANCH="origin/main"
REMOTE_RELEASE_BRANCH="origin/${RELEASE_BRANCH}"
# Find merge base
MERGE_BASE=$(git merge-base "${MAIN_BRANCH}" "${REMOTE_RELEASE_BRANCH}")
echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}"
# Get the HEAD commit (last commit in PR, expected to be the merge commit)
LAST_COMMIT=$(git rev-parse HEAD)
MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}")
EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$"
if ! [[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then
report_error "Merge commit message does not match expected pattern: '<component> release YYYY-MM-DD'
Expected component: ${COMPONENT}
Found: '${MERGE_COMMIT_MESSAGE}'"
fi
echo "✅ Merge commit message is correctly formatted: '${MERGE_COMMIT_MESSAGE}'"
LAST_COMMIT_PARENTS=$(git cat-file -p "${LAST_COMMIT}" | jq -sR '[capture("parent (?<parent>[0-9a-f]{40})"; "g") | .parent]')
if [[ "$(echo "${LAST_COMMIT_PARENTS}" | jq 'length')" -ne 2 ]]; then
report_error "Last commit must be a merge commit with exactly two parents"
fi
EXPECTED_RELEASE_HEAD=$(git rev-parse "${REMOTE_RELEASE_BRANCH}")
if echo "${LAST_COMMIT_PARENTS}" | jq -e --arg rel "${EXPECTED_RELEASE_HEAD}" 'index($rel) != null' > /dev/null; then
LINEAR_HEAD=$(echo "${LAST_COMMIT_PARENTS}" | jq -r '[.[] | select(. != $rel)][0]' --arg rel "${EXPECTED_RELEASE_HEAD}")
else
report_error "Last commit must merge the release branch (${RELEASE_BRANCH})"
fi
echo "✅ Last commit correctly merges the previous commit and the release branch"
echo "Top commit of linear history: ${LINEAR_HEAD}"
MERGE_COMMIT_TREE=$(git rev-parse "${LAST_COMMIT}^{tree}")
LINEAR_HEAD_TREE=$(git rev-parse "${LINEAR_HEAD}^{tree}")
if [[ "${MERGE_COMMIT_TREE}" != "${LINEAR_HEAD_TREE}" ]]; then
report_error "Tree of merge commit (${MERGE_COMMIT_TREE}) does not match tree of linear history head (${LINEAR_HEAD_TREE})
This indicates that the merge of ${RELEASE_BRANCH} into this branch was not performed using the merge strategy 'ours'"
fi
echo "✅ Merge commit tree matches the linear history head"
EXPECTED_PREVIOUS_COMMIT="${LINEAR_HEAD}"
# Now traverse down the history, ensuring each commit has exactly one parent
CURRENT_COMMIT="${EXPECTED_PREVIOUS_COMMIT}"
while [[ "${CURRENT_COMMIT}" != "${MERGE_BASE}" && "${CURRENT_COMMIT}" != "${EXPECTED_RELEASE_HEAD}" ]]; do
CURRENT_COMMIT_PARENTS=$(git cat-file -p "${CURRENT_COMMIT}" | jq -sR '[capture("parent (?<parent>[0-9a-f]{40})"; "g") | .parent]')
if [[ "$(echo "${CURRENT_COMMIT_PARENTS}" | jq 'length')" -ne 1 ]]; then
report_error "Commit ${CURRENT_COMMIT} must have exactly one parent"
fi
NEXT_COMMIT=$(echo "${CURRENT_COMMIT_PARENTS}" | jq -r '.[0]')
if [[ "${NEXT_COMMIT}" == "${MERGE_BASE}" ]]; then
echo "✅ Reached merge base (${MERGE_BASE})"
PR_BASE="${MERGE_BASE}"
elif [[ "${NEXT_COMMIT}" == "${EXPECTED_RELEASE_HEAD}" ]]; then
echo "✅ Reached release branch (${EXPECTED_RELEASE_HEAD})"
PR_BASE="${EXPECTED_RELEASE_HEAD}"
elif [[ -z "${NEXT_COMMIT}" ]]; then
report_error "Unexpected end of commit history before reaching merge base"
fi
# Move to the next commit in the chain
CURRENT_COMMIT="${NEXT_COMMIT}"
done
echo "✅ All commits are properly ordered and linear"
echo "✅ Release PR structure is valid"
echo
message "Commits that are part of this release:
$(git log --oneline "${PR_BASE}..${LINEAR_HEAD}")"

View File

@@ -17,12 +17,6 @@
({};
.[$entry.component] |= (if . == null or $entry.version > .version then $entry else . end))
# Ensure that each component exists, or fail
| (["storage", "compute", "proxy"] - (keys)) as $missing
| if ($missing | length) > 0 then
"Error: Found no release for \($missing | join(", "))!\n" | halt_error(1)
else . end
# Convert the resulting object into an array of formatted strings
| to_entries
| map("\(.key)=\(.value.full)")

View File

@@ -7,8 +7,8 @@ on:
description: 'Component name'
required: true
type: string
source-branch:
description: 'Source branch'
release-branch:
description: 'Release branch'
required: true
type: string
secrets:
@@ -30,25 +30,17 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.source-branch }}
fetch-depth: 0
ref: main
- name: Set variables
id: vars
env:
COMPONENT_NAME: ${{ inputs.component-name }}
RELEASE_BRANCH: >-
${{
false
|| inputs.component-name == 'Storage' && 'release'
|| inputs.component-name == 'Proxy' && 'release-proxy'
|| inputs.component-name == 'Compute' && 'release-compute'
}}
RELEASE_BRANCH: ${{ inputs.release-branch }}
run: |
today=$(date +'%Y-%m-%d')
echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT}
echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT}
echo "release-branch=${RELEASE_BRANCH}" | tee -a ${GITHUB_OUTPUT}
- name: Configure git
run: |
@@ -57,36 +49,31 @@ jobs:
- name: Create RC branch
env:
RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
TITLE: ${{ steps.vars.outputs.title }}
run: |
git switch -c "${RC_BRANCH}"
git checkout -b "${RC_BRANCH}"
# Manually create a merge commit on the current branch, keeping the
# tree and setting the parents to the current HEAD and the HEAD of the
# release branch. This commit is what we'll fast-forward the release
# branch to when merging the release branch.
# For details on why, look at
# https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs
current_tree=$(git rev-parse 'HEAD^{tree}')
release_head=$(git rev-parse "origin/${RELEASE_BRANCH}")
current_head=$(git rev-parse HEAD)
merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}")
# Fast-forward the current branch to the newly created merge_commit
git merge --ff-only ${merge_commit}
# create an empty commit to distinguish workflow runs
# from other possible releases from the same commit
git commit --allow-empty -m "${TITLE}"
git push origin "${RC_BRANCH}"
- name: Create a PR into ${{ steps.vars.outputs.release-branch }}
- name: Create a PR into ${{ inputs.release-branch }}
env:
GH_TOKEN: ${{ secrets.ci-access-token }}
RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
RELEASE_BRANCH: ${{ inputs.release-branch }}
TITLE: ${{ steps.vars.outputs.title }}
run: |
cat << EOF > body.md
## ${TITLE}
**Please merge this Pull Request using 'Create a merge commit' button**
EOF
gh pr create --title "${TITLE}" \
--body "" \
--body-file "body.md" \
--head "${RC_BRANCH}" \
--base "${RELEASE_BRANCH}"

View File

@@ -19,18 +19,11 @@ on:
description: "Tag of the last compute release"
value: ${{ jobs.tags.outputs.compute }}
run-kind:
description: "The kind of run we're currently in. Will be one of `push-main`, `storage-release`, `compute-release`, `proxy-release`, `storage-rc-pr`, `compute-rc-pr`, `proxy-rc-pr`, `pr`, or `workflow-dispatch`"
description: "The kind of run we're currently in. Will be one of `pr`, `push-main`, `storage-rc`, `storage-release`, `proxy-rc`, `proxy-release`, `compute-rc`, `compute-release` or `merge_queue`"
value: ${{ jobs.tags.outputs.run-kind }}
release-pr-run-id:
description: "Only available if `run-kind in [storage-release, proxy-release, compute-release]`. Contains the run ID of the `Build and Test` workflow, assuming one with the current commit can be found."
value: ${{ jobs.tags.outputs.release-pr-run-id }}
permissions: {}
defaults:
run:
shell: bash -euo pipefail {0}
jobs:
tags:
runs-on: ubuntu-22.04
@@ -40,7 +33,6 @@ jobs:
proxy: ${{ steps.previous-releases.outputs.proxy }}
storage: ${{ steps.previous-releases.outputs.storage }}
run-kind: ${{ steps.run-kind.outputs.run-kind }}
release-pr-run-id: ${{ steps.release-pr-run-id.outputs.release-pr-run-id }}
permissions:
contents: read
steps:
@@ -63,7 +55,6 @@ jobs:
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-compute') && 'compute-rc-pr'
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr'
|| (inputs.github-event-name == 'pull_request') && 'pr'
|| (inputs.github-event-name == 'workflow_dispatch') && 'workflow-dispatch'
|| 'unknown'
}}
run: |
@@ -91,16 +82,9 @@ jobs:
echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
;;
pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr)
BUILD_AND_TEST_RUN_ID=$(gh api --paginate \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=${CURRENT_SHA}&branch=${CURRENT_BRANCH}" \
| jq '[.workflow_runs[] | select(.name == "Build and Test")][0].id // ("Error: No matching workflow run found." | halt_error(1))')
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
;;
workflow-dispatch)
echo "tag=$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT
;;
*)
echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!"
exit 1
@@ -117,13 +101,3 @@ jobs:
"/repos/${GITHUB_REPOSITORY}/releases" \
| jq -f .github/scripts/previous-releases.jq -r \
| tee -a "${GITHUB_OUTPUT}"
- name: Get the release PR run ID
id: release-pr-run-id
if: ${{ contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }}
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" | halt_error(1))')
echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT

View File

@@ -476,7 +476,7 @@ jobs:
(
!github.event.pull_request.draft
|| contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft')
|| needs.meta.outputs.run-kind == 'push-main'
|| contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind)
) && !failure() && !cancelled()
}}
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ]
@@ -487,7 +487,7 @@ jobs:
neon-image-arch:
needs: [ check-permissions, build-build-tools-image, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
strategy:
matrix:
arch: [ x64, arm64 ]
@@ -537,7 +537,7 @@ jobs:
neon-image:
needs: [ neon-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ubuntu-22.04
permissions:
id-token: write # aws-actions/configure-aws-credentials
@@ -559,7 +559,7 @@ jobs:
compute-node-image-arch:
needs: [ check-permissions, build-build-tools-image, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -651,7 +651,7 @@ jobs:
compute-node-image:
needs: [ compute-node-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -694,7 +694,7 @@ jobs:
vm-compute-node-image-arch:
needs: [ check-permissions, meta, compute-node-image ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
strategy:
fail-fast: false
@@ -747,7 +747,7 @@ jobs:
vm-compute-node-image:
needs: [ vm-compute-node-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ubuntu-22.04
strategy:
matrix:
@@ -773,12 +773,7 @@ jobs:
test-images:
needs: [ check-permissions, meta, neon-image, compute-node-image ]
# Depends on jobs that can get skipped
if: >-
${{
!failure()
&& !cancelled()
&& contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)
}}
if: "!failure() && !cancelled()"
strategy:
fail-fast: false
matrix:
@@ -805,7 +800,7 @@ jobs:
# Ensure that we don't have bad versions.
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
run: |
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
@@ -826,19 +821,19 @@ jobs:
env:
TAG: >-
${{
needs.meta.outputs.run-kind == 'compute-rc-pr'
contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.previous-storage-release
|| needs.meta.outputs.build-tag
}}
COMPUTE_TAG: >-
${{
contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.previous-compute-release
|| needs.meta.outputs.build-tag
}}
TEST_EXTENSIONS_TAG: >-
${{
contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
&& 'latest'
|| needs.meta.outputs.build-tag
}}
@@ -890,13 +885,7 @@ jobs:
id: generate
run: python3 .github/scripts/generate_image_maps.py
env:
SOURCE_TAG: >-
${{
contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.release-pr-run-id
|| needs.meta.outputs.build-tag
}}
TARGET_TAG: ${{ needs.meta.outputs.build-tag }}
BUILD_TAG: "${{ needs.meta.outputs.build-tag }}"
BRANCH: "${{ github.ref_name }}"
DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}"
PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}"
@@ -906,7 +895,7 @@ jobs:
push-neon-image-dev:
needs: [ meta, generate-image-maps, neon-image ]
if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
uses: ./.github/workflows/_push-to-container-registry.yml
permissions:
id-token: write # Required for aws/azure login
@@ -924,7 +913,7 @@ jobs:
push-compute-image-dev:
needs: [ meta, generate-image-maps, vm-compute-node-image ]
if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
uses: ./.github/workflows/_push-to-container-registry.yml
permissions:
id-token: write # Required for aws/azure login
@@ -978,55 +967,16 @@ jobs:
acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
secrets: inherit
push-neon-test-extensions-image-ghcr:
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
# This is a bit of a special case so we're not using a generated image map.
add-latest-tag-to-neon-extensions-test-image:
if: github.ref_name == 'main'
needs: [ meta, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [
"ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}"
],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [
"ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}"
]
}
secrets: inherit
add-latest-tag-to-neon-test-extensions-image:
if: ${{ needs.meta.outputs.run-kind == 'push-main' }}
needs: [ meta, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [
"docker.io/neondatabase/neon-test-extensions-v16:latest",
"ghcr.io/neondatabase/neon-test-extensions-v16:latest"
],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [
"docker.io/neondatabase/neon-test-extensions-v17:latest",
"ghcr.io/neondatabase/neon-test-extensions-v17:latest"
]
}
secrets: inherit
add-release-tag-to-neon-test-extensions-image:
if: ${{ needs.meta.outputs.run-kind == 'compute-release' }}
needs: [ meta, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.release-pr-run-id }}": [
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}",
"ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}"
],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.release-pr-run-id }}": [
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}",
"ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}"
]
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"]
}
secrets: inherit
@@ -1111,7 +1061,7 @@ jobs:
exit 1
deploy:
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
permissions:
@@ -1225,7 +1175,7 @@ jobs:
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=false \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}} \
@@ -1233,7 +1183,7 @@ jobs:
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
-f deployStorage=true \
-f deployStorageBroker=false \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}}
@@ -1281,11 +1231,11 @@ jobs:
payload: |
channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }}
text: |
🔴 <!subteam^S06CJ87UMNY|@oncall-storage>: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
promote-compatibility-data:
needs: [ meta, deploy ]
needs: [ deploy ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -1295,6 +1245,37 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR
id: fetch-last-release-pr-info
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
branch_name_and_pr_number=$(gh pr list \
--repo "${GITHUB_REPOSITORY}" \
--base release \
--state merged \
--limit 10 \
--json mergeCommit,headRefName,number \
--jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }")
branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name')
pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number')
run_id=$(gh run list \
--repo "${GITHUB_REPOSITORY}" \
--workflow build_and_test.yml \
--branch "${branch_name}" \
--json databaseId \
--limit 1 \
--jq '.[].databaseId')
last_commit_sha=$(gh pr view "${pr_number}" \
--repo "${GITHUB_REPOSITORY}" \
--json commits \
--jq '.commits[-1].oid')
echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}
- uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
@@ -1305,8 +1286,8 @@ jobs:
env:
BUCKET: neon-github-public-dev
AWS_REGION: eu-central-1
COMMIT_SHA: ${{ github.sha }}
RUN_ID: ${{ needs.meta.outputs.release-pr-run-id }}
COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }}
RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }}
run: |
old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}"
new_prefix="artifacts/latest"
@@ -1395,5 +1376,5 @@ jobs:
|| needs.files-changed.result == 'skipped'
|| (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|| (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind))
|| (needs.test-images.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|| needs.test-images.result == 'skipped'
|| (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))

View File

@@ -7,7 +7,7 @@ on:
required: false
type: string
schedule:
- cron: '0 10 * * *'
- cron: '0 0 * * *'
jobs:
cargo-deny:
@@ -50,9 +50,8 @@ jobs:
method: chat.postMessage
token: ${{ secrets.SLACK_BOT_TOKEN }}
payload: |
channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
channel: ${{ vars.SLACK_CICD_CHANNEL_ID }}
text: |
Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
Fixing the problem should be fairly straight forward from the logs. If not, <#${{ vars.SLACK_RUST_CHANNEL_ID }}> is there to help.
Pinging <!subteam^S0838JPSH32|@oncall-devprod>.
Pinging @oncall-devprod.

View File

@@ -1,36 +0,0 @@
name: Fast forward merge
on:
pull_request:
types: [labeled]
branches:
- release
- release-proxy
- release-compute
jobs:
fast-forward:
if: ${{ github.event.label.name == 'fast-forward' }}
runs-on: ubuntu-22.04
steps:
- name: Remove fast-forward label to PR
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
gh pr edit ${{ github.event.pull_request.number }} --repo "${GITHUB_REPOSITORY}" --remove-label "fast-forward"
- name: Fast forwarding
uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979
# See https://docs.github.com/en/graphql/reference/enums#mergestatestatus
if: ${{ github.event.pull_request.mergeable_state == 'clean' }}
with:
merge: true
comment: on-error
github_token: ${{ secrets.CI_ACCESS_TOKEN }}
- name: Comment if mergeable_state is not clean
if: ${{ github.event.pull_request.mergeable_state != 'clean' }}
run: |
gh pr comment ${{ github.event.pull_request.number }} \
--repo "${GITHUB_REPOSITORY}" \
--body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`."

View File

@@ -2,8 +2,8 @@ name: large oltp benchmark
on:
# uncomment to run on push for debugging your PR
#push:
# branches: [ bodobolero/synthetic_oltp_workload ]
push:
branches: [ bodobolero/synthetic_oltp_workload ]
schedule:
# * is a special character in YAML so you have to quote this string
@@ -12,7 +12,7 @@ on:
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 15 * * 0,2,4' # run on Sunday, Tuesday, Thursday at 3 PM UTC
- cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
workflow_dispatch: # adds ability to run this manually
defaults:
@@ -22,7 +22,7 @@ defaults:
concurrency:
# Allow only one workflow globally because we need dedicated resources which only exist once
group: large-oltp-bench-workflow
cancel-in-progress: false
cancel-in-progress: true
jobs:
oltp:
@@ -31,9 +31,9 @@ jobs:
matrix:
include:
- target: new_branch
custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
- target: reuse_branch
custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
permissions:
contents: write
@@ -46,6 +46,7 @@ jobs:
PG_VERSION: 16 # pre-determined by pre-determined project
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
PLATFORM: ${{ matrix.target }}
runs-on: [ self-hosted, us-east-2, x64 ]
@@ -56,10 +57,8 @@ jobs:
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
# Increase timeout to 2 days, default timeout is 6h - database maintenance can take a long time
# (normally 1h pgbench, 3h vacuum analyze 3.5h re-index) x 2 = 15h, leave some buffer for regressions
# in one run vacuum didn't finish within 12 hours
timeout-minutes: 2880
# Increase timeout to 8h, default timeout is 6h
timeout-minutes: 480
steps:
- uses: actions/checkout@v4
@@ -90,45 +89,29 @@ jobs:
- name: Set up Connection String
id: set-up-connstr
run: |
case "${{ matrix.target }}" in
new_branch)
CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
;;
reuse_branch)
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
;;
*)
echo >&2 "Unknown target=${{ matrix.target }}"
exit 1
;;
esac
case "${{ matrix.target }}" in
new_branch)
CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
;;
reuse_branch)
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
;;
*)
echo >&2 "Unknown target=${{ matrix.target }}"
exit 1
;;
esac
CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}"
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT
- name: Delete rows from prior runs in reuse branch
if: ${{ matrix.target == 'reuse_branch' }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }}
PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config
PSQL: /tmp/neon/pg_install/v16/bin/psql
PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib
run: |
echo "$(date '+%Y-%m-%d %H:%M:%S') - Deleting rows in table webhook.incoming_webhooks from prior runs"
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
${PSQL} "${BENCHMARK_CONNSTR}" -c "SET statement_timeout = 0; DELETE FROM webhook.incoming_webhooks WHERE created_at > '2025-02-27 23:59:59+00';"
echo "$(date '+%Y-%m-%d %H:%M:%S') - Finished deleting rows in table webhook.incoming_webhooks from prior runs"
- name: Benchmark pgbench with custom-scripts
- name: Benchmark pgbench with custom-scripts
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: true
extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_pgbench
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
pg_version: ${{ env.PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
@@ -136,21 +119,6 @@ jobs:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Benchmark database maintenance
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: true
extra_params: -m remote_cluster --timeout 172800 -k test_perf_oltp_large_tenant_maintenance
pg_version: ${{ env.PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Delete Neon Branch for large tenant
if: ${{ always() && matrix.target == 'new_branch' }}
uses: ./.github/actions/neon-branch-delete
@@ -159,13 +127,6 @@ jobs:
branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Configure AWS credentials # again because prior steps could have exceeded 5 hours
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}

View File

@@ -1,24 +0,0 @@
name: Lint Release PR
on:
pull_request:
branches:
- release
- release-proxy
- release-compute
jobs:
lint-release-pr:
runs-on: ubuntu-22.04
steps:
- name: Checkout PR branch
uses: actions/checkout@v4
with:
fetch-depth: 0 # Fetch full history for git operations
ref: ${{ github.event.pull_request.head.ref }}
- name: Run lint script
env:
RELEASE_BRANCH: ${{ github.base_ref }}
run: |
./.github/scripts/lint-release-pr.sh

View File

@@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# ┌───────────── hour (0 - 23)
# │ ┌───────────── day of the month (1 - 31)
# │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 */3 * * *' # Runs every 3 hours
# ┌───────────── minute (0 - 59)
# ┌───────────── hour (0 - 23)
# │ ┌───────────── day of the month (1 - 31)
# │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
workflow_dispatch: # Allows manual triggering of the workflow
inputs:
commit_hash:

View File

@@ -8,6 +8,8 @@ on:
- .github/workflows/build-build-tools-image.yml
- .github/workflows/pre-merge-checks.yml
merge_group:
branches:
- main
defaults:
run:
@@ -17,17 +19,15 @@ defaults:
permissions: {}
jobs:
meta:
get-changed-files:
runs-on: ubuntu-22.04
outputs:
python-changed: ${{ steps.python-src.outputs.any_changed }}
rust-changed: ${{ steps.rust-src.outputs.any_changed }}
branch: ${{ steps.group-metadata.outputs.branch }}
pr-number: ${{ steps.group-metadata.outputs.pr-number }}
steps:
- uses: actions/checkout@v4
- uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
id: python-src
with:
files: |
@@ -38,7 +38,7 @@ jobs:
poetry.lock
pyproject.toml
- uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
id: rust-src
with:
files: |
@@ -58,20 +58,12 @@ jobs:
echo "${PYTHON_CHANGED_FILES}"
echo "${RUST_CHANGED_FILES}"
- name: Merge group metadata
if: ${{ github.event_name == 'merge_group' }}
id: group-metadata
env:
MERGE_QUEUE_REF: ${{ github.event.merge_group.head_ref }}
run: |
echo $MERGE_QUEUE_REF | jq -Rr 'capture("refs/heads/gh-readonly-queue/(?<branch>.*)/pr-(?<pr_number>[0-9]+)-[0-9a-f]{40}") | ["branch=" + .branch, "pr-number=" + .pr_number] | .[]' | tee -a "${GITHUB_OUTPUT}"
build-build-tools-image:
if: |
false
|| needs.meta.outputs.python-changed == 'true'
|| needs.meta.outputs.rust-changed == 'true'
needs: [ meta ]
|| needs.get-changed-files.outputs.python-changed == 'true'
|| needs.get-changed-files.outputs.rust-changed == 'true'
needs: [ get-changed-files ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
# Build only one combination to save time
@@ -80,8 +72,8 @@ jobs:
secrets: inherit
check-codestyle-python:
if: needs.meta.outputs.python-changed == 'true'
needs: [ meta, build-build-tools-image ]
if: needs.get-changed-files.outputs.python-changed == 'true'
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-python.yml
with:
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
@@ -89,8 +81,8 @@ jobs:
secrets: inherit
check-codestyle-rust:
if: needs.meta.outputs.rust-changed == 'true'
needs: [ meta, build-build-tools-image ]
if: needs.get-changed-files.outputs.rust-changed == 'true'
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-rust.yml
with:
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
@@ -109,7 +101,7 @@ jobs:
statuses: write # for `github.repos.createCommitStatus(...)`
contents: write
needs:
- meta
- get-changed-files
- check-codestyle-python
- check-codestyle-rust
runs-on: ubuntu-22.04
@@ -137,20 +129,7 @@ jobs:
run: exit 1
if: |
false
|| (github.event_name == 'merge_group' && needs.meta.outputs.branch != 'main')
|| (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.python-changed == 'true')
|| (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.rust-changed == 'true')
|| (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true')
|| (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true')
|| contains(needs.*.result, 'failure')
|| contains(needs.*.result, 'cancelled')
- name: Add fast-forward label to PR to trigger fast-forward merge
if: >-
${{
always()
&& github.event_name == 'merge_group'
&& contains(fromJson('["release", "release-proxy", "release-compute"]'), needs.meta.outputs.branch)
}}
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: >-
gh pr edit ${{ needs.meta.outputs.pr-number }} --repo "${GITHUB_REPOSITORY}" --add-label "fast-forward"

View File

@@ -38,7 +38,7 @@ jobs:
uses: ./.github/workflows/_create-release-pr.yml
with:
component-name: 'Storage'
source-branch: ${{ github.ref_name }}
release-branch: 'release'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
uses: ./.github/workflows/_create-release-pr.yml
with:
component-name: 'Proxy'
source-branch: ${{ github.ref_name }}
release-branch: 'release-proxy'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -64,6 +64,6 @@ jobs:
uses: ./.github/workflows/_create-release-pr.yml
with:
component-name: 'Compute'
source-branch: ${{ github.ref_name }}
release-branch: 'release-compute'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}

View File

@@ -1,8 +1,8 @@
# Autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling
# DevProd & PerfCorr
/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness
# DevProd
/.github/ @neondatabase/developer-productivity
# Compute
/pgxn/ @neondatabase/compute

429
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -106,13 +106,13 @@ hostname = "0.4"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
http-body-util = "0.1.2"
humantime = "2.2"
humantime = "2.1"
humantime-serde = "1.1.1"
hyper0 = { package = "hyper", version = "0.14" }
hyper = "1.4"
hyper-util = "0.1"
tokio-tungstenite = "0.21.0"
indexmap = { version = "2", features = ["serde"] }
indexmap = "2"
indoc = "2"
ipnet = "2.10.0"
itertools = "0.10"
@@ -126,9 +126,7 @@ measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.9"
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
# Do not update to >= 7.0.0, at least. The update will have a significant impact
# on compute startup metrics (start_postgres_ms), >= 25% degradation.
notify = "6.0.0"
notify = "8.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
@@ -141,7 +139,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "protobuf", "protobuf-codec"] }
procfs = "0.16"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
prost = "0.13"
@@ -157,7 +155,6 @@ rpds = "0.13"
rustc-hash = "1.1.0"
rustls = { version = "0.23.16", default-features = false }
rustls-pemfile = "2"
rustls-pki-types = "1.11"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
@@ -221,7 +218,7 @@ zerocopy = { version = "0.7", features = ["derive"] }
json-structural-diff = { version = "0.2.0" }
## TODO replace this with tracing
env_logger = "0.11"
env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed

View File

@@ -1484,7 +1484,7 @@ WORKDIR /ext-src
COPY compute/patches/pg_duckdb_v031.patch .
COPY compute/patches/duckdb_v120.patch .
# pg_duckdb build requires source dir to be a git repo to get submodules
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# - extension management function duckdb.install_extension()
# - access to duckdb.extensions table and its sequence
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
@@ -1499,8 +1499,8 @@ ARG PG_VERSION
COPY --from=pg_duckdb-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_duckdb-src
RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
#########################################################################################
#
# Layer "pg_repack"
@@ -1735,8 +1735,6 @@ RUN set -e \
libevent-dev \
libtool \
pkg-config \
libcurl4-openssl-dev \
libssl-dev \
&& apt clean && rm -rf /var/lib/apt/lists/*
# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
@@ -1745,7 +1743,7 @@ RUN set -e \
&& git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
&& cd pgbouncer \
&& ./autogen.sh \
&& ./configure --prefix=/usr/local/pgbouncer \
&& ./configure --prefix=/usr/local/pgbouncer --without-openssl \
&& make -j $(nproc) dist_man_MANS= \
&& make install dist_man_MANS=
@@ -1760,15 +1758,15 @@ ARG TARGETARCH
# test_runner/regress/test_compute_metrics.py
# See comment on the top of the file regading `echo`, `-e` and `\n`
RUN if [ "$TARGETARCH" = "amd64" ]; then\
postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
else\
postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\
pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\
fi\
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\
| tar xzf - --strip-components=1 -C.\
&& curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
| tar xzf - --strip-components=1 -C.\
@@ -1982,10 +1980,12 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
# rsyslog config permissions
# directory for rsyslogd pid file
RUN mkdir /var/run/rsyslogd && \
chown -R postgres:postgres /var/run/rsyslogd && \
chown -R postgres:postgres /etc/rsyslog.d/
RUN chown postgres:postgres /etc/rsyslog.conf && \
touch /etc/compute_rsyslog.conf && \
chown -R postgres:postgres /etc/compute_rsyslog.conf && \
# directory for rsyslogd pid file
mkdir /var/run/rsyslogd && \
chown -R postgres:postgres /var/run/rsyslogd
ENV LANG=en_US.utf8

View File

@@ -29,7 +29,6 @@
import 'sql_exporter/lfc_approximate_working_set_size.libsonnet',
import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet',
import 'sql_exporter/lfc_cache_size_limit.libsonnet',
import 'sql_exporter/lfc_chunk_size.libsonnet',
import 'sql_exporter/lfc_hits.libsonnet',
import 'sql_exporter/lfc_misses.libsonnet',
import 'sql_exporter/lfc_used.libsonnet',

View File

@@ -1,5 +1 @@
SELECT sum(pg_database_size(datname)) AS total
FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2;
SELECT sum(pg_database_size(datname)) AS total FROM pg_database;

View File

@@ -1,10 +0,0 @@
{
metric_name: 'lfc_chunk_size',
type: 'gauge',
help: 'LFC chunk size, measured in 8KiB pages',
key_labels: null,
values: [
'lfc_chunk_size_pages',
],
query: importstr 'sql_exporter/lfc_chunk_size.sql',
}

View File

@@ -1 +0,0 @@
SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages';

View File

@@ -1,20 +1,10 @@
-- We export stats for 10 non-system databases. Without this limit it is too
-- easy to abuse the system by creating lots of databases.
SELECT pg_database_size(datname) AS db_size,
deadlocks,
tup_inserted AS inserted,
tup_updated AS updated,
tup_deleted AS deleted,
datname
SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted,
tup_updated AS updated, tup_deleted AS deleted, datname
FROM pg_stat_database
WHERE datname IN (
SELECT datname FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2
AND datname <> 'postgres'
AND NOT datistemplate
ORDER BY oid
LIMIT 10
WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10
);

View File

@@ -0,0 +1,265 @@
commit 00aa659afc9c7336ab81036edec3017168aabf40
Author: Heikki Linnakangas <heikki@neon.tech>
Date: Tue Nov 12 16:59:19 2024 +0200
Temporarily disable test that depends on timezone
diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out
index 23ef5fa..9e60deb 100644
--- a/ext-src/pg_anon-src/tests/expected/generalization.out
+++ b/ext-src/pg_anon-src/tests/expected/generalization.out
@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century');
["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST")
(1 row)
-SELECT anon.generalize_tstzrange('19041107','millennium');
- generalize_tstzrange
------------------------------------------------------------------
- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST")
-(1 row)
-
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
generalize_daterange
diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql
index b868344..b4fc977 100644
--- a/ext-src/pg_anon-src/tests/sql/generalization.sql
+++ b/ext-src/pg_anon-src/tests/sql/generalization.sql
@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month');
SELECT anon.generalize_tstzrange('19041107','year');
SELECT anon.generalize_tstzrange('19041107','decade');
SELECT anon.generalize_tstzrange('19041107','century');
-SELECT anon.generalize_tstzrange('19041107','millennium');
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
Author: Alexey Masterov <alexeymasterov@neon.tech>
Date: Fri May 31 06:34:26 2024 +0000
These alternative expected files were added to consider the neon features
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
new file mode 100644
index 0000000..2539cfd
--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
@@ -0,0 +1,101 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE: installing required extension "pgcrypto"
+SELECT anon.init();
+ init
+------
+ t
+(1 row)
+
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE mallory_the_masked_user;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.init();
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.anonymize_table('t1');
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ERROR: Only supersusers can start the dynamic masking engine.
+CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT * FROM mask.t1;
+ i | t
+---+---
+ 1 |
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ SELECT * FROM public.t1;
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR: Only supersusers can stop the dynamic masking engine.
+CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking
+----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR: permission denied
+DETAIL: The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
new file mode 100644
index 0000000..8b090fe
--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
@@ -0,0 +1,104 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE: installing required extension "pgcrypto"
+SELECT anon.init();
+ init
+------
+ t
+(1 row)
+
+CREATE ROLE oscar_the_owner;
+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE oscar_the_owner;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.init();
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+SELECT anon.anonymize_table('t1');
+ anonymize_table
+-----------------
+ t
+(1 row)
+
+SELECT * FROM t1;
+ i | t
+---+---
+ 1 |
+(1 row)
+
+UPDATE t1 SET t='test' WHERE i=1;
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+SELECT * FROM t1;
+ i | t
+---+------
+ 1 | test
+(1 row)
+
+--SELECT * FROM mask.t1;
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR: permission denied for schema mask
+CONTEXT: SQL statement "DROP VIEW mask.t1;"
+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
+SQL statement "SELECT anon.mask_drop_view(oid)
+ FROM pg_catalog.pg_class
+ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
+ AND relkind IN ('r','p','f')"
+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking
+----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR: permission denied
+DETAIL: The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;

View File

@@ -39,17 +39,10 @@ commands:
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
# Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also.
# We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to
# use a different path for the socket. The symlink actually points to our custom path.
- name: rsyslogd-socket-symlink
user: root
sysvInitAction: sysinit
shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log"
- name: rsyslogd
user: postgres
sysvInitAction: respawn
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shell: '/usr/sbin/rsyslogd -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
@@ -84,11 +77,7 @@ files:
# compute_ctl will rewrite this file with the actual configuration, if needed.
- filename: compute_rsyslog.conf
content: |
# Syslock.Name specifies a non-default pipe location that is writeable for the postgres user.
module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging
*.* /dev/null
$IncludeConfig /etc/rsyslog.d/*.conf
build: |
# Build cgroup-tools
#
@@ -152,11 +141,9 @@ merge: |
RUN set -e \
&& chmod 0644 /etc/cgconfig.conf
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog
RUN set -e \
&& chmod 0644 /etc/compute_rsyslog.conf
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/

View File

@@ -39,17 +39,10 @@ commands:
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
# Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also.
# We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to
# use a different path for the socket. The symlink actually points to our custom path.
- name: rsyslogd-socket-symlink
user: root
sysvInitAction: sysinit
shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log"
- name: rsyslogd
user: postgres
sysvInitAction: respawn
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shell: '/usr/sbin/rsyslogd -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
@@ -84,11 +77,7 @@ files:
# compute_ctl will rewrite this file with the actual configuration, if needed.
- filename: compute_rsyslog.conf
content: |
# Syslock.Name specifies a non-default pipe location that is writeable for the postgres user.
module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging
*.* /dev/null
$IncludeConfig /etc/rsyslog.d/*.conf
build: |
# Build cgroup-tools
#
@@ -149,9 +138,8 @@ merge: |
&& chmod 0644 /etc/cgconfig.conf
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog
RUN set -e \
&& chmod 0644 /etc/compute_rsyslog.conf
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/

View File

@@ -26,7 +26,6 @@ fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
indexmap.workspace = true
jsonwebtoken.workspace = true
metrics.workspace = true
nix.workspace = true
@@ -35,19 +34,16 @@ num_cpus.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
opentelemetry_sdk.workspace = true
p256 = { version = "0.13", features = ["pem"] }
postgres.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["json"] }
ring = "0.17"
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
spki = { version = "0.7.3", features = ["std"] }
tar.workspace = true
tower.workspace = true
tower-http.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-util.workspace = true
@@ -61,7 +57,6 @@ thiserror.workspace = true
url.workspace = true
uuid.workspace = true
walkdir.workspace = true
x509-cert = { version = "0.2.5" }
postgres_initdb.workspace = true
compute_api.workspace = true

View File

@@ -37,14 +37,10 @@ use crate::logger::startup_context_from_env;
use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
use crate::monitor::launch_monitor;
use crate::pg_helpers::*;
use crate::rsyslog::{
PostgresLogsRsyslogConfig, configure_audit_rsyslog, configure_postgres_logs_export,
launch_pgaudit_gc,
};
use crate::rsyslog::configure_and_start_rsyslog;
use crate::spec::*;
use crate::swap::resize_swap;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
use crate::tls::watch_cert_for_changes;
use crate::{config, extension_server, local_proxy};
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
@@ -116,7 +112,6 @@ pub struct ComputeNode {
// key: ext_archive_name, value: started download time, download_completed?
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
pub compute_ctl_config: ComputeCtlConfig,
}
// store some metrics about download size that might impact startup time
@@ -140,6 +135,8 @@ pub struct ComputeState {
/// passed by the control plane with a /configure HTTP request.
pub pspec: Option<ParsedSpec>,
pub compute_ctl_config: ComputeCtlConfig,
/// If the spec is passed by a /configure request, 'startup_span' is the
/// /configure request's tracing span. The main thread enters it when it
/// processes the compute startup, so that the compute startup is considered
@@ -163,6 +160,7 @@ impl ComputeState {
last_active: None,
error: None,
pspec: None,
compute_ctl_config: ComputeCtlConfig::default(),
startup_span: None,
metrics: ComputeMetrics::default(),
}
@@ -299,6 +297,79 @@ struct StartVmMonitorResult {
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String {
let roles = spec
.cluster
.roles
.iter()
.map(|r| escape_literal(&r.name))
.collect::<Vec<_>>();
let dbs = spec
.cluster
.databases
.iter()
.map(|db| escape_literal(&db.name))
.collect::<Vec<_>>();
let roles_decl = if roles.is_empty() {
String::from("roles text[] := NULL;")
} else {
format!(
r#"
roles text[] := ARRAY(SELECT rolname
FROM pg_catalog.pg_roles
WHERE rolname IN ({}));"#,
roles.join(", ")
)
};
let database_decl = if dbs.is_empty() {
String::from("dbs text[] := NULL;")
} else {
format!(
r#"
dbs text[] := ARRAY(SELECT datname
FROM pg_catalog.pg_database
WHERE datname IN ({}));"#,
dbs.join(", ")
)
};
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
let query = format!(
r#"
DO $$
DECLARE
r text;
{}
{}
BEGIN
IF NOT EXISTS (
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
IF array_length(roles, 1) IS NOT NULL THEN
EXECUTE format('GRANT neon_superuser TO %s',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
FOREACH r IN ARRAY roles LOOP
EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
END LOOP;
END IF;
IF array_length(dbs, 1) IS NOT NULL THEN
EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
END IF;
END IF;
END
$$;"#,
roles_decl, database_decl,
);
query
}
impl ComputeNode {
pub fn new(
params: ComputeNodeParams,
@@ -316,6 +387,7 @@ impl ComputeNode {
let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?;
new_state.pspec = Some(pspec);
}
new_state.compute_ctl_config = compute_ctl_config;
Ok(ComputeNode {
params,
@@ -324,7 +396,6 @@ impl ComputeNode {
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_download_progress: RwLock::new(HashMap::new()),
compute_ctl_config,
})
}
@@ -347,7 +418,7 @@ impl ComputeNode {
// requests while configuration is still in progress.
crate::http::server::Server::External {
port: this.params.external_http_port,
config: this.compute_ctl_config.clone(),
jwks: this.state.lock().unwrap().compute_ctl_config.jwks.clone(),
compute_id: this.params.compute_id.clone(),
}
.launch(&this);
@@ -526,16 +597,6 @@ impl ComputeNode {
// Collect all the tasks that must finish here
let mut pre_tasks = tokio::task::JoinSet::new();
// Make sure TLS certificates are properly loaded and in the right place.
if self.compute_ctl_config.tls.is_some() {
let this = self.clone();
pre_tasks.spawn(async move {
this.watch_cert_for_changes().await;
Ok::<(), anyhow::Error>(())
});
}
// If there are any remote extensions in shared_preload_libraries, start downloading them
if pspec.spec.remote_extensions.is_some() {
let (this, spec) = (self.clone(), pspec.spec.clone());
@@ -591,13 +652,11 @@ impl ComputeNode {
if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
info!("tuning pgbouncer");
let pgbouncer_settings = pgbouncer_settings.clone();
let tls_config = self.compute_ctl_config.tls.clone();
// Spawn a background task to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = pgbouncer_settings.clone();
let _handle = tokio::spawn(async move {
let res = tune_pgbouncer(pgbouncer_settings, tls_config).await;
let res = tune_pgbouncer(pgbouncer_settings).await;
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
// Continue with the startup anyway
@@ -620,7 +679,7 @@ impl ComputeNode {
});
}
// Configure and start rsyslog for HIPAA if necessary
// Configure and start rsyslog if necessary
if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_endpoint.is_empty() {
@@ -628,22 +687,13 @@ impl ComputeNode {
}
let log_directory_path = Path::new(&self.params.pgdata).join("log");
let log_directory_path = log_directory_path.to_string_lossy().to_string();
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
}
// Configure and start rsyslog for Postgres logs export
if self.has_feature(ComputeFeature::PostgresLogsExport) {
if let Some(ref project_id) = pspec.spec.cluster.cluster_id {
let host = PostgresLogsRsyslogConfig::default_host(project_id);
let conf = PostgresLogsRsyslogConfig::new(Some(&host));
configure_postgres_logs_export(conf)?;
} else {
warn!("not configuring rsyslog for Postgres logs export: project ID is missing")
}
// TODO: make this more robust
// now rsyslog starts once and there is no monitoring or restart if it fails
configure_and_start_rsyslog(
log_directory_path.to_str().unwrap(),
"hipaa",
&remote_endpoint,
)?;
}
// Launch remaining service threads
@@ -668,9 +718,9 @@ impl ComputeNode {
if pspec.spec.mode == ComputeMode::Primary {
self.configure_as_primary(&compute_state)?;
let conf = self.get_tokio_conn_conf(None);
tokio::task::spawn(async {
let res = get_installed_extensions(conf).await;
let conf = self.get_conn_conf(None);
tokio::task::spawn_blocking(|| {
let res = get_installed_extensions(conf);
match res {
Ok(extensions) => {
info!(
@@ -1128,10 +1178,9 @@ impl ComputeNode {
// Remove/create an empty pgdata directory and put configuration there.
self.create_pgdata()?;
config::write_postgres_conf(
pgdata_path,
&pgdata_path.join("postgresql.conf"),
&pspec.spec,
self.params.internal_http_port,
&self.compute_ctl_config.tls,
)?;
// Syncing safekeepers is only safe with primary nodes: if a primary
@@ -1513,13 +1562,11 @@ impl ComputeNode {
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
info!("tuning pgbouncer");
let pgbouncer_settings = pgbouncer_settings.clone();
let tls_config = self.compute_ctl_config.tls.clone();
// Spawn a background task to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = pgbouncer_settings.clone();
tokio::spawn(async move {
let res = tune_pgbouncer(pgbouncer_settings, tls_config).await;
let res = tune_pgbouncer(pgbouncer_settings).await;
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
}
@@ -1531,8 +1578,7 @@ impl ComputeNode {
// Spawn a background task to do the configuration,
// so that we don't block the main thread that starts Postgres.
let mut local_proxy = local_proxy.clone();
local_proxy.tls = self.compute_ctl_config.tls.clone();
let local_proxy = local_proxy.clone();
tokio::spawn(async move {
if let Err(err) = local_proxy::configure(&local_proxy) {
error!("error while configuring local_proxy: {err:?}");
@@ -1542,12 +1588,8 @@ impl ComputeNode {
// Write new config
let pgdata_path = Path::new(&self.params.pgdata);
config::write_postgres_conf(
pgdata_path,
&spec,
self.params.internal_http_port,
&self.compute_ctl_config.tls,
)?;
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
config::write_postgres_conf(&postgresql_conf_path, &spec, self.params.internal_http_port)?;
if !spec.skip_pg_catalog_updates {
let max_concurrent_connections = spec.reconfigure_concurrency;
@@ -1618,56 +1660,6 @@ impl ComputeNode {
Ok(())
}
pub async fn watch_cert_for_changes(self: Arc<Self>) {
// update status on cert renewal
if let Some(tls_config) = &self.compute_ctl_config.tls {
let tls_config = tls_config.clone();
// wait until the cert exists.
let mut cert_watch = watch_cert_for_changes(tls_config.cert_path.clone()).await;
tokio::task::spawn_blocking(move || {
let handle = tokio::runtime::Handle::current();
'cert_update: loop {
// let postgres/pgbouncer/local_proxy know the new cert/key exists.
// we need to wait until it's configurable first.
let mut state = self.state.lock().unwrap();
'status_update: loop {
match state.status {
// let's update the state to config pending
ComputeStatus::ConfigurationPending | ComputeStatus::Running => {
state.set_status(
ComputeStatus::ConfigurationPending,
&self.state_changed,
);
break 'status_update;
}
// exit loop
ComputeStatus::Failed
| ComputeStatus::TerminationPending
| ComputeStatus::Terminated => break 'cert_update,
// wait
ComputeStatus::Init
| ComputeStatus::Configuration
| ComputeStatus::Empty => {
state = self.state_changed.wait(state).unwrap();
}
}
}
drop(state);
// wait for a new certificate update
if handle.block_on(cert_watch.changed()).is_err() {
break;
}
}
});
}
}
/// Update the `last_active` in the shared state, but ensure that it's a more recent one.
pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
let mut state = self.state.lock().unwrap();

View File

@@ -6,13 +6,11 @@ use std::io::Write;
use std::io::prelude::*;
use std::path::Path;
use compute_api::responses::TlsConfig;
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption};
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
use crate::pg_helpers::{
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
};
use crate::tls::{self, SERVER_CRT, SERVER_KEY};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -40,12 +38,10 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
/// Create or completely rewrite configuration file specified by `path`
pub fn write_postgres_conf(
pgdata_path: &Path,
path: &Path,
spec: &ComputeSpec,
extension_server_port: u16,
tls_config: &Option<TlsConfig>,
) -> Result<()> {
let path = pgdata_path.join("postgresql.conf");
// File::create() destroys the file content if it exists.
let mut file = File::create(path)?;
@@ -90,20 +86,6 @@ pub fn write_postgres_conf(
)?;
}
// tls
if let Some(tls_config) = tls_config {
writeln!(file, "ssl = on")?;
// postgres requires the keyfile to be in a secure file,
// currently too complicated to ensure that at the VM level,
// so we just copy them to another file instead. :shrug:
tls::update_key_path_blocking(pgdata_path, tls_config);
// these are the default, but good to be explicit.
writeln!(file, "ssl_cert_file = '{}'", SERVER_CRT)?;
writeln!(file, "ssl_key_file = '{}'", SERVER_KEY)?;
}
// Locales
if cfg!(target_os = "macos") {
writeln!(file, "lc_messages='C'")?;
@@ -167,8 +149,7 @@ pub fn write_postgres_conf(
writeln!(file, "# Managed by compute_ctl audit settings: begin")?;
// This log level is very verbose
// but this is necessary for HIPAA compliance.
// Exclude 'misc' category, because it doesn't contain anythig relevant.
writeln!(file, "pgaudit.log='all, -misc'")?;
writeln!(file, "pgaudit.log='all'")?;
writeln!(file, "pgaudit.log_parameter=on")?;
// Disable logging of catalog queries
// The catalog doesn't contain sensitive data, so we don't need to audit it.
@@ -216,12 +197,6 @@ pub fn write_postgres_conf(
writeln!(file, "neon.disable_logical_replication_subscribers=false")?;
}
// We need Postgres to send logs to rsyslog so that we can forward them
// further to customers' log aggregation systems.
if spec.features.contains(&ComputeFeature::PostgresLogsExport) {
writeln!(file, "log_destination='stderr,syslog'")?;
}
// This is essential to keep this line at the end of the file,
// because it is intended to override any settings above.
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;

View File

@@ -1,10 +0,0 @@
# Program name comes from postgres' syslog_facility configuration: https://www.postgresql.org/docs/current/runtime-config-logging.html#GUC-SYSLOG-IDENT
# Default value is 'postgres'.
if $programname == 'postgres' then {{
# Forward Postgres logs to telemetry otel collector
action(type="omfwd" target="{logs_export_target}" port="{logs_export_port}" protocol="tcp"
template="RSYSLOG_SyslogProtocol23Format"
action.resumeRetryCount="3"
queue.type="linkedList" queue.size="1000")
stop
}}

View File

@@ -4,8 +4,7 @@ module(load="imfile")
# Input configuration for log files in the specified directory
# Replace {log_directory} with the directory containing the log files
input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
# the directory to store rsyslog state files
global(workDirectory="/var/log/rsyslog")
global(workDirectory="/var/log")
# Forward logs to remote syslog server
*.* @@{remote_endpoint}
*.* @@{remote_endpoint}

View File

@@ -202,24 +202,8 @@ pub async fn download_extension(
// move contents of the libdir / sharedir in unzipped archive to the correct local paths
for paths in [sharedir_paths, libdir_paths] {
let (zip_dir, real_dir) = paths;
let dir = match std::fs::read_dir(&zip_dir) {
Ok(dir) => dir,
Err(e) => match e.kind() {
// In the event of a SQL-only extension, there would be nothing
// to move from the lib/ directory, so note that in the log and
// move on.
std::io::ErrorKind::NotFound => {
info!("nothing to move from {}", zip_dir);
continue;
}
_ => return Err(anyhow::anyhow!(e)),
},
};
info!("mv {zip_dir:?}/* {real_dir:?}");
for file in dir {
for file in std::fs::read_dir(zip_dir)? {
let old_file = file?.path();
let new_file =
Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
@@ -269,31 +253,27 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
}
}
// Do request to extension storage proxy, e.g.,
// Do request to extension storage proxy, i.e.
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
// using HTTP GET and return the response body as bytes.
// using HHTP GET
// and return the response body as bytes
//
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
let uri = format!("{}/{}", ext_remote_storage, ext_path);
let filename = Path::new(ext_path)
.file_name()
.unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
.to_str()
.unwrap_or("unknown")
.to_string();
info!("Downloading extension file '{}' from uri {}", filename, uri);
info!("Download extension {} from uri {}", ext_path, uri);
match do_extension_server_request(&uri).await {
Ok(resp) => {
info!("Successfully downloaded remote extension data {}", ext_path);
REMOTE_EXT_REQUESTS_TOTAL
.with_label_values(&[&StatusCode::OK.to_string(), &filename])
.with_label_values(&[&StatusCode::OK.to_string()])
.inc();
Ok(resp)
}
Err((msg, status)) => {
REMOTE_EXT_REQUESTS_TOTAL
.with_label_values(&[&status, &filename])
.with_label_values(&[&status])
.inc();
bail!(msg);
}

View File

@@ -1,2 +1 @@
pub(in crate::http) mod authorize;
pub(in crate::http) mod request_id;

View File

@@ -1,16 +0,0 @@
use axum::{extract::Request, middleware::Next, response::Response};
use uuid::Uuid;
use crate::http::headers::X_REQUEST_ID;
/// This middleware function allows compute_ctl to generate its own request ID
/// if one isn't supplied. The control plane will always send one as a UUID. The
/// neon Postgres extension on the other hand does not send one.
pub async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
let headers = request.headers_mut();
if !headers.contains_key(X_REQUEST_ID) {
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
}
next.run(request).await
}

View File

@@ -306,36 +306,6 @@ paths:
schema:
$ref: "#/components/schemas/GenericError"
/configure_telemetry:
post:
tags:
- Configure
summary: Configure rsyslog
description: |
This API endpoint configures rsyslog to forward Postgres logs
to a specified otel collector.
operationId: configureTelemetry
requestBody:
required: true
content:
application/json:
schema:
type: object
properties:
logs_export_host:
type: string
description: |
Hostname and the port of the otel collector. Leave empty to disable logs forwarding.
Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526
responses:
204:
description: "Telemetry configured successfully"
500:
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:
JWT:

View File

@@ -1,11 +1,9 @@
use std::sync::Arc;
use axum::body::Body;
use axum::extract::State;
use axum::response::Response;
use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
use compute_api::spec::ComputeFeature;
use http::StatusCode;
use tokio::task;
use tracing::info;
@@ -13,7 +11,6 @@ use tracing::info;
use crate::compute::{ComputeNode, ParsedSpec};
use crate::http::JsonResponse;
use crate::http::extract::Json;
use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export};
// Accept spec in JSON format and request compute configuration. If anything
// goes wrong after we set the compute status to `ConfigurationPending` and
@@ -95,25 +92,3 @@ pub(in crate::http) async fn configure(
JsonResponse::success(StatusCode::OK, body)
}
pub(in crate::http) async fn configure_telemetry(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigureTelemetryRequest>,
) -> Response {
if !compute.has_feature(ComputeFeature::PostgresLogsExport) {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"Postgres logs export feature is not enabled".to_string(),
);
}
let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref());
if let Err(err) = configure_postgres_logs_export(conf) {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string());
}
Response::builder()
.status(StatusCode::NO_CONTENT)
.body(Body::from(""))
.unwrap()
}

View File

@@ -5,19 +5,20 @@ use std::time::Duration;
use anyhow::Result;
use axum::Router;
use axum::middleware::{self};
use axum::response::IntoResponse;
use axum::extract::Request;
use axum::middleware::{self, Next};
use axum::response::{IntoResponse, Response};
use axum::routing::{get, post};
use compute_api::responses::ComputeCtlConfig;
use http::StatusCode;
use jsonwebtoken::jwk::JwkSet;
use tokio::net::TcpListener;
use tower::ServiceBuilder;
use tower_http::{
auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer,
};
use tracing::{Span, error, info};
use uuid::Uuid;
use super::middleware::request_id::maybe_add_request_id_header;
use super::{
headers::X_REQUEST_ID,
middleware::authorize::Authorize,
@@ -41,7 +42,7 @@ pub enum Server {
},
External {
port: u16,
config: ComputeCtlConfig,
jwks: JwkSet,
compute_id: String,
},
}
@@ -79,7 +80,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
router
}
Server::External {
config, compute_id, ..
jwks, compute_id, ..
} => {
let unauthenticated_router =
Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
@@ -87,7 +88,6 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/configure_telemetry", post(configure::configure_telemetry))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))
@@ -96,7 +96,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
.route("/terminate", post(terminate::terminate))
.layer(AsyncRequireAuthorizationLayer::new(Authorize::new(
compute_id.clone(),
config.jwks.clone(),
jwks.clone(),
)));
router
@@ -219,3 +219,15 @@ impl Server {
tokio::spawn(self.serve(state));
}
}
/// This middleware function allows compute_ctl to generate its own request ID
/// if one isn't supplied. The control plane will always send one as a UUID. The
/// neon Postgres extension on the other hand does not send one.
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
let headers = request.headers_mut();
if headers.get(X_REQUEST_ID).is_none() {
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
}
next.run(request).await
}

View File

@@ -2,7 +2,7 @@ use std::collections::HashMap;
use anyhow::Result;
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use tokio_postgres::{Client, Config, NoTls};
use postgres::{Client, NoTls};
use crate::metrics::INSTALLED_EXTENSIONS;
@@ -10,7 +10,7 @@ use crate::metrics::INSTALLED_EXTENSIONS;
/// and to make database listing query here more explicit.
///
/// Limit the number of databases to 500 to avoid excessive load.
async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
// `pg_database.datconnlimit = -2` means that the database is in the
// invalid state
let databases = client
@@ -20,8 +20,7 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
AND datconnlimit <> - 2
LIMIT 500",
&[],
)
.await?
)?
.iter()
.map(|row| {
let db: String = row.get("datname");
@@ -37,36 +36,20 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
/// Same extension can be installed in multiple databases with different versions,
/// so we report a separate metric (number of databases where it is installed)
/// for each extension version.
pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> {
pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
conf.application_name("compute_ctl:get_installed_extensions");
let databases: Vec<String> = {
let (mut client, connection) = conf.connect(NoTls).await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
list_dbs(&mut client).await?
};
let mut client = conf.connect(NoTls)?;
let databases: Vec<String> = list_dbs(&mut client)?;
let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new();
for db in databases.iter() {
conf.dbname(db);
let (client, connection) = conf.connect(NoTls).await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
let extensions: Vec<(String, String, i32)> = client
let mut db_client = conf.connect(NoTls)?;
let extensions: Vec<(String, String, i32)> = db_client
.query(
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
&[],
)
.await?
)?
.iter()
.map(|row| {
(

View File

@@ -26,4 +26,3 @@ pub mod spec;
mod spec_apply;
pub mod swap;
pub mod sync_sk;
pub mod tls;

View File

@@ -24,8 +24,7 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
.with_writer(std::io::stderr);
// Initialize OpenTelemetry
let otlp_layer =
tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default()).await;
let otlp_layer = tracing_utils::init_tracing("compute_ctl").await;
// Put it all together
tracing_subscriber::registry()

View File

@@ -1,8 +1,6 @@
use metrics::core::{AtomicF64, Collector, GenericGauge};
use metrics::core::Collector;
use metrics::proto::MetricFamily;
use metrics::{
IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec,
};
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
use once_cell::sync::Lazy;
pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
@@ -56,16 +54,9 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(||
register_int_counter_vec!(
"compute_ctl_remote_ext_requests_total",
"Total number of requests made by compute_ctl to download extensions from S3 proxy by status",
&["http_status", "filename"]
)
.expect("failed to define a metric")
});
// Size of audit log directory in bytes
pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy<GenericGauge<AtomicF64>> = Lazy::new(|| {
register_gauge!(
"compute_audit_log_dir_size",
"Size of audit log directory in bytes",
// Do not use any labels like extension name yet.
// We can add them later if needed.
&["http_status"]
)
.expect("failed to define a metric")
});
@@ -75,6 +66,5 @@ pub fn collect() -> Vec<MetricFamily> {
metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
metrics.extend(DB_MIGRATION_FAILED.collect());
metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
metrics
}

View File

@@ -10,10 +10,8 @@ use std::str::FromStr;
use std::time::{Duration, Instant};
use anyhow::{Result, bail};
use compute_api::responses::TlsConfig;
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
use futures::StreamExt;
use indexmap::IndexMap;
use ini::Ini;
use notify::{RecursiveMode, Watcher};
use postgres::config::Config;
@@ -188,40 +186,15 @@ impl DatabaseExt for Database {
/// Postgres SQL queries and DATABASE_URL.
pub trait Escaping {
fn pg_quote(&self) -> String;
fn pg_quote_dollar(&self) -> (String, String);
}
impl Escaping for PgIdent {
/// This is intended to mimic Postgres quote_ident(), but for simplicity it
/// always quotes provided string with `""` and escapes every `"`.
/// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
/// N.B. it's not useful for escaping identifiers that are used inside WHERE
/// clause, use `escape_literal()` instead.
fn pg_quote(&self) -> String {
format!("\"{}\"", self.replace('"', "\"\""))
}
/// This helper is intended to be used for dollar-escaping strings for usage
/// inside PL/pgSQL procedures. In addition to dollar-escaping the string,
/// it also returns a tag that is intended to be used inside the outer
/// PL/pgSQL procedure. If you do not need an outer tag, just discard it.
/// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`,
/// <https://github.com/postgres/postgres/blob/8b49392b270b4ac0b9f5c210e2a503546841e832/src/backend/utils/adt/ruleutils.c#L2924>
fn pg_quote_dollar(&self) -> (String, String) {
let mut tag: String = "x".to_string();
let mut outer_tag = "xx".to_string();
// Find the first suitable tag that is not present in the string.
// Postgres' max role/DB name length is 63 bytes, so even in the
// worst case it won't take long.
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
tag += "x";
outer_tag = tag.clone() + "x";
}
let escaped = format!("${tag}${self}${tag}$");
(escaped, outer_tag)
let result = format!("\"{}\"", self.replace('"', "\"\""));
result
}
}
@@ -253,13 +226,10 @@ pub async fn get_existing_dbs_async(
// invalid state. See:
// https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
let rowstream = client
// We use a subquery instead of a fancy `datdba::regrole::text AS owner`,
// because the latter automatically wraps the result in double quotes,
// if the role name contains special characters.
.query_raw::<str, &String, &[String; 0]>(
"SELECT
datname AS name,
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
datdba::regrole::text AS owner,
NOT datallowconn AS restrict_conn,
datconnlimit = - 2 AS invalid
FROM
@@ -408,7 +378,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
/// Update pgbouncer.ini with provided options
fn update_pgbouncer_ini(
pgbouncer_config: IndexMap<String, String>,
pgbouncer_config: HashMap<String, String>,
pgbouncer_ini_path: &str,
) -> Result<()> {
let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
@@ -429,10 +399,7 @@ fn update_pgbouncer_ini(
/// Tune pgbouncer.
/// 1. Apply new config using pgbouncer admin console
/// 2. Add new values to pgbouncer.ini to preserve them after restart
pub async fn tune_pgbouncer(
mut pgbouncer_config: IndexMap<String, String>,
tls_config: Option<TlsConfig>,
) -> Result<()> {
pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
// for VMs use pgbouncer specific way to connect to
// pgbouncer admin console without password
@@ -478,21 +445,19 @@ pub async fn tune_pgbouncer(
}
};
if let Some(tls_config) = tls_config {
// pgbouncer starts in a half-ok state if it cannot find these files.
// It will default to client_tls_sslmode=deny, which causes proxy to error.
// There is a small window at startup where these files don't yet exist in the VM.
// Best to wait until it exists.
loop {
if let Ok(true) = tokio::fs::try_exists(&tls_config.key_path).await {
break;
}
tokio::time::sleep(Duration::from_millis(500)).await
}
// Apply new config
for (option_name, value) in pgbouncer_config.iter() {
let query = format!("SET {}={}", option_name, value);
// keep this log line for debugging purposes
info!("Applying pgbouncer setting change: {}", query);
pgbouncer_config.insert("client_tls_cert_file".to_string(), tls_config.cert_path);
pgbouncer_config.insert("client_tls_key_file".to_string(), tls_config.key_path);
pgbouncer_config.insert("client_tls_sslmode".to_string(), "allow".to_string());
if let Err(err) = client.simple_query(&query).await {
// Don't fail on error, just print it into log
error!(
"Failed to apply pgbouncer setting change: {}, {}",
query, err
);
};
}
// save values to pgbouncer.ini
@@ -508,13 +473,6 @@ pub async fn tune_pgbouncer(
};
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
info!("Applying pgbouncer setting change");
if let Err(err) = client.simple_query("RELOAD").await {
// Don't fail on error, just print it into log
error!("Failed to apply pgbouncer setting change, {err}",);
};
Ok(())
}

View File

@@ -1,14 +1,8 @@
use std::fs;
use std::io::ErrorKind;
use std::path::Path;
use std::process::Command;
use std::time::Duration;
use std::{fs::OpenOptions, io::Write};
use anyhow::{Context, Result, anyhow};
use tracing::{error, info, instrument, warn};
const POSTGRES_LOGS_CONF_PATH: &str = "/etc/rsyslog.d/postgres_logs.conf";
use anyhow::{Context, Result};
use tracing::info;
fn get_rsyslog_pid() -> Option<String> {
let output = Command::new("pgrep")
@@ -27,7 +21,8 @@ fn get_rsyslog_pid() -> Option<String> {
}
}
// Restart rsyslogd to apply the new configuration.
// Start rsyslogd with the specified configuration file
// If it is already running - restart it.
// This is necessary, because there is no other way to reload the rsyslog configuration.
//
// Rsyslogd shouldn't lose any messages, because of the restart,
@@ -35,26 +30,42 @@ fn get_rsyslog_pid() -> Option<String> {
// and will continue reading from that position.
// TODO: test it properly
//
fn restart_rsyslog() -> Result<()> {
let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
info!("rsyslogd is running with pid: {}, restart it", old_pid);
fn start_rsyslog(rsyslog_conf_path: &str) -> Result<()> {
let old_pid = get_rsyslog_pid();
if let Some(pid) = old_pid {
info!("rsyslogd is already running with pid: {}, restart it", pid);
// kill it to restart
let _ = Command::new("pkill")
.arg("rsyslogd")
.output()
.context("Failed to stop rsyslogd")?;
}
// kill it to restart
let _ = Command::new("pkill")
.arg("rsyslogd")
let _ = Command::new("/usr/sbin/rsyslogd")
.arg("-f")
.arg(rsyslog_conf_path)
.arg("-i")
.arg("/var/run/rsyslogd/rsyslogd.pid")
.output()
.context("Failed to stop rsyslogd")?;
.context("Failed to start rsyslogd")?;
// Check that rsyslogd is running
if let Some(pid) = get_rsyslog_pid() {
info!("rsyslogd started successfully with pid: {}", pid);
} else {
return Err(anyhow::anyhow!("Failed to start rsyslogd"));
}
Ok(())
}
pub fn configure_audit_rsyslog(
log_directory: String,
pub fn configure_and_start_rsyslog(
log_directory: &str,
tag: &str,
remote_endpoint: &str,
) -> Result<()> {
let config_content: String = format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
include_str!("config_template/compute_rsyslog_template.conf"),
log_directory = log_directory,
tag = tag,
remote_endpoint = remote_endpoint
@@ -62,7 +73,7 @@ pub fn configure_audit_rsyslog(
info!("rsyslog config_content: {}", config_content);
let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf";
let rsyslog_conf_path = "/etc/compute_rsyslog.conf";
let mut file = OpenOptions::new()
.create(true)
.write(true)
@@ -71,206 +82,10 @@ pub fn configure_audit_rsyslog(
file.write_all(config_content.as_bytes())?;
info!(
"rsyslog configuration file {} added successfully. Starting rsyslogd",
rsyslog_conf_path
);
info!("rsyslog configuration added successfully. Starting rsyslogd");
// start the service, using the configuration
restart_rsyslog()?;
start_rsyslog(rsyslog_conf_path)?;
Ok(())
}
/// Configuration for enabling Postgres logs forwarding from rsyslogd
pub struct PostgresLogsRsyslogConfig<'a> {
pub host: Option<&'a str>,
}
impl<'a> PostgresLogsRsyslogConfig<'a> {
pub fn new(host: Option<&'a str>) -> Self {
Self { host }
}
pub fn build(&self) -> Result<String> {
match self.host {
Some(host) => {
if let Some((target, port)) = host.split_once(":") {
Ok(format!(
include_str!(
"config_template/compute_rsyslog_postgres_export_template.conf"
),
logs_export_target = target,
logs_export_port = port,
))
} else {
Err(anyhow!("Invalid host format for Postgres logs export"))
}
}
None => Ok("".to_string()),
}
}
fn current_config() -> Result<String> {
let config_content = match std::fs::read_to_string(POSTGRES_LOGS_CONF_PATH) {
Ok(c) => c,
Err(err) if err.kind() == ErrorKind::NotFound => String::new(),
Err(err) => return Err(err.into()),
};
Ok(config_content)
}
/// Returns the default host for otel collector that receives Postgres logs
pub fn default_host(project_id: &str) -> String {
format!(
"config-{}-collector.neon-telemetry.svc.cluster.local:10514",
project_id
)
}
}
pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> {
let new_config = conf.build()?;
let current_config = PostgresLogsRsyslogConfig::current_config()?;
if new_config == current_config {
info!("postgres logs rsyslog configuration is up-to-date");
return Ok(());
}
// When new config is empty we can simply remove the configuration file.
if new_config.is_empty() {
info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH);
match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) {
Ok(_) => {}
Err(err) if err.kind() == ErrorKind::NotFound => {}
Err(err) => return Err(err.into()),
}
restart_rsyslog()?;
return Ok(());
}
info!(
"configuring rsyslog for postgres logs export to: {:?}",
conf.host
);
let mut file = OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(POSTGRES_LOGS_CONF_PATH)?;
file.write_all(new_config.as_bytes())?;
info!(
"rsyslog configuration file {} added successfully. Starting rsyslogd",
POSTGRES_LOGS_CONF_PATH
);
restart_rsyslog()?;
Ok(())
}
#[instrument(skip_all)]
async fn pgaudit_gc_main_loop(log_directory: String) -> Result<()> {
info!("running pgaudit GC main loop");
loop {
// Check log_directory for old pgaudit logs and delete them.
// New log files are checked every 5 minutes, as set in pgaudit.log_rotation_age
// Find files that were not modified in the last 15 minutes and delete them.
// This should be enough time for rsyslog to process the logs and for us to catch the alerts.
//
// In case of a very high load, we might need to adjust this value and pgaudit.log_rotation_age.
//
// TODO: add some smarter logic to delete the files that are fully streamed according to rsyslog
// imfile-state files, but for now just do a simple GC to avoid filling up the disk.
let _ = Command::new("find")
.arg(&log_directory)
.arg("-name")
.arg("audit*.log")
.arg("-mmin")
.arg("+15")
.arg("-delete")
.output()?;
// also collect the metric for the size of the log directory
async fn get_log_files_size(path: &Path) -> Result<u64> {
let mut total_size = 0;
for entry in fs::read_dir(path)? {
let entry = entry?;
let entry_path = entry.path();
if entry_path.is_file() && entry_path.to_string_lossy().ends_with("log") {
total_size += entry.metadata()?.len();
}
}
Ok(total_size)
}
let log_directory_size = get_log_files_size(Path::new(&log_directory))
.await
.unwrap_or_else(|e| {
warn!("Failed to get log directory size: {}", e);
0
});
crate::metrics::AUDIT_LOG_DIR_SIZE.set(log_directory_size as f64);
tokio::time::sleep(Duration::from_secs(60)).await;
}
}
// launch pgaudit GC thread to clean up the old pgaudit logs stored in the log_directory
pub fn launch_pgaudit_gc(log_directory: String) {
tokio::spawn(async move {
if let Err(e) = pgaudit_gc_main_loop(log_directory).await {
error!("pgaudit GC main loop failed: {}", e);
}
});
}
#[cfg(test)]
mod tests {
use crate::rsyslog::PostgresLogsRsyslogConfig;
#[test]
fn test_postgres_logs_config() {
{
// Verify empty config
let conf = PostgresLogsRsyslogConfig::new(None);
let res = conf.build();
assert!(res.is_ok());
let conf_str = res.unwrap();
assert_eq!(&conf_str, "");
}
{
// Verify config
let conf = PostgresLogsRsyslogConfig::new(Some("collector.cvc.local:514"));
let res = conf.build();
assert!(res.is_ok());
let conf_str = res.unwrap();
assert!(conf_str.contains("omfwd"));
assert!(conf_str.contains(r#"target="collector.cvc.local""#));
assert!(conf_str.contains(r#"port="514""#));
}
{
// Verify invalid config
let conf = PostgresLogsRsyslogConfig::new(Some("invalid"));
let res = conf.build();
assert!(res.is_err());
}
{
// Verify config with default host
let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123");
let conf = PostgresLogsRsyslogConfig::new(Some(&host));
let res = conf.build();
assert!(res.is_ok());
let conf_str = res.unwrap();
assert!(conf_str.contains(r#"shy-breeze-123"#));
assert!(conf_str.contains(r#"port="10514""#));
}
}
}

View File

@@ -8,12 +8,13 @@ use compute_api::responses::{
use compute_api::spec::ComputeSpec;
use reqwest::StatusCode;
use tokio_postgres::Client;
use tracing::{error, info, instrument};
use tracing::{error, info, instrument, warn};
use crate::config;
use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS};
use crate::migration::MigrationRunner;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
// Do control plane request and return response if any. In case of error it
// returns a bool flag indicating whether it makes sense to retry the request
@@ -211,3 +212,122 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
Ok(())
}
/// Connect to the database as superuser and pre-create anon extension
/// if it is present in shared_preload_libraries
#[instrument(skip_all)]
pub async fn handle_extension_anon(
spec: &ComputeSpec,
db_owner: &str,
db_client: &mut Client,
grants_only: bool,
) -> Result<()> {
info!("handle extension anon");
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
if libs.contains("anon") {
if !grants_only {
// check if extension is already initialized using anon.is_initialized()
let query = "SELECT anon.is_initialized()";
match db_client.query(query, &[]).await {
Ok(rows) => {
if !rows.is_empty() {
let is_initialized: bool = rows[0].get(0);
if is_initialized {
info!("anon extension is already initialized");
return Ok(());
}
}
}
Err(e) => {
warn!(
"anon extension is_installed check failed with expected error: {}",
e
);
}
};
// Create anon extension if this compute needs it
// Users cannot create it themselves, because superuser is required.
let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE";
info!("creating anon extension with query: {}", query);
match db_client.query(query, &[]).await {
Ok(_) => {}
Err(e) => {
error!("anon extension creation failed with error: {}", e);
return Ok(());
}
}
// check that extension is installed
query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
let rows = db_client.query(query, &[]).await?;
if rows.is_empty() {
error!("anon extension is not installed");
return Ok(());
}
// Initialize anon extension
// This also requires superuser privileges, so users cannot do it themselves.
query = "SELECT anon.init()";
match db_client.query(query, &[]).await {
Ok(_) => {}
Err(e) => {
error!("anon.init() failed with error: {}", e);
return Ok(());
}
}
}
// check that extension is installed, if not bail early
let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
match db_client.query(query, &[]).await {
Ok(rows) => {
if rows.is_empty() {
error!("anon extension is not installed");
return Ok(());
}
}
Err(e) => {
error!("anon extension check failed with error: {}", e);
return Ok(());
}
};
let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query).await?;
// Grant permissions to db_owner to use anon extension functions
let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query).await?;
// This is needed, because some functions are defined as SECURITY DEFINER.
// In Postgres SECURITY DEFINER functions are executed with the privileges
// of the owner.
// In anon extension this it is needed to access some GUCs, which are only accessible to
// superuser. But we've patched postgres to allow db_owner to access them as well.
// So we need to change owner of these functions to db_owner.
let query = format!("
SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};'
from pg_proc p
join pg_namespace nsp ON p.pronamespace = nsp.oid
where nsp.nspname = 'anon';", db_owner);
info!("change anon extension functions owner to db owner");
db_client.simple_query(&query).await?;
// affects views as well
let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query).await?;
let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query).await?;
}
}
Ok(())
}

View File

@@ -6,27 +6,26 @@ use std::sync::Arc;
use anyhow::{Context, Result};
use compute_api::responses::ComputeStatus;
use compute_api::spec::{ComputeAudit, ComputeSpec, Database, PgIdent, Role};
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role};
use futures::future::join_all;
use tokio::sync::RwLock;
use tokio_postgres::Client;
use tokio_postgres::error::SqlState;
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
use crate::compute::{ComputeNode, ComputeState};
use crate::compute::{ComputeNode, ComputeState, construct_superuser_query};
use crate::pg_helpers::{
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async,
get_existing_roles_async,
};
use crate::spec_apply::ApplySpecPhase::{
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser,
CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon,
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
RunInEachDatabase,
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension,
CreatePgauditlogtofileExtension, CreateSchemaNeon, CreateSuperUser, DisablePostgresDBPgAudit,
DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, HandleNeonExtension,
HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
};
use crate::spec_apply::PerDatabasePhase::{
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions,
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension,
};
impl ComputeNode {
@@ -188,7 +187,7 @@ impl ComputeNode {
}
for phase in [
CreateNeonSuperuser,
CreateSuperUser,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -238,6 +237,7 @@ impl ComputeNode {
let mut phases = vec![
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
];
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
@@ -457,6 +457,7 @@ impl Debug for DB {
pub enum PerDatabasePhase {
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
/// This is a shared phase, used for both i) dropping dangling LR subscriptions
/// before dropping the DB, and ii) dropping all subscriptions after creating
/// a fresh branch.
@@ -467,7 +468,7 @@ pub enum PerDatabasePhase {
#[derive(Clone, Debug)]
pub enum ApplySpecPhase {
CreateNeonSuperuser,
CreateSuperUser,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -594,10 +595,14 @@ async fn get_operations<'a>(
apply_spec_phase: &'a ApplySpecPhase,
) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
match apply_spec_phase {
ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation {
query: include_str!("sql/create_neon_superuser.sql").to_string(),
comment: None,
}))),
ApplySpecPhase::CreateSuperUser => {
let query = construct_superuser_query(spec);
Ok(Box::new(once(Operation {
query,
comment: None,
})))
}
ApplySpecPhase::DropInvalidDatabases => {
let mut ctx = ctx.write().await;
let databases = &mut ctx.dbs;
@@ -731,15 +736,14 @@ async fn get_operations<'a>(
// We do not check whether the DB exists or not,
// Postgres will take care of it for us
"delete_db" => {
let (db_name, outer_tag) = op.name.pg_quote_dollar();
// In Postgres we can't drop a database if it is a template.
// So we need to unset the template flag first, but it could
// be a retry, so we could've already dropped the database.
// Check that database exists first to make it idempotent.
let unset_template_query: String = format!(
include_str!("sql/unset_template_for_drop_dbs.sql"),
datname = db_name,
outer_tag = outer_tag,
datname_str = escape_literal(&op.name),
datname = &op.name.pg_quote()
);
// Use FORCE to drop database even if there are active connections.
@@ -846,8 +850,6 @@ async fn get_operations<'a>(
comment: None,
},
Operation {
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
query: format!(
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
db.name.pg_quote()
@@ -907,11 +909,9 @@ async fn get_operations<'a>(
PerDatabasePhase::DropLogicalSubscriptions => {
match &db {
DB::UserDB(db) => {
let (db_name, outer_tag) = db.name.pg_quote_dollar();
let drop_subscription_query: String = format!(
include_str!("sql/drop_subscriptions.sql"),
datname_str = db_name,
outer_tag = outer_tag,
datname_str = escape_literal(&db.name),
);
let operations = vec![Operation {
@@ -950,7 +950,6 @@ async fn get_operations<'a>(
DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
DB::UserDB(db) => db.owner.pg_quote(),
};
let (escaped_role, outer_tag) = op.name.pg_quote_dollar();
Some(vec![
// This will reassign all dependent objects to the db owner
@@ -965,9 +964,7 @@ async fn get_operations<'a>(
Operation {
query: format!(
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
role_name = escaped_role,
outer_tag = outer_tag,
role_name = quoted,
),
comment: None,
},
@@ -992,14 +989,12 @@ async fn get_operations<'a>(
DB::SystemDB => return Ok(Box::new(empty())),
DB::UserDB(db) => db,
};
let (db_owner, outer_tag) = db.owner.pg_quote_dollar();
let operations = vec![
Operation {
query: format!(
include_str!("sql/set_public_schema_owner.sql"),
db_owner = db_owner,
outer_tag = outer_tag,
db_owner = db.owner.pg_quote()
),
comment: None,
},
@@ -1010,6 +1005,98 @@ async fn get_operations<'a>(
]
.into_iter();
Ok(Box::new(operations))
}
// TODO: remove this completely https://github.com/neondatabase/cloud/issues/22663
PerDatabasePhase::HandleAnonExtension => {
// Only install Anon into user databases
let db = match &db {
DB::SystemDB => return Ok(Box::new(empty())),
DB::UserDB(db) => db,
};
// Never install Anon when it's not enabled as feature
if !spec.features.contains(&ComputeFeature::AnonExtension) {
return Ok(Box::new(empty()));
}
// Only install Anon when it's added in preload libraries
let opt_libs = spec.cluster.settings.find("shared_preload_libraries");
let libs = match opt_libs {
Some(libs) => libs,
None => return Ok(Box::new(empty())),
};
if !libs.contains("anon") {
return Ok(Box::new(empty()));
}
let db_owner = db.owner.pg_quote();
let operations = vec![
// Create anon extension if this compute needs it
// Users cannot create it themselves, because superuser is required.
Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS anon CASCADE"),
comment: Some(String::from("creating anon extension")),
},
// Initialize anon extension
// This also requires superuser privileges, so users cannot do it themselves.
Operation {
query: String::from("SELECT anon.init()"),
comment: Some(String::from("initializing anon extension data")),
},
Operation {
query: format!("GRANT ALL ON SCHEMA anon TO {}", db_owner),
comment: Some(String::from(
"granting anon extension schema permissions",
)),
},
Operation {
query: format!(
"GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}",
db_owner
),
comment: Some(String::from(
"granting anon extension schema functions permissions",
)),
},
// We need this, because some functions are defined as SECURITY DEFINER.
// In Postgres SECURITY DEFINER functions are executed with the privileges
// of the owner.
// In anon extension this it is needed to access some GUCs, which are only accessible to
// superuser. But we've patched postgres to allow db_owner to access them as well.
// So we need to change owner of these functions to db_owner.
Operation {
query: format!(
include_str!("sql/anon_ext_fn_reassign.sql"),
db_owner = db_owner,
),
comment: Some(String::from(
"change anon extension functions owner to database_owner",
)),
},
Operation {
query: format!(
"GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}",
db_owner,
),
comment: Some(String::from(
"granting anon extension tables permissions",
)),
},
Operation {
query: format!(
"GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}",
db_owner,
),
comment: Some(String::from(
"granting anon extension sequences permissions",
)),
},
]
.into_iter();
Ok(Box::new(operations))
}
}

View File

@@ -1,8 +0,0 @@
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
END IF;
END
$$;

View File

@@ -1,4 +1,4 @@
DO ${outer_tag}$
DO $$
DECLARE
subname TEXT;
BEGIN
@@ -9,4 +9,4 @@ BEGIN
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
${outer_tag}$;
$$;

View File

@@ -1,7 +1,8 @@
DO ${outer_tag}$
SET SESSION ROLE neon_superuser;
DO $$
DECLARE
schema TEXT;
grantor TEXT;
revoke_query TEXT;
BEGIN
FOR schema IN
@@ -14,25 +15,14 @@ BEGIN
-- ii) it's easy to add more schemas to the list if needed.
WHERE schema_name IN ('public')
LOOP
FOR grantor IN EXECUTE
format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
quote_literal({role_name})
)
LOOP
EXECUTE format('SET LOCAL ROLE %I', grantor);
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
schema
);
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
schema,
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
{role_name},
grantor
);
EXECUTE revoke_query;
END LOOP;
EXECUTE revoke_query;
END LOOP;
END;
${outer_tag}$;
$$;
RESET ROLE;

View File

@@ -1,4 +1,5 @@
DO ${outer_tag}$
DO
$$
DECLARE
schema_owner TEXT;
BEGIN
@@ -15,8 +16,8 @@ DO ${outer_tag}$
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
THEN
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
ALTER SCHEMA public OWNER TO {db_owner};
END IF;
END IF;
END
${outer_tag}$;
$$;

View File

@@ -1,12 +1,12 @@
DO ${outer_tag}$
DO $$
BEGIN
IF EXISTS(
SELECT 1
FROM pg_catalog.pg_database
WHERE datname = {datname}
WHERE datname = {datname_str}
)
THEN
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
ALTER DATABASE {datname} is_template false;
END IF;
END
${outer_tag}$;
$$;

View File

@@ -1,118 +0,0 @@
use std::{io::Write, os::unix::fs::OpenOptionsExt, path::Path, time::Duration};
use anyhow::{Context, Result, bail};
use compute_api::responses::TlsConfig;
use ring::digest;
use spki::ObjectIdentifier;
use spki::der::{Decode, PemReader};
use x509_cert::Certificate;
#[derive(Clone, Copy)]
pub struct CertDigest(digest::Digest);
pub async fn watch_cert_for_changes(cert_path: String) -> tokio::sync::watch::Receiver<CertDigest> {
let mut digest = compute_digest(&cert_path).await;
let (tx, rx) = tokio::sync::watch::channel(digest);
tokio::spawn(async move {
while !tx.is_closed() {
let new_digest = compute_digest(&cert_path).await;
if digest.0.as_ref() != new_digest.0.as_ref() {
digest = new_digest;
_ = tx.send(digest);
}
tokio::time::sleep(Duration::from_secs(60)).await
}
});
rx
}
async fn compute_digest(cert_path: &str) -> CertDigest {
loop {
match try_compute_digest(cert_path).await {
Ok(d) => break d,
Err(e) => {
tracing::error!("could not read cert file {e:?}");
tokio::time::sleep(Duration::from_secs(1)).await
}
}
}
}
async fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
let data = tokio::fs::read(cert_path).await?;
// sha256 is extremely collision resistent. can safely assume the digest to be unique
Ok(CertDigest(digest::digest(&digest::SHA256, &data)))
}
pub const SERVER_CRT: &str = "server.crt";
pub const SERVER_KEY: &str = "server.key";
pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) {
loop {
match try_update_key_path_blocking(pg_data, tls_config) {
Ok(()) => break,
Err(e) => {
tracing::error!("could not create key file {e:?}");
std::thread::sleep(Duration::from_secs(1))
}
}
}
}
// Postgres requires the keypath be "secure". This means
// 1. Owned by the postgres user.
// 2. Have permission 600.
fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Result<()> {
let key = std::fs::read_to_string(&tls_config.key_path)?;
let crt = std::fs::read_to_string(&tls_config.cert_path)?;
// to mitigate a race condition during renewal.
verify_key_cert(&key, &crt)?;
let mut key_file = std::fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.mode(0o600)
.open(pg_data.join(SERVER_KEY))?;
let mut crt_file = std::fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.mode(0o600)
.open(pg_data.join(SERVER_CRT))?;
key_file.write_all(key.as_bytes())?;
crt_file.write_all(crt.as_bytes())?;
Ok(())
}
fn verify_key_cert(key: &str, cert: &str) -> Result<()> {
const ECDSA_WITH_SHA256: ObjectIdentifier = ObjectIdentifier::new_unwrap("1.2.840.10045.4.3.2");
let cert = Certificate::decode(&mut PemReader::new(cert.as_bytes()).context("pem reader")?)
.context("decode cert")?;
match cert.signature_algorithm.oid {
ECDSA_WITH_SHA256 => {
let key = p256::SecretKey::from_sec1_pem(key).context("parse key")?;
let a = key.public_key().to_sec1_bytes();
let b = cert
.tbs_certificate
.subject_public_key_info
.subject_public_key
.raw_bytes();
if *a != *b {
bail!("private key file does not match certificate")
}
}
_ => bail!("unknown TLS key type"),
}
Ok(())
}

View File

@@ -61,24 +61,6 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
}
#[test]
fn ident_pg_quote_dollar() {
let test_cases = vec![
("name", ("$x$name$x$", "xx")),
("name$", ("$x$name$$x$", "xx")),
("name$$", ("$x$name$$$x$", "xx")),
("name$$$", ("$x$name$$$$x$", "xx")),
("name$$$$", ("$x$name$$$$$x$", "xx")),
("name$x$", ("$xx$name$x$$xx$", "xxx")),
];
for (input, expected) in test_cases {
let (escaped, tag) = PgIdent::from(input).pg_quote_dollar();
assert_eq!(escaped, expected.0);
assert_eq!(tag, expected.1);
}
}
#[test]
fn generic_options_search() {
let generic_options: GenericOptions = Some(vec![

View File

@@ -36,9 +36,7 @@ use pageserver_api::config::{
use pageserver_api::controller_api::{
NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest,
};
use pageserver_api::models::{
ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo};
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
@@ -965,7 +963,6 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
id: pageserver_id,
listen_pg_addr: format!("127.0.0.1:{pg_port}"),
listen_http_addr: format!("127.0.0.1:{http_port}"),
listen_https_addr: None,
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
other: Default::default(),
@@ -979,8 +976,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
neon_distrib_dir: None,
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
storage_controller: None,
control_plane_hooks_api: None,
generate_local_ssl_certs: false,
control_plane_compute_hook_api: None,
}
};
@@ -1131,16 +1127,12 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
let tenant_id = get_tenant_id(args.tenant_id, env)?;
let tenant_conf: HashMap<_, _> =
args.config.iter().flat_map(|c| c.split_once(':')).collect();
let config = PageServerNode::parse_config(tenant_conf)?;
let req = TenantConfigRequest { tenant_id, config };
let storage_controller = StorageController::from_env(env);
storage_controller
.set_tenant_config(&req)
pageserver
.tenant_config(tenant_id, tenant_conf)
.await
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
println!("tenant {tenant_id} successfully configured via storcon");
println!("tenant {tenant_id} successfully configured on the pageserver");
}
}
Ok(())

View File

@@ -72,19 +72,15 @@ pub struct LocalEnv {
// be propagated into each pageserver's configuration.
pub control_plane_api: Url,
// Control plane upcall APIs for storage controller. If set, this will be propagated into the
// Control plane upcall API for storage controller. If set, this will be propagated into the
// storage controller's configuration.
pub control_plane_hooks_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
/// Flag to generate SSL certificates for components that need it.
/// Also generates root CA certificate that is used to sign all other certificates.
pub generate_local_ssl_certs: bool,
}
/// On-disk state stored in `.neon/config`.
@@ -104,13 +100,8 @@ pub struct OnDiskConfig {
pub pageservers: Vec<PageServerConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
// Note: skip serializing because in compat tests old storage controller fails
// to load new config file. May be removed after this field is in release branch.
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub generate_local_ssl_certs: bool,
}
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
@@ -137,8 +128,7 @@ pub struct NeonLocalInitConf {
pub pageservers: Vec<NeonLocalInitPageserverConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub generate_local_ssl_certs: bool,
pub control_plane_compute_hook_api: Option<Option<Url>>,
}
/// Broker config for cluster internal communication.
@@ -149,7 +139,7 @@ pub struct NeonBroker {
pub listen_addr: SocketAddr,
}
/// A part of storage controller's config the neon_local knows about.
/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct NeonStorageControllerConf {
@@ -176,11 +166,7 @@ pub struct NeonStorageControllerConf {
#[serde(with = "humantime_serde")]
pub long_reconcile_threshold: Option<Duration>,
pub use_https_pageserver_api: bool,
pub timelines_onto_safekeepers: bool,
pub use_https_safekeeper_api: bool,
pub load_safekeepers: bool,
}
impl NeonStorageControllerConf {
@@ -204,9 +190,7 @@ impl Default for NeonStorageControllerConf {
max_secondary_lag_bytes: None,
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
long_reconcile_threshold: None,
use_https_pageserver_api: false,
timelines_onto_safekeepers: false,
use_https_safekeeper_api: false,
load_safekeepers: true,
}
}
}
@@ -236,7 +220,6 @@ pub struct PageServerConf {
pub id: NodeId,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub no_sync: bool,
@@ -248,7 +231,6 @@ impl Default for PageServerConf {
id: NodeId(0),
listen_pg_addr: String::new(),
listen_http_addr: String::new(),
listen_https_addr: None,
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
no_sync: false,
@@ -264,7 +246,6 @@ pub struct NeonLocalInitPageserverConf {
pub id: NodeId,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
@@ -279,7 +260,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
id,
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -289,7 +269,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
id: *id,
listen_pg_addr: listen_pg_addr.clone(),
listen_http_addr: listen_http_addr.clone(),
listen_https_addr: listen_https_addr.clone(),
pg_auth_type: *pg_auth_type,
http_auth_type: *http_auth_type,
no_sync: *no_sync,
@@ -304,7 +283,6 @@ pub struct SafekeeperConf {
pub pg_port: u16,
pub pg_tenant_only_port: Option<u16>,
pub http_port: u16,
pub https_port: Option<u16>,
pub sync: bool,
pub remote_storage: Option<String>,
pub backup_threads: Option<u32>,
@@ -319,7 +297,6 @@ impl Default for SafekeeperConf {
pg_port: 0,
pg_tenant_only_port: None,
http_port: 0,
https_port: None,
sync: true,
remote_storage: None,
backup_threads: None,
@@ -436,41 +413,6 @@ impl LocalEnv {
}
}
pub fn ssl_ca_cert_path(&self) -> Option<PathBuf> {
if self.generate_local_ssl_certs {
Some(self.base_data_dir.join("rootCA.crt"))
} else {
None
}
}
pub fn ssl_ca_key_path(&self) -> Option<PathBuf> {
if self.generate_local_ssl_certs {
Some(self.base_data_dir.join("rootCA.key"))
} else {
None
}
}
pub fn generate_ssl_ca_cert(&self) -> anyhow::Result<()> {
let cert_path = self.ssl_ca_cert_path().unwrap();
let key_path = self.ssl_ca_key_path().unwrap();
if !fs::exists(cert_path.as_path())? {
generate_ssl_ca_cert(cert_path.as_path(), key_path.as_path())?;
}
Ok(())
}
pub fn generate_ssl_cert(&self, cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
self.generate_ssl_ca_cert()?;
generate_ssl_cert(
cert_path,
key_path,
self.ssl_ca_cert_path().unwrap().as_path(),
self.ssl_ca_key_path().unwrap().as_path(),
)
}
/// Inspect the base data directory and extract the instance id and instance directory path
/// for all storage controller instances
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
@@ -578,10 +520,8 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api,
control_plane_hooks_api,
control_plane_compute_hook_api: _,
control_plane_compute_hook_api,
branch_name_mappings,
generate_local_ssl_certs,
} = on_disk_config;
LocalEnv {
base_data_dir: repopath.to_owned(),
@@ -594,9 +534,8 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api: control_plane_api.unwrap(),
control_plane_hooks_api,
control_plane_compute_hook_api,
branch_name_mappings,
generate_local_ssl_certs,
}
};
@@ -632,7 +571,6 @@ impl LocalEnv {
struct PageserverConfigTomlSubset {
listen_pg_addr: String,
listen_http_addr: String,
listen_https_addr: Option<String>,
pg_auth_type: AuthType,
http_auth_type: AuthType,
#[serde(default)]
@@ -657,7 +595,6 @@ impl LocalEnv {
let PageserverConfigTomlSubset {
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -675,7 +612,6 @@ impl LocalEnv {
},
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -701,10 +637,8 @@ impl LocalEnv {
pageservers: vec![], // it's skip_serializing anyway
safekeepers: self.safekeepers.clone(),
control_plane_api: Some(self.control_plane_api.clone()),
control_plane_hooks_api: self.control_plane_hooks_api.clone(),
control_plane_compute_hook_api: None,
control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
branch_name_mappings: self.branch_name_mappings.clone(),
generate_local_ssl_certs: self.generate_local_ssl_certs,
},
)
}
@@ -786,8 +720,7 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api,
generate_local_ssl_certs,
control_plane_hooks_api,
control_plane_compute_hook_api,
} = conf;
// Find postgres binaries.
@@ -834,24 +767,16 @@ impl LocalEnv {
pageservers: pageservers.iter().map(Into::into).collect(),
safekeepers,
control_plane_api: control_plane_api.unwrap(),
control_plane_hooks_api,
control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
branch_name_mappings: Default::default(),
generate_local_ssl_certs,
};
if generate_local_ssl_certs {
env.generate_ssl_ca_cert()?;
}
// create endpoints dir
fs::create_dir_all(env.endpoints_path())?;
// create safekeeper dirs
for safekeeper in &env.safekeepers {
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
SafekeeperNode::from_env(&env, safekeeper)
.initialize()
.context("safekeeper init failed")?;
}
// initialize pageserver state
@@ -929,80 +854,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
}
Ok(())
}
fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
// openssl req -x509 -newkey rsa:2048 -nodes -subj "/CN=Neon Local CA" -days 36500 \
// -out rootCA.crt -keyout rootCA.key
let keygen_output = Command::new("openssl")
.args([
"req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
])
.args(["-subj", "/CN=Neon Local CA"])
.args(["-out", cert_path.to_str().unwrap()])
.args(["-keyout", key_path.to_str().unwrap()])
.output()
.context("failed to generate CA certificate")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
Ok(())
}
fn generate_ssl_cert(
cert_path: &Path,
key_path: &Path,
ca_cert_path: &Path,
ca_key_path: &Path,
) -> anyhow::Result<()> {
// Generate Certificate Signing Request (CSR).
let mut csr_path = cert_path.to_path_buf();
csr_path.set_extension(".csr");
// openssl req -new -nodes -newkey rsa:2048 -keyout server.key -out server.csr \
// -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
let keygen_output = Command::new("openssl")
.args(["req", "-new", "-nodes"])
.args(["-newkey", "rsa:2048"])
.args(["-subj", "/CN=localhost"])
.args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
.args(["-keyout", key_path.to_str().unwrap()])
.args(["-out", csr_path.to_str().unwrap()])
.output()
.context("failed to generate CSR")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
// Sign CSR with CA key.
//
// openssl x509 -req -in server.csr -CA rootCA.crt -CAkey rootCA.key -CAcreateserial \
// -out server.crt -days 36500 -copy_extensions copyall
let keygen_output = Command::new("openssl")
.args(["x509", "-req"])
.args(["-in", csr_path.to_str().unwrap()])
.args(["-CA", ca_cert_path.to_str().unwrap()])
.args(["-CAkey", ca_key_path.to_str().unwrap()])
.arg("-CAcreateserial")
.args(["-out", cert_path.to_str().unwrap()])
.args(["-days", "36500"])
.args(["-copy_extensions", "copyall"])
.output()
.context("failed to sign CSR")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
// Remove CSR file as it's not needed anymore.
fs::remove_file(csr_path)?;
Ok(())
}

View File

@@ -21,7 +21,6 @@ use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
use postgres_connection::{PgConnectionConfig, parse_host_port};
use reqwest::Certificate;
use utils::auth::{Claims, Scope};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -50,29 +49,12 @@ impl PageServerNode {
let (host, port) =
parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let port = port.unwrap_or(5432);
let ssl_ca_cert = env.ssl_ca_cert_path().map(|ssl_ca_file| {
let buf = std::fs::read(ssl_ca_file).expect("SSL root CA file should exist");
Certificate::from_pem(&buf).expect("CA certificate should be valid")
});
let endpoint = if env.storage_controller.use_https_pageserver_api {
format!(
"https://{}",
conf.listen_https_addr.as_ref().expect(
"listen https address should be specified if use_https_pageserver_api is on"
)
)
} else {
format!("http://{}", conf.listen_http_addr)
};
Self {
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
conf: conf.clone(),
env: env.clone(),
http_client: mgmt_api::Client::new(
endpoint,
format!("http://{}", conf.listen_http_addr),
{
match conf.http_auth_type {
AuthType::Trust => None,
@@ -83,9 +65,7 @@ impl PageServerNode {
}
}
.as_deref(),
ssl_ca_cert,
)
.expect("Client constructs with no errors"),
),
}
}
@@ -240,13 +220,6 @@ impl PageServerNode {
.context("write identity toml")?;
drop(identity_toml);
if self.env.generate_local_ssl_certs {
self.env.generate_ssl_cert(
datadir.join("server.crt").as_path(),
datadir.join("server.key").as_path(),
)?;
}
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
// Write metadata file, used by pageserver on startup to register itself with
@@ -257,15 +230,6 @@ impl PageServerNode {
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
let http_port = http_port.unwrap_or(9898);
let https_port = match self.conf.listen_https_addr.as_ref() {
Some(https_addr) => {
let (_https_host, https_port) =
parse_host_port(https_addr).expect("Unable to parse listen_https_addr");
Some(https_port.unwrap_or(9899))
}
None => None,
};
// Intentionally hand-craft JSON: this acts as an implicit format compat test
// in case the pageserver-side structure is edited, and reflects the real life
// situation: the metadata is written by some other script.
@@ -276,7 +240,6 @@ impl PageServerNode {
postgres_port: self.pg_connection_config.port(),
http_host: "localhost".to_string(),
http_port,
https_port,
other: HashMap::from([(
"availability_zone_id".to_string(),
serde_json::json!(az_id),

View File

@@ -111,18 +111,6 @@ impl SafekeeperNode {
.expect("non-Unicode path")
}
/// Initializes a safekeeper node by creating all necessary files,
/// e.g. SSL certificates.
pub fn initialize(&self) -> anyhow::Result<()> {
if self.env.generate_local_ssl_certs {
self.env.generate_ssl_cert(
&self.datadir_path().join("server.crt"),
&self.datadir_path().join("server.key"),
)?;
}
Ok(())
}
pub async fn start(
&self,
extra_opts: &[String],
@@ -208,16 +196,6 @@ impl SafekeeperNode {
]);
}
if let Some(https_port) = self.conf.https_port {
args.extend([
"--listen-https".to_owned(),
format!("{}:{}", self.listen_addr, https_port),
]);
}
if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() {
args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
}
args.extend_from_slice(extra_opts);
background_process::start_process(

View File

@@ -12,10 +12,13 @@ use hyper0::Uri;
use nix::unistd::Pid;
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
TenantCreateResponse, TenantLocateResponse,
TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest,
TenantShardMigrateResponse,
};
use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
use pageserver_api::shard::TenantShardId;
use pageserver_api::models::{
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use reqwest::Method;
@@ -534,16 +537,8 @@ impl StorageController {
args.push("--start-as-candidate".to_string());
}
if self.config.use_https_pageserver_api {
args.push("--use-https-pageserver-api".to_string());
}
if self.config.use_https_safekeeper_api {
args.push("--use-https-safekeeper-api".to_string());
}
if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() {
args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
if self.config.load_safekeepers {
args.push("--load-safekeepers".to_string());
}
if let Some(private_key) = &self.private_key {
@@ -562,8 +557,10 @@ impl StorageController {
args.push(format!("--public-key=\"{public_key}\""));
}
if let Some(control_plane_hooks_api) = &self.env.control_plane_hooks_api {
args.push(format!("--control-plane-url={control_plane_hooks_api}"));
if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api {
args.push(format!(
"--compute-hook-url={control_plane_compute_hook_api}"
));
}
if let Some(split_threshold) = self.config.split_threshold.as_ref() {
@@ -586,10 +583,6 @@ impl StorageController {
self.env.base_data_dir.display()
));
if self.config.timelines_onto_safekeepers {
args.push("--timelines-onto-safekeepers".to_string());
}
background_process::start_process(
COMMAND,
&instance_dir,
@@ -836,6 +829,41 @@ impl StorageController {
.await
}
#[instrument(skip(self))]
pub async fn tenant_migrate(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
) -> anyhow::Result<TenantShardMigrateResponse> {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest {
node_id,
migration_config: None,
}),
)
.await
}
#[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
pub async fn tenant_split(
&self,
tenant_id: TenantId,
new_shard_count: u8,
new_stripe_size: Option<ShardStripeSize>,
) -> anyhow::Result<TenantShardSplitResponse> {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/shard_split"),
Some(TenantShardSplitRequest {
new_shard_count,
new_stripe_size,
}),
)
.await
}
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
@@ -880,9 +908,4 @@ impl StorageController {
)
.await
}
pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> anyhow::Result<()> {
self.dispatch(Method::PUT, "v1/tenant/config".to_string(), Some(req))
.await
}
}

View File

@@ -1,21 +1,20 @@
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
use clap::{Parser, Subcommand};
use futures::StreamExt;
use pageserver_api::controller_api::{
AvailabilityZone, MigrationConfig, NodeAvailabilityWrapper, NodeConfigureRequest,
NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse,
PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
TenantShardMigrateRequest, TenantShardMigrateResponse,
AvailabilityZone, NodeAvailabilityWrapper, NodeConfigureRequest, NodeDescribeResponse,
NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse, PlacementPolicy,
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest,
TenantDescribeResponse, TenantPolicyRequest, TenantShardMigrateRequest,
TenantShardMigrateResponse,
};
use pageserver_api::models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest,
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, ShardParameters,
TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest,
TenantShardSplitResponse,
};
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
@@ -113,15 +112,6 @@ enum Command {
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
#[arg(long, default_value_t = true, action = clap::ArgAction::Set)]
prewarm: bool,
#[arg(long, default_value_t = false, action = clap::ArgAction::Set)]
override_scheduler: bool,
},
/// Watch the location of a tenant shard evolve, e.g. while expecting it to migrate
TenantShardWatch {
#[arg(long)]
tenant_shard_id: TenantShardId,
},
/// Migrate the secondary location for a tenant shard to a specific pageserver.
TenantShardMigrateSecondary {
@@ -158,6 +148,12 @@ enum Command {
#[arg(long)]
tenant_id: TenantId,
},
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
/// mode so that it can warm up content on a pageserver.
TenantWarmup {
#[arg(long)]
tenant_id: TenantId,
},
TenantSetPreferredAz {
#[arg(long)]
tenant_id: TenantId,
@@ -273,10 +269,6 @@ struct Cli {
/// a token with both scopes to use with this tool.
jwt: Option<String>,
#[arg(long)]
/// Trusted root CA certificate to use in https APIs.
ssl_ca_file: Option<PathBuf>,
#[command(subcommand)]
command: Command,
}
@@ -387,17 +379,9 @@ async fn main() -> anyhow::Result<()> {
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
let ssl_ca_cert = match &cli.ssl_ca_file {
Some(ssl_ca_file) => {
let buf = tokio::fs::read(ssl_ca_file).await?;
Some(reqwest::Certificate::from_pem(&buf)?)
}
None => None,
};
let mut trimmed = cli.api.to_string();
trimmed.pop();
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref(), ssl_ca_cert)?;
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
match cli.command {
Command::NodeRegister {
@@ -635,43 +619,19 @@ async fn main() -> anyhow::Result<()> {
Command::TenantShardMigrate {
tenant_shard_id,
node,
prewarm,
override_scheduler,
} => {
let migration_config = MigrationConfig {
prewarm,
override_scheduler,
..Default::default()
};
let req = TenantShardMigrateRequest {
node_id: node,
origin_node_id: None,
migration_config,
migration_config: None,
};
match storcon_client
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(req),
)
.await
{
Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) => {
anyhow::bail!(
"Migration to {node} rejected, may require `--force` ({}) ",
msg
);
}
Err(e) => return Err(e.into()),
Ok(_) => {}
}
watch_tenant_shard(storcon_client, tenant_shard_id, Some(node)).await?;
}
Command::TenantShardWatch { tenant_shard_id } => {
watch_tenant_shard(storcon_client, tenant_shard_id, None).await?;
.await?;
}
Command::TenantShardMigrateSecondary {
tenant_shard_id,
@@ -679,8 +639,7 @@ async fn main() -> anyhow::Result<()> {
} => {
let req = TenantShardMigrateRequest {
node_id: node,
origin_node_id: None,
migration_config: MigrationConfig::default(),
migration_config: None,
};
storcon_client
@@ -865,6 +824,94 @@ async fn main() -> anyhow::Result<()> {
)
.await?;
}
Command::TenantWarmup { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await;
match describe_response {
Ok(describe) => {
if matches!(describe.policy, PlacementPolicy::Secondary) {
// Fine: it's already known to controller in secondary mode: calling
// again to put it into secondary mode won't cause problems.
} else {
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
}
}
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
// Fine: this tenant isn't know to the storage controller yet.
}
Err(e) => {
// Unexpected API error
return Err(e.into());
}
}
vps_client
.location_config(
TenantShardId::unsharded(tenant_id),
pageserver_api::models::LocationConfig {
mode: pageserver_api::models::LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: 0,
shard_count: 0,
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
tenant_conf: TenantConfig::default(),
},
None,
true,
)
.await?;
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let secondary_ps_id = describe_response
.shards
.first()
.unwrap()
.node_secondary
.first()
.unwrap();
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
loop {
let (status, progress) = vps_client
.tenant_secondary_download(
TenantShardId::unsharded(tenant_id),
Some(Duration::from_secs(10)),
)
.await?;
println!(
"Progress: {}/{} layers, {}/{} bytes",
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
match status {
StatusCode::OK => {
println!("Download complete");
break;
}
StatusCode::ACCEPTED => {
// Loop
}
_ => {
anyhow::bail!("Unexpected download status: {status}");
}
}
}
}
Command::TenantDrop { tenant_id, unclean } => {
if !unclean {
anyhow::bail!(
@@ -1058,8 +1105,7 @@ async fn main() -> anyhow::Result<()> {
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest {
node_id: mv.to,
origin_node_id: Some(mv.from),
migration_config: MigrationConfig::default(),
migration_config: None,
}),
)
.await
@@ -1238,68 +1284,3 @@ async fn main() -> anyhow::Result<()> {
Ok(())
}
static WATCH_INTERVAL: Duration = Duration::from_secs(5);
async fn watch_tenant_shard(
storcon_client: Client,
tenant_shard_id: TenantShardId,
until_migrated_to: Option<NodeId>,
) -> anyhow::Result<()> {
if let Some(until_migrated_to) = until_migrated_to {
println!(
"Waiting for tenant shard {} to be migrated to node {}",
tenant_shard_id, until_migrated_to
);
}
loop {
let desc = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{}", tenant_shard_id.tenant_id),
None,
)
.await?;
// Output the current state of the tenant shard
let shard = desc
.shards
.iter()
.find(|s| s.tenant_shard_id == tenant_shard_id)
.ok_or(anyhow::anyhow!("Tenant shard not found"))?;
let summary = format!(
"attached: {} secondary: {} {}",
shard
.node_attached
.map(|n| format!("{}", n))
.unwrap_or("none".to_string()),
shard
.node_secondary
.iter()
.map(|n| n.to_string())
.collect::<Vec<_>>()
.join(","),
if shard.is_reconciling {
"(reconciler active)"
} else {
"(reconciler idle)"
}
);
println!("{}", summary);
// Maybe drop out if we finished migration
if let Some(until_migrated_to) = until_migrated_to {
if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling {
println!(
"Tenant shard {} is now on node {}",
tenant_shard_id, until_migrated_to
);
break;
}
}
tokio::time::sleep(WATCH_INTERVAL).await;
}
Ok(())
}

View File

@@ -27,10 +27,6 @@ yanked = "warn"
id = "RUSTSEC-2023-0071"
reason = "the marvin attack only affects private key decryption, not public key signature verification"
[[advisories.ignore]]
id = "RUSTSEC-2024-0436"
reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact."
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html

View File

@@ -1,201 +0,0 @@
# Sparse Keyspace for Relation Directories
## Summary
This is an RFC describing a new storage strategy for storing relation directories.
## Motivation
Postgres maintains a directory structure for databases and relations. In Neon, we store these information
by serializing the directory data in a single key (see `pgdatadir_mapping.rs`).
```rust
// DbDir:
// 00 00000000 00000000 00000000 00 00000000
// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
```
We have a dedicated structure on the ingestion path to serialize the relation directory into this single key.
```rust
#[derive(Debug, Serialize, Deserialize, Default)]
pub(crate) struct RelDirectory {
// Set of relations that exist. (relfilenode, forknum)
//
// TODO: Store it as a btree or radix tree or something else that spans multiple
// key-value pairs, if you have a lot of relations
pub(crate) rels: HashSet<(Oid, u8)>,
}
```
The current codebase has the following three access patterns for the relation directory.
1. Check if a relation exists.
2. List all relations.
3. Create/drop a relation.
For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the
hash set. For (2), we get the reldir key and the hash set. For (3), we need first to get
and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back.
If we have 100k relations in a database, we would have a 100k-large hash set. Then, every
relation created and dropped would have deserialized and serialized this 100k-large hash set. This makes the
relation create/drop process to be quadratic. When we check if a relation exists in the ingestion path,
we would have to deserialize this super big 100k-large key before checking if a single relation exists.
In this RFC, we will propose a new way to store the reldir data in the sparse keyspace and propose how
to seamlessly migrate users to use the new keyspace.
The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316).
## Key Mapping
We will use the recently introduced sparse keyspace to store actual data. Sparse keyspace was proposed in
[038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>`
for each of the databases (identified as `spcnode, dbnode`). We encode the `Oid` (`relnode, forknum`),
into the key.
```plain
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists
```
Assume all reldir data are stored in this new keyspace; the 3 reldir operations we mentioned before can be
implemented as follows.
1. Check if a relation exists: check if the key maps to "exists".
2. List all relations: scan the sprase keyspace over the `rel_dir_key_prefix`. Extract relnode and forknum from the key.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will
be removed during image layer generation upon compaction.
Note that "exists" and "deleted" will be encoded as a single byte as two variants of an enum.
The mapping is implemented as `rel_tag_sparse_key` in the PoC patch.
## Changes to Sparse Keyspace
Previously, we only used sparse keyspaces for the aux files, which did not carry over when branching. The reldir
information needs to be preserved from the parent branch to the child branch. Therefore, the read path needs
to be updated accordingly to accommodate such "inherited sparse keys". This is done in
[PR#10313](https://github.com/neondatabase/neon/pull/10313).
## Coexistence of the Old and New Keyspaces
Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the
ingestion path will start to write to the new keyspace and the old reldir data will be kept in the old one. The read
path needs to combine the data from both keyspaces.
Theoretically, we could do a rewrite at the startup time that scans all relation directories and copies that data into the
new keyspace. However, this could take a long time, especially if we have thousands of tenants doing the migration
process simultaneously after the pageserver restarts. Therefore, we propose the coexistence strategy so that the
migration can happen seamlessly and imposes no potential downtime for the user.
With the coexistence assumption, the 3 reldir operations will be implemented as follows:
1. Check if a relation exists
- Check the new keyspace if the key maps to any value. If it maps to "exists" or "deleted", directly
return it to the user.
- Otherwise, deserialize the old reldir key and get the result.
2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key.
Combine them to obtain the final result.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation into the new keyspace.
- We assume no overwrite of relations will happen (i.e., the user won't create a relation at the same Oid). This will be implemented as a runtime check.
- For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace.
- For relation drop, we first check if the relation is recorded in the old keyspace. If yes, we deserialize the old reldir key,
remove the relation, and then write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` to the keyspace.
- The delete tombstone will be removed during image layer generation upon compaction.
This process ensures that the transition will not introduce any downtime and all new updates are written to the new keyspace. The total
amount of data in the storage would be `O(relations_modifications)` and we can guarantee `O(current_relations)` after compaction.
There could be some relations that exist in the old reldir key for a long time. Refer to the "Full Migration" section on how to deal
with them. Plus, for relation modifications, it will have `O(old_relations)` complexity until we do the full migration, which gives
us `O(1)` complexity after fully opt-in the sparse keyspace.
The process also implies that a relation will only exists either in the old reldir key or in the new sparse keyspace. It is not possible
to have a table to be recorded in the old reldir key while later having a delete tombstone for it in the sparse keyspace at any LSN.
We will introduce a config item and an index_part record to record the current status of the migration process.
- Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace.
- `index_part.json` field `reldir_v2_status`: whether the timeline has written any key into the new reldir keyspace.
If `enable_reldir_v2` is set to `true` and the timeline ingests the first key into the new reldir keyspace, it will update
`index_part.json` to set `reldir_v2_status` to `Status::Migrating`. Even if `enable_reldir_v2` gets flipped back to
`false` (i.e., when the pageserver restarts and such config isn't persisted), the read/write path will still
read/write to the new keyspace to avoid data inconsistency. This also indicates that the migration is one-way only:
once v2 is enabled, the user cannot go back to v1.
## Next Steps
### Full Migration
This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and
v2 existing in the system would force us to keep the code to deserialize the old reldir key forever. To entirely deprecate this
code path, we must ensure the timeline has no old reldir data.
We can trigger a special image layer generation process at the gc-horizon. The generated image layers will cover several keyspaces:
the old reldir key in each of the databases, and the new reldir sparse keyspace. It will remove the old reldir key while
copying them into the corresponding keys in the sparse keyspace in the resulting image. This special process happens in
the background during compaction. For example, assume this special process is triggered at LSN 0/180. The `create_image_layers`
process discovers the following keys at this LSN.
```plain
db1/reldir_key -> (table 1, table 2, table 3)
...db1 rel keys
db2/reldir_key -> (table 4, table 5, table 6)
...db2 rel keys
sparse_reldir_db2_table7 -> exists
sparse_reldir_db1_table8 -> deleted
```
It will generate the following keys:
```plain
db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`.
...db1 rel keys
db2/reldir_key -> ()
...db2 rel keys
-- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180
sparse_reldir_db1_table1 -> exists
sparse_reldir_db1_table2 -> exists
sparse_reldir_db1_table3 -> exists
sparse_reldir_db2_table4 -> exists
sparse_reldir_db2_table5 -> exists
sparse_reldir_db2_table6 -> exists
sparse_reldir_db2_table7 -> exists
-- end image layer for the sparse keyspace at sparse_reldir_prefix+1
# The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace.
# Note that the read path will stop reading if a key is not found in the image layer covering the key range so there
# are no correctness issue.
```
We must verify that no pending modifications to the old reldir exists in the delta/image layers above the gc-horizon before
we start this process (We can do a vectored read to get the full key history of the old reldir key and ensure there are no more images
above the gc-horizon). Otherwise, it will violate the property that "a relation will only exists either in the old reldir key or
in the new sparse keyspace". After we run this migration process, we can mark `reldir_v2_status` in the `index_part.json` to
`Status::Migrated`, and the read path won't need to read from the old reldir anymore. Once the status is set to `Migrated`, we
don't need to add the key into `collect_keyspace` and therefore all of them will be removed from all future image layers.
The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code.
### Consolidate Relation Size Keys
We have relsize at the end of all relation nodes.
```plain
// RelSize:
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
```
This means that computing logical size requires us to do several single-key gets across the keyspace,
potentially requiring downloading many layer files. We could consolidate them into a single
keyspace, improving logical size calculation performance.
### Migrate DBDir Keys
We assume the number of databases created by the users will be small, and therefore, the current way
of storing the database directory would be acceptable. In the future, we could also migrate DBDir keys into
the sparse keyspace to support large amount of databases.

View File

@@ -1,7 +1,3 @@
# Neon RFCs
## Overview
This directory contains Request for Comments documents, or RFCs, for
features or concepts that have been proposed. Alternative names:
technical design doc, ERD, one-pager
@@ -63,10 +59,37 @@ RFC lifecycle:
### RFC template
Use template with `YYYY-MM-DD-copy-me.md` as a starting point. Timestamp prefix helps to avoid awkward 'id' collisions.
```sh
cp docs/rfcs/YYYY-MM-DD-copy-me.md docs/rfcs/$(date +"%Y-%m-%d")-<name>.md
```
Note, a lot of the sections are marked as if relevant. They are included into the template as a reminder and to help inspiration.
```
# Name
Created on ..
Implemented on ..
## Summary
## Motivation
## Non Goals (if relevant)
## Impacted components (e.g. pageserver, safekeeper, console, etc)
## Proposed implementation
### Reliability, failure modes and corner cases (if relevant)
### Interaction/Sequence diagram (if relevant)
### Scalability (if relevant)
### Security implications (if relevant)
### Unresolved questions (if relevant)
## Alternative implementation (if relevant)
## Pros/cons of proposed approaches (if relevant)
## Definition of Done (if relevant)
```

View File

@@ -1,30 +0,0 @@
# Name
Created on YYYY-MM-DD
Implemented on _TBD_
## Summary
## Motivation
## Non Goals (if relevant)
## Impacted components (e.g. pageserver, safekeeper, console, etc)
## Proposed implementation
### Reliability, failure modes and corner cases (if relevant)
### Interaction/Sequence diagram (if relevant)
### Scalability (if relevant)
### Security implications (if relevant)
### Unresolved questions (if relevant)
## Alternative implementation (if relevant)
## Pros/cons of proposed approaches (if relevant)
## Definition of Done (if relevant)

View File

@@ -101,25 +101,15 @@ changes such as a pageserver node becoming unavailable, or the tenant's shard co
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.
The hook is configured using the storage controller's `--control-plane-url` CLI option, from which the hook URL is computed.
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
Currently, there is two hooks, each computed by appending the name to the provided control plane URL prefix:
- `notify-attach`, called whenever attachment for pageservers changes
- `notify-safekeepers`, called whenever attachment for safekeepers changes
If the hooks require JWT auth, the token may be provided with `--control-plane-jwt-token`.
The hooks will be invoked with a `PUT` request.
In the Neon cloud service, these hooks are implemented by Neon's internal cloud control plane. In `neon_local` systems,
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hooks. This is not complicated.
### `notify-attach` body
The `notify-attach` request body follows the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
```
struct ComputeHookNotifyRequestShard {
@@ -138,15 +128,15 @@ When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.pageserver_connstring` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
shards identified by `NodeId` must be converted to the address+port of the node.
- if stripe_size is not None, set `neon.shard_stripe_size` to this value
- if stripe_size is not None, set `neon.stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds..
Example body:
### Example notification body
```
{
@@ -158,34 +148,3 @@ Example body:
],
}
```
### `notify-safekeepers` body
The `notify-safekeepers` request body forllows the format of the `SafekeepersNotifyRequest` structure, provided below for convenience.
```
pub struct SafekeeperInfo {
pub id: NodeId,
pub hostname: String,
}
pub struct SafekeepersNotifyRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub generation: u32,
pub safekeepers: Vec<SafekeeperInfo>,
}
```
When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.safekeeper_connstrings` to an array of postgres connection strings to safekeepers according to the `safekeepers` list. The
safekeepers identified by `NodeId` must be converted to the address+port of the respective safekeeper.
The hostname is provided for debugging purposes, so we reserve changes to how we pass it.
- set `neon.safekeepers_generation` to the provided `generation` value.
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds..

View File

@@ -7,7 +7,6 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
chrono.workspace = true
indexmap.workspace = true
jsonwebtoken.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -30,9 +30,3 @@ pub struct SetRoleGrantsRequest {
pub privileges: Vec<Privilege>,
pub role: PgIdent,
}
/// Request of the /configure_telemetry API
#[derive(Debug, Deserialize, Serialize)]
pub struct ConfigureTelemetryRequest {
pub logs_export_host: Option<String>,
}

View File

@@ -139,7 +139,6 @@ pub struct ComputeCtlConfig {
/// Set of JSON web keys that the compute can use to authenticate
/// communication from the control plane.
pub jwks: JwkSet,
pub tls: Option<TlsConfig>,
}
impl Default for ComputeCtlConfig {
@@ -148,17 +147,10 @@ impl Default for ComputeCtlConfig {
jwks: JwkSet {
keys: Vec::default(),
},
tls: None,
}
}
}
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct TlsConfig {
pub key_path: String,
pub cert_path: String,
}
/// Response of the `/computes/{compute_id}/spec` control-plane API.
#[derive(Deserialize, Debug)]
pub struct ControlPlaneSpecResponse {

View File

@@ -5,15 +5,12 @@
//! and connect it to the storage nodes.
use std::collections::HashMap;
use indexmap::IndexMap;
use regex::Regex;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use crate::responses::TlsConfig;
/// String type alias representing Postgres identifier and
/// intended to be used for DB / role names.
pub type PgIdent = String;
@@ -128,7 +125,7 @@ pub struct ComputeSpec {
// information about available remote extensions
pub remote_extensions: Option<RemoteExtSpec>,
pub pgbouncer_settings: Option<IndexMap<String, String>>,
pub pgbouncer_settings: Option<HashMap<String, String>>,
// Stripe size for pageserver sharding, in pages
#[serde(default)]
@@ -179,8 +176,8 @@ pub enum ComputeFeature {
/// track short-lived connections as user activity.
ActivityMonitorExperimental,
/// Allow to configure rsyslog for Postgres logs export
PostgresLogsExport,
/// Pre-install and initialize anon extension for every database in the cluster
AnonExtension,
/// This is a special feature flag that is used to represent unknown feature flags.
/// Basically all unknown to enum flags are represented as this one. See unit test
@@ -360,9 +357,6 @@ pub struct LocalProxySpec {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub jwks: Option<Vec<JwksSettings>>,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub tls: Option<TlsConfig>,
}
#[derive(Clone, Debug, Deserialize, Serialize)]

View File

@@ -208,6 +208,7 @@
],
"remote_extensions": {
"library_index": {
"anon": "anon",
"postgis-3": "postgis",
"libpgrouting-3.4": "postgis",
"postgis_raster-3": "postgis",
@@ -216,6 +217,12 @@
"address_standardizer-3": "postgis"
},
"extension_data": {
"anon": {
"archive_path": "5834329303/v15/extensions/anon.tar.zst",
"control_data": {
"anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n"
}
},
"postgis": {
"archive_path": "5834329303/v15/extensions/postgis.tar.zst",
"control_data": {
@@ -231,6 +238,7 @@
}
},
"custom_extensions": [
"anon"
],
"public_extensions": [
"postgis"

View File

@@ -7,9 +7,7 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
bytes.workspace = true
camino.workspace = true
fail.workspace = true
futures.workspace = true
hyper0.workspace = true
itertools.workspace = true
jemalloc_pprof.workspace = true
@@ -17,14 +15,12 @@ once_cell.workspace = true
pprof.workspace = true
regex.workspace = true
routerify.workspace = true
rustls-pemfile.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_path_to_error.workspace = true
thiserror.workspace = true
tracing.workspace = true
tokio.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
url.workspace = true
uuid.workspace = true

View File

@@ -399,10 +399,12 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
// Return the report in the requested format.
match format {
Format::Pprof => {
let body = report
let mut body = Vec::new();
report
.pprof()
.map_err(|err| ApiError::InternalServerError(err.into()))?
.encode_to_vec();
.write_to_vec(&mut body)
.map_err(|err| ApiError::InternalServerError(err.into()))?;
Response::builder()
.status(200)

View File

@@ -3,11 +3,9 @@ pub mod error;
pub mod failpoints;
pub mod json;
pub mod request;
pub mod server;
pub mod tls_certs;
extern crate hyper0 as hyper;
/// Current fast way to apply simple http routing in various Neon binaries.
/// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
pub use routerify::{RequestServiceBuilder, RouterBuilder, RouterService, ext::RequestExt};
pub use routerify::{RouterBuilder, RouterService, ext::RequestExt};

View File

@@ -1,155 +0,0 @@
use std::{error::Error, sync::Arc};
use futures::StreamExt;
use futures::stream::FuturesUnordered;
use hyper0::Body;
use hyper0::server::conn::Http;
use routerify::{RequestService, RequestServiceBuilder};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
use tokio_util::sync::CancellationToken;
use tracing::{error, info};
use crate::error::ApiError;
/// A simple HTTP server over hyper library.
/// You may want to use it instead of [`hyper0::server::Server`] because:
/// 1. hyper0's Server was removed from hyper v1.
/// It's recommended to replace hyepr0's Server with a manual loop, which is done here.
/// 2. hyper0's Server doesn't support TLS out of the box, and there is no way
/// to support it efficiently with the Accept trait that hyper0's Server uses.
/// That's one of the reasons why it was removed from v1.
/// <https://github.com/hyperium/hyper/blob/115339d3df50f20c8717680aa35f48858e9a6205/docs/ROADMAP.md#higher-level-client-and-server-problems>
pub struct Server {
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
listener: tokio::net::TcpListener,
tls_acceptor: Option<TlsAcceptor>,
}
impl Server {
pub fn new(
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
listener: std::net::TcpListener,
tls_acceptor: Option<TlsAcceptor>,
) -> anyhow::Result<Self> {
// Note: caller of from_std is responsible for setting nonblocking mode.
listener.set_nonblocking(true)?;
let listener = tokio::net::TcpListener::from_std(listener)?;
Ok(Self {
request_service,
listener,
tls_acceptor,
})
}
pub async fn serve(self, cancel: CancellationToken) -> anyhow::Result<()> {
fn suppress_io_error(err: &std::io::Error) -> bool {
use std::io::ErrorKind::*;
matches!(err.kind(), ConnectionReset | ConnectionAborted | BrokenPipe)
}
fn suppress_hyper_error(err: &hyper0::Error) -> bool {
if err.is_incomplete_message() || err.is_closed() || err.is_timeout() {
return true;
}
if let Some(inner) = err.source() {
if let Some(io) = inner.downcast_ref::<std::io::Error>() {
return suppress_io_error(io);
}
}
false
}
let mut connections = FuturesUnordered::new();
loop {
tokio::select! {
stream = self.listener.accept() => {
let (tcp_stream, remote_addr) = match stream {
Ok(stream) => stream,
Err(err) => {
if !suppress_io_error(&err) {
info!("Failed to accept TCP connection: {err:#}");
}
continue;
}
};
let service = self.request_service.build(remote_addr);
let tls_acceptor = self.tls_acceptor.clone();
let cancel = cancel.clone();
connections.push(tokio::spawn(
async move {
match tls_acceptor {
Some(tls_acceptor) => {
// Handle HTTPS connection.
let tls_stream = tokio::select! {
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
_ = cancel.cancelled() => return,
};
let tls_stream = match tls_stream {
Ok(tls_stream) => tls_stream,
Err(err) => {
if !suppress_io_error(&err) {
info!("Failed to accept TLS connection: {err:#}");
}
return;
}
};
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
if !suppress_hyper_error(&err) {
info!("Failed to serve HTTPS connection: {err:#}");
}
}
}
None => {
// Handle HTTP connection.
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
if !suppress_hyper_error(&err) {
info!("Failed to serve HTTP connection: {err:#}");
}
}
}
};
}));
}
Some(conn) = connections.next() => {
if let Err(err) = conn {
error!("Connection panicked: {err:#}");
}
}
_ = cancel.cancelled() => {
// Wait for graceful shutdown of all connections.
while let Some(conn) = connections.next().await {
if let Err(err) = conn {
error!("Connection panicked: {err:#}");
}
}
break;
}
}
}
Ok(())
}
/// Serves HTTP connection with graceful shutdown.
async fn serve_connection<I>(
io: I,
service: RequestService<Body, ApiError>,
cancel: CancellationToken,
) -> Result<(), hyper0::Error>
where
I: AsyncRead + AsyncWrite + Unpin + Send + 'static,
{
let mut conn = Http::new().serve_connection(io, service).with_upgrades();
tokio::select! {
res = &mut conn => res,
_ = cancel.cancelled() => {
Pin::new(&mut conn).graceful_shutdown();
// Note: connection should still be awaited for graceful shutdown to complete.
conn.await
}
}
}
}

View File

@@ -1,21 +0,0 @@
use camino::Utf8Path;
use tokio_rustls::rustls::pki_types::{CertificateDer, PrivateKeyDer};
pub fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
let file = std::fs::File::open(filename)?;
let mut reader = std::io::BufReader::new(file);
Ok(rustls_pemfile::certs(&mut reader).collect::<Result<Vec<_>, _>>()?)
}
pub fn load_private_key(filename: &Utf8Path) -> anyhow::Result<PrivateKeyDer<'static>> {
let file = std::fs::File::open(filename)?;
let mut reader = std::io::BufReader::new(file);
let key = rustls_pemfile::private_key(&mut reader)?;
key.ok_or(anyhow::anyhow!(
"no private key found in {}",
filename.as_str(),
))
}

View File

@@ -35,7 +35,6 @@ pub struct NodeMetadata {
pub postgres_port: u16,
pub http_host: String,
pub http_port: u16,
pub https_port: Option<u16>,
// Deployment tools may write fields to the metadata file beyond what we
// use in this type: this type intentionally only names fields that require.
@@ -58,9 +57,6 @@ pub struct ConfigToml {
// types mapped 1:1 into the runtime PageServerConfig type
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub ssl_key_file: Utf8PathBuf,
pub ssl_cert_file: Utf8PathBuf,
pub availability_zone: Option<String>,
#[serde(with = "humantime_serde")]
pub wait_lsn_timeout: Duration,
@@ -127,10 +123,6 @@ pub struct ConfigToml {
pub enable_read_path_debugging: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub validate_wal_contiguity: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub load_previous_heatmap: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub generate_unarchival_heatmap: Option<bool>,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -272,16 +264,15 @@ pub struct TenantConfigToml {
/// size exceeds `compaction_upper_limit * checkpoint_distance`.
pub compaction_upper_limit: usize,
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
/// If true, compact down L0 across all tenant timelines before doing regular compaction. L0
/// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true.
/// If true, compact down L0 across all tenant timelines before doing regular compaction.
pub compaction_l0_first: bool,
/// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only
/// has an effect if `compaction_l0_first` is true. Defaults to true.
/// has an effect if `compaction_l0_first` is `true`.
pub compaction_l0_semaphore: bool,
/// Level0 delta layer threshold at which to delay layer flushes such that they take 2x as long,
/// and block on layer flushes during ephemeral layer rolls, for compaction backpressure. This
/// helps compaction keep up with WAL ingestion, and avoids read amplification blowing up.
/// Should be >compaction_threshold. 0 to disable. Defaults to 3x compaction_threshold.
/// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure,
/// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer
/// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification
/// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default.
pub l0_flush_delay_threshold: Option<usize>,
/// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold
/// to avoid deadlock. 0 to disable. Disabled by default.
@@ -289,8 +280,6 @@ pub struct TenantConfigToml {
/// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next
/// layer. This is a temporary backpressure mechanism which should be removed once
/// l0_flush_{delay,stall}_threshold is fully enabled.
///
/// TODO: this is no longer enabled, remove it when the config option is no longer set.
pub l0_flush_wait_upload: bool,
// Determines how much history is retained, to allow
// branching and read replicas at an older point in time.
@@ -428,9 +417,6 @@ pub mod defaults {
pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
utils::postgres_client::PostgresClientProtocol::Vanilla;
pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
}
impl Default for ConfigToml {
@@ -440,9 +426,6 @@ impl Default for ConfigToml {
Self {
listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
listen_https_addr: (None),
ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE),
ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE),
availability_zone: (None),
wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
.expect("cannot parse default wait lsn timeout")),
@@ -540,8 +523,6 @@ impl Default for ConfigToml {
None
},
validate_wal_contiguity: None,
load_previous_heatmap: None,
generate_unarchival_heatmap: None,
}
}
}
@@ -570,15 +551,13 @@ pub mod tenant_conf_defaults {
// be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So
// with this config, we can get a maximum peak compaction usage of 9 GB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20;
// Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid
// read amp.
pub const DEFAULT_COMPACTION_L0_FIRST: bool = true;
pub const DEFAULT_COMPACTION_L0_FIRST: bool = false;
pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true;
pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
crate::models::CompactionAlgorithm::Legacy;
pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false;
pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true;
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
@@ -589,8 +568,9 @@ pub mod tenant_conf_defaults {
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
// If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
// layer creation will end immediately. Set to 0 to disable.
pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3;
// layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we
// want to enable this feature.
pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0;
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";

View File

@@ -16,30 +16,6 @@ fn test_node_metadata_v1_backward_compatibilty() {
postgres_port: 23,
http_host: "localhost".to_string(),
http_port: 42,
https_port: None,
other: HashMap::new(),
}
)
}
#[test]
fn test_node_metadata_v2_backward_compatibilty() {
let v2 = serde_json::to_vec(&serde_json::json!({
"host": "localhost",
"port": 23,
"http_host": "localhost",
"http_port": 42,
"https_port": 123,
}));
assert_eq!(
serde_json::from_slice::<NodeMetadata>(&v2.unwrap()).unwrap(),
NodeMetadata {
postgres_host: "localhost".to_string(),
postgres_port: 23,
http_host: "localhost".to_string(),
http_port: 42,
https_port: Some(123),
other: HashMap::new(),
}
)

View File

@@ -182,66 +182,20 @@ pub struct TenantDescribeResponseShard {
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
/// Optionally, callers may specify the node they are migrating _from_, and the server will
/// reject the request if the shard is no longer attached there: this enables writing safer
/// clients that don't risk fighting with some other movement of the shard.
#[serde(default)]
pub origin_node_id: Option<NodeId>,
#[serde(default)]
pub migration_config: MigrationConfig,
pub migration_config: Option<MigrationConfig>,
}
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Serialize, Deserialize, Debug)]
pub struct MigrationConfig {
/// If true, the migration will be executed even if it is to a location with a sub-optimal scheduling
/// score: this is usually not what you want, and if you use this then you'll also need to set the
/// tenant's scheduling policy to Essential or Pause to avoid the optimiser reverting your migration.
///
/// Default: false
#[serde(default)]
pub override_scheduler: bool,
/// If true, the migration will be done gracefully by creating a secondary location first and
/// waiting for it to warm up before cutting over. If false, if there is no existing secondary
/// location at the destination, the tenant will be migrated immediately. If the tenant's data
/// can't be downloaded within [`Self::secondary_warmup_timeout`], then the migration will go
/// ahead but run with a cold cache that can severely reduce performance until it warms up.
///
/// When doing a graceful migration, the migration API returns as soon as it is started.
///
/// Default: true
#[serde(default = "default_prewarm")]
pub prewarm: bool,
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
/// overall for secondary warmup before cutting over
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_warmup_timeout: Option<Duration>,
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
/// within each secondary download poll call to pageserver.
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_download_request_timeout: Option<Duration>,
}
fn default_prewarm() -> bool {
true
}
impl Default for MigrationConfig {
fn default() -> Self {
Self {
override_scheduler: false,
prewarm: default_prewarm(),
secondary_warmup_timeout: None,
secondary_download_request_timeout: None,
}
}
}
#[derive(Serialize, Clone, Debug)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
@@ -489,7 +443,6 @@ pub struct SafekeeperDescribeResponse {
pub host: String,
pub port: i32,
pub http_port: i32,
pub https_port: Option<i32>,
pub availability_zone_id: String,
pub scheduling_policy: SkSchedulingPolicy,
}
@@ -534,43 +487,4 @@ mod test {
err
);
}
/// Check that a minimal migrate request with no config results in the expected default settings
#[test]
fn test_migrate_request_decode_defaults() {
let json = r#"{
"node_id": 123
}"#;
let request: TenantShardMigrateRequest = serde_json::from_str(json).unwrap();
assert_eq!(request.node_id, NodeId(123));
assert_eq!(request.origin_node_id, None);
assert!(!request.migration_config.override_scheduler);
assert!(request.migration_config.prewarm);
assert_eq!(request.migration_config.secondary_warmup_timeout, None);
assert_eq!(
request.migration_config.secondary_download_request_timeout,
None
);
}
/// Check that a partially specified migration config results in the expected default settings
#[test]
fn test_migration_config_decode_defaults() {
// Specify just one field of the config
let json = r#"{
}"#;
let config: MigrationConfig = serde_json::from_str(json).unwrap();
// Check each field's expected default value
assert!(!config.override_scheduler);
assert!(config.prewarm);
assert_eq!(config.secondary_warmup_timeout, None);
assert_eq!(config.secondary_download_request_timeout, None);
assert_eq!(config.secondary_warmup_timeout, None);
// Consistency check that the Default impl agrees with our serde defaults
assert_eq!(MigrationConfig::default(), config);
}
}

View File

@@ -176,39 +176,6 @@ impl LsnLease {
}
}
/// Controls the detach ancestor behavior.
/// - When set to `NoAncestorAndReparent`, we will only detach a branch if its ancestor is a root branch. It will automatically reparent any children of the ancestor before and at the branch point.
/// - When set to `MultiLevelAndNoReparent`, we will detach a branch from multiple levels of ancestors, and no reparenting will happen at all.
#[derive(Debug, Clone, Copy, Default)]
pub enum DetachBehavior {
#[default]
NoAncestorAndReparent,
MultiLevelAndNoReparent,
}
impl std::str::FromStr for DetachBehavior {
type Err = &'static str;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"no_ancestor_and_reparent" => Ok(DetachBehavior::NoAncestorAndReparent),
"multi_level_and_no_reparent" => Ok(DetachBehavior::MultiLevelAndNoReparent),
"v1" => Ok(DetachBehavior::NoAncestorAndReparent),
"v2" => Ok(DetachBehavior::MultiLevelAndNoReparent),
_ => Err("cannot parse detach behavior"),
}
}
}
impl std::fmt::Display for DetachBehavior {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
DetachBehavior::NoAncestorAndReparent => write!(f, "no_ancestor_and_reparent"),
DetachBehavior::MultiLevelAndNoReparent => write!(f, "multi_level_and_no_reparent"),
}
}
}
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
///
/// XXX: We used to have more variants here, but now it's just one, which makes this rather
@@ -307,31 +274,6 @@ pub struct TimelineCreateRequest {
pub mode: TimelineCreateRequestMode,
}
/// Storage controller specific extensions to [`TimelineInfo`].
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateResponseStorcon {
#[serde(flatten)]
pub timeline_info: TimelineInfo,
pub safekeepers: Option<SafekeepersInfo>,
}
/// Safekeepers as returned in timeline creation request to storcon or pushed to
/// cplane in the post migration hook.
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeepersInfo {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub generation: u32,
pub safekeepers: Vec<SafekeeperInfo>,
}
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperInfo {
pub id: NodeId,
pub hostname: String,
}
#[derive(Serialize, Deserialize, Clone)]
#[serde(untagged)]
pub enum TimelineCreateRequestMode {
@@ -1204,15 +1146,6 @@ pub struct TimelineArchivalConfigRequest {
pub state: TimelineArchivalState,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct TimelinePatchIndexPartRequest {
pub rel_size_migration: Option<RelSizeMigration>,
pub gc_compaction_last_completed_lsn: Option<Lsn>,
pub applied_gc_cutoff_lsn: Option<Lsn>,
#[serde(default)]
pub force_index_update: bool,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelinesInfoAndOffloaded {
pub timelines: Vec<TimelineInfo>,
@@ -1258,10 +1191,9 @@ pub struct TimelineInfo {
pub last_record_lsn: Lsn,
pub prev_record_lsn: Option<Lsn>,
/// Legacy field, retained for one version to enable old storage controller to
/// decode (it was a mandatory field).
#[serde(default, rename = "latest_gc_cutoff_lsn")]
pub _unused: Lsn,
/// Legacy field for compat with control plane. Synonym of `min_readable_lsn`.
/// TODO: remove once control plane no longer reads it.
pub latest_gc_cutoff_lsn: Lsn,
/// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
/// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
@@ -1510,14 +1442,8 @@ pub struct TenantScanRemoteStorageResponse {
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "snake_case")]
pub enum TenantSorting {
/// Total size of layers on local disk for all timelines in a shard.
ResidentSize,
/// The logical size of the largest timeline within a _tenant_ (not shard). Only tracked on
/// shard 0, contains the sum across all shards.
MaxLogicalSize,
/// The logical size of the largest timeline within a _tenant_ (not shard), divided by number of
/// shards. Only tracked on shard 0, and estimates the per-shard logical size.
MaxLogicalSizePerShard,
}
impl Default for TenantSorting {
@@ -1547,20 +1473,14 @@ pub struct TopTenantShardsRequest {
pub struct TopTenantShardItem {
pub id: TenantShardId,
/// Total size of layers on local disk for all timelines in this shard.
/// Total size of layers on local disk for all timelines in this tenant
pub resident_size: u64,
/// Total size of layers in remote storage for all timelines in this shard.
/// Total size of layers in remote storage for all timelines in this tenant
pub physical_size: u64,
/// The largest logical size of a timeline within this _tenant_ (not shard). This is only
/// tracked on shard 0, and contains the sum of the logical size across all shards.
/// The largest logical size of a timeline within this tenant
pub max_logical_size: u64,
/// The largest logical size of a timeline within this _tenant_ (not shard) divided by number of
/// shards. This is only tracked on shard 0, and is only an estimate as we divide it evenly by
/// shard count, rounded up.
pub max_logical_size_per_shard: u64,
}
#[derive(Serialize, Deserialize, Debug, Default)]

View File

@@ -112,16 +112,6 @@ impl ShardIdentity {
}
}
/// An unsharded identity with the given stripe size (if non-zero). This is typically used to
/// carry over a stripe size for an unsharded tenant from persistent storage.
pub fn unsharded_with_stripe_size(stripe_size: ShardStripeSize) -> Self {
let mut shard_identity = Self::unsharded();
if stripe_size.0 > 0 {
shard_identity.stripe_size = stripe_size;
}
shard_identity
}
/// A broken instance of this type is only used for `TenantState::Broken` tenants,
/// which are constructed in code paths that don't have access to proper configuration.
///

View File

@@ -396,14 +396,6 @@ pub mod waldecoder {
self.lsn + self.inputbuf.remaining() as u64
}
/// Returns the LSN up to which the WAL decoder has processed.
///
/// If [`Self::poll_decode`] returned a record, then this will return
/// the end LSN of said record.
pub fn lsn(&self) -> Lsn {
self.lsn
}
pub fn feed_bytes(&mut self, buf: &[u8]) {
self.inputbuf.extend_from_slice(buf);
}

View File

@@ -135,8 +135,8 @@ impl Type {
pub enum Kind {
/// A simple type like `VARCHAR` or `INTEGER`.
Simple,
/// An enumerated type.
Enum,
/// An enumerated type along with its variants.
Enum(Vec<String>),
/// A pseudo-type.
Pseudo,
/// An array type along with the type of its elements.
@@ -146,9 +146,9 @@ pub enum Kind {
/// A multirange type along with the type of its elements.
Multirange(Type),
/// A domain type along with its underlying type.
Domain(Oid),
/// A composite type.
Composite(Oid),
Domain(Type),
/// A composite type along with information about its fields.
Composite(Vec<Field>),
}
/// Information about a field of a composite type.

View File

@@ -19,10 +19,10 @@ use crate::config::{Host, SslMode};
use crate::connection::{Request, RequestMessages};
use crate::query::RowStream;
use crate::simple_query::SimpleQueryStream;
use crate::types::{Oid, Type};
use crate::types::{Oid, ToSql, Type};
use crate::{
CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Statement, Transaction,
TransactionBuilder, query, simple_query,
CancelToken, Error, ReadyForQueryStatus, Row, SimpleQueryMessage, Statement, Transaction,
TransactionBuilder, query, simple_query, slice_iter,
};
pub struct Responses {
@@ -54,18 +54,26 @@ impl Responses {
/// A cache of type info and prepared statements for fetching type info
/// (corresponding to the queries in the [crate::prepare] module).
#[derive(Default)]
pub(crate) struct CachedTypeInfo {
struct CachedTypeInfo {
/// A statement for basic information for a type from its
/// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its
/// fallback).
pub(crate) typeinfo: Option<Statement>,
typeinfo: Option<Statement>,
/// A statement for getting information for a composite type from its OID.
/// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY).
typeinfo_composite: Option<Statement>,
/// A statement for getting information for a composite type from its OID.
/// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or
/// its fallback).
typeinfo_enum: Option<Statement>,
/// Cache of types already looked up.
pub(crate) types: HashMap<Oid, Type>,
types: HashMap<Oid, Type>,
}
pub struct InnerClient {
sender: mpsc::UnboundedSender<Request>,
cached_typeinfo: Mutex<CachedTypeInfo>,
/// A buffer to use when writing out postgres commands.
buffer: Mutex<BytesMut>,
@@ -83,6 +91,38 @@ impl InnerClient {
})
}
pub fn typeinfo(&self) -> Option<Statement> {
self.cached_typeinfo.lock().typeinfo.clone()
}
pub fn set_typeinfo(&self, statement: &Statement) {
self.cached_typeinfo.lock().typeinfo = Some(statement.clone());
}
pub fn typeinfo_composite(&self) -> Option<Statement> {
self.cached_typeinfo.lock().typeinfo_composite.clone()
}
pub fn set_typeinfo_composite(&self, statement: &Statement) {
self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone());
}
pub fn typeinfo_enum(&self) -> Option<Statement> {
self.cached_typeinfo.lock().typeinfo_enum.clone()
}
pub fn set_typeinfo_enum(&self, statement: &Statement) {
self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone());
}
pub fn type_(&self, oid: Oid) -> Option<Type> {
self.cached_typeinfo.lock().types.get(&oid).cloned()
}
pub fn set_type(&self, oid: Oid, type_: &Type) {
self.cached_typeinfo.lock().types.insert(oid, type_.clone());
}
/// Call the given function with a buffer to be used when writing out
/// postgres commands.
pub fn with_buf<F, R>(&self, f: F) -> R
@@ -102,6 +142,7 @@ pub struct SocketConfig {
pub host: Host,
pub port: u16,
pub connect_timeout: Option<Duration>,
// pub keepalive: Option<KeepaliveConfig>,
}
/// An asynchronous PostgreSQL client.
@@ -110,7 +151,6 @@ pub struct SocketConfig {
/// through this client object.
pub struct Client {
inner: Arc<InnerClient>,
cached_typeinfo: CachedTypeInfo,
socket_config: SocketConfig,
ssl_mode: SslMode,
@@ -129,9 +169,9 @@ impl Client {
Client {
inner: Arc::new(InnerClient {
sender,
cached_typeinfo: Default::default(),
buffer: Default::default(),
}),
cached_typeinfo: Default::default(),
socket_config,
ssl_mode,
@@ -149,6 +189,55 @@ impl Client {
&self.inner
}
/// Executes a statement, returning a vector of the resulting rows.
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
/// provided, 1-indexed.
///
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
/// with the `prepare` method.
///
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
pub async fn query(
&self,
statement: Statement,
params: &[&(dyn ToSql + Sync)],
) -> Result<Vec<Row>, Error> {
self.query_raw(statement, slice_iter(params))
.await?
.try_collect()
.await
}
/// The maximally flexible version of [`query`].
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
/// provided, 1-indexed.
///
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
/// with the `prepare` method.
///
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
///
/// [`query`]: #method.query
pub async fn query_raw<'a, I>(
&self,
statement: Statement,
params: I,
) -> Result<RowStream, Error>
where
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
I::IntoIter: ExactSizeIterator,
{
query::query(&self.inner, statement, params).await
}
/// Pass text directly to the Postgres backend to allow it to sort out typing itself and
/// to save a roundtrip
pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
@@ -195,14 +284,6 @@ impl Client {
simple_query::batch_execute(self.inner(), query).await
}
pub async fn discard_all(&mut self) -> Result<ReadyForQueryStatus, Error> {
// clear the prepared statements that are about to be nuked from the postgres session
self.cached_typeinfo.typeinfo = None;
self.batch_execute("discard all").await
}
/// Begins a new database transaction.
///
/// The transaction will roll back by default - use the `commit` method to commit it.
@@ -266,8 +347,8 @@ impl Client {
}
/// Query for type information
pub(crate) async fn get_type_inner(&mut self, oid: Oid) -> Result<Type, Error> {
crate::prepare::get_type(&self.inner, &mut self.cached_typeinfo, oid).await
pub async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
crate::prepare::get_type(&self.inner, oid).await
}
/// Determines if the connection to the server has already closed.

View File

@@ -22,7 +22,7 @@ pub trait GenericClient: private::Sealed {
I::IntoIter: ExactSizeIterator + Sync + Send;
/// Query for type information
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error>;
async fn get_type(&self, oid: Oid) -> Result<Type, Error>;
}
impl private::Sealed for Client {}
@@ -38,8 +38,8 @@ impl GenericClient for Client {
}
/// Query for type information
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
self.get_type_inner(oid).await
async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
crate::prepare::get_type(self.inner(), oid).await
}
}
@@ -56,7 +56,7 @@ impl GenericClient for Transaction<'_> {
}
/// Query for type information
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
self.client_mut().get_type(oid).await
async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
self.client().get_type(oid).await
}
}

View File

@@ -9,10 +9,10 @@ use log::debug;
use postgres_protocol2::message::backend::Message;
use postgres_protocol2::message::frontend;
use crate::client::{CachedTypeInfo, InnerClient};
use crate::client::InnerClient;
use crate::codec::FrontendMessage;
use crate::connection::RequestMessages;
use crate::types::{Kind, Oid, Type};
use crate::types::{Field, Kind, Oid, Type};
use crate::{Column, Error, Statement, query, slice_iter};
pub(crate) const TYPEINFO_QUERY: &str = "\
@@ -23,7 +23,23 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
WHERE t.oid = $1
";
async fn prepare_typecheck(
const TYPEINFO_ENUM_QUERY: &str = "\
SELECT enumlabel
FROM pg_catalog.pg_enum
WHERE enumtypid = $1
ORDER BY enumsortorder
";
pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\
SELECT attname, atttypid
FROM pg_catalog.pg_attribute
WHERE attrelid = $1
AND NOT attisdropped
AND attnum > 0
ORDER BY attnum
";
pub async fn prepare(
client: &Arc<InnerClient>,
name: &'static str,
query: &str,
@@ -51,7 +67,7 @@ async fn prepare_typecheck(
let mut parameters = vec![];
let mut it = parameter_description.parameters();
while let Some(oid) = it.next().map_err(Error::parse)? {
let type_ = Type::from_oid(oid).ok_or_else(Error::unexpected_message)?;
let type_ = get_type(client, oid).await?;
parameters.push(type_);
}
@@ -59,7 +75,7 @@ async fn prepare_typecheck(
if let Some(row_description) = row_description {
let mut it = row_description.fields();
while let Some(field) = it.next().map_err(Error::parse)? {
let type_ = Type::from_oid(field.type_oid()).ok_or_else(Error::unexpected_message)?;
let type_ = get_type(client, field.type_oid()).await?;
let column = Column::new(field.name().to_string(), type_, field);
columns.push(column);
}
@@ -68,6 +84,15 @@ async fn prepare_typecheck(
Ok(Statement::new(client, name, parameters, columns))
}
fn prepare_rec<'a>(
client: &'a Arc<InnerClient>,
name: &'static str,
query: &'a str,
types: &'a [Type],
) -> Pin<Box<dyn Future<Output = Result<Statement, Error>> + 'a + Send>> {
Box::pin(prepare(client, name, query, types))
}
fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
if types.is_empty() {
debug!("preparing query {}: {}", name, query);
@@ -83,20 +108,16 @@ fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Resu
})
}
pub async fn get_type(
client: &Arc<InnerClient>,
typecache: &mut CachedTypeInfo,
oid: Oid,
) -> Result<Type, Error> {
pub async fn get_type(client: &Arc<InnerClient>, oid: Oid) -> Result<Type, Error> {
if let Some(type_) = Type::from_oid(oid) {
return Ok(type_);
}
if let Some(type_) = typecache.types.get(&oid) {
return Ok(type_.clone());
};
if let Some(type_) = client.type_(oid) {
return Ok(type_);
}
let stmt = typeinfo_statement(client, typecache).await?;
let stmt = typeinfo_statement(client).await?;
let rows = query::query(client, stmt, slice_iter(&[&oid])).await?;
pin_mut!(rows);
@@ -115,48 +136,100 @@ pub async fn get_type(
let relid: Oid = row.try_get(6)?;
let kind = if type_ == b'e' as i8 {
Kind::Enum
let variants = get_enum_variants(client, oid).await?;
Kind::Enum(variants)
} else if type_ == b'p' as i8 {
Kind::Pseudo
} else if basetype != 0 {
Kind::Domain(basetype)
let type_ = get_type_rec(client, basetype).await?;
Kind::Domain(type_)
} else if elem_oid != 0 {
let type_ = get_type_rec(client, typecache, elem_oid).await?;
let type_ = get_type_rec(client, elem_oid).await?;
Kind::Array(type_)
} else if relid != 0 {
Kind::Composite(relid)
let fields = get_composite_fields(client, relid).await?;
Kind::Composite(fields)
} else if let Some(rngsubtype) = rngsubtype {
let type_ = get_type_rec(client, typecache, rngsubtype).await?;
let type_ = get_type_rec(client, rngsubtype).await?;
Kind::Range(type_)
} else {
Kind::Simple
};
let type_ = Type::new(name, oid, kind, schema);
typecache.types.insert(oid, type_.clone());
client.set_type(oid, &type_);
Ok(type_)
}
fn get_type_rec<'a>(
client: &'a Arc<InnerClient>,
typecache: &'a mut CachedTypeInfo,
oid: Oid,
) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + 'a>> {
Box::pin(get_type(client, typecache, oid))
Box::pin(get_type(client, oid))
}
async fn typeinfo_statement(
client: &Arc<InnerClient>,
typecache: &mut CachedTypeInfo,
) -> Result<Statement, Error> {
if let Some(stmt) = &typecache.typeinfo {
return Ok(stmt.clone());
async fn typeinfo_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
if let Some(stmt) = client.typeinfo() {
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo";
let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY, &[]).await?;
let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?;
typecache.typeinfo = Some(stmt.clone());
client.set_typeinfo(&stmt);
Ok(stmt)
}
async fn get_enum_variants(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<String>, Error> {
let stmt = typeinfo_enum_statement(client).await?;
query::query(client, stmt, slice_iter(&[&oid]))
.await?
.and_then(|row| async move { row.try_get(0) })
.try_collect()
.await
}
async fn typeinfo_enum_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
if let Some(stmt) = client.typeinfo_enum() {
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo_enum";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?;
client.set_typeinfo_enum(&stmt);
Ok(stmt)
}
async fn get_composite_fields(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<Field>, Error> {
let stmt = typeinfo_composite_statement(client).await?;
let rows = query::query(client, stmt, slice_iter(&[&oid]))
.await?
.try_collect::<Vec<_>>()
.await?;
let mut fields = vec![];
for row in rows {
let name = row.try_get(0)?;
let oid = row.try_get(1)?;
let type_ = get_type_rec(client, oid).await?;
fields.push(Field::new(name, type_));
}
Ok(fields)
}
async fn typeinfo_composite_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
if let Some(stmt) = client.typeinfo_composite() {
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo_composite";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
client.set_typeinfo_composite(&stmt);
Ok(stmt)
}

View File

@@ -72,9 +72,4 @@ impl<'a> Transaction<'a> {
pub fn client(&self) -> &Client {
self.client
}
/// Returns a reference to the underlying `Client`.
pub fn client_mut(&mut self) -> &mut Client {
self.client
}
}

View File

@@ -131,14 +131,6 @@ impl Configuration {
}
}
pub fn new(members: MemberSet) -> Self {
Configuration {
generation: INITIAL_GENERATION,
members,
new_members: None,
}
}
/// Is `sk_id` member of the configuration?
pub fn contains(&self, sk_id: NodeId) -> bool {
self.members.contains(sk_id) || self.new_members.as_ref().is_some_and(|m| m.contains(sk_id))

View File

@@ -18,7 +18,7 @@ pub struct SafekeeperStatus {
pub id: NodeId,
}
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
@@ -221,11 +221,6 @@ pub struct TimelineMembershipSwitchResponse {
pub current_conf: Configuration,
}
#[derive(Clone, Copy, Serialize, Deserialize)]
pub struct TimelineDeleteResult {
pub dir_existed: bool,
}
fn lsn_invalid() -> Lsn {
Lsn::INVALID
}
@@ -288,7 +283,7 @@ pub struct SafekeeperUtilization {
}
/// pull_timeline request body.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize)]
pub struct PullTimelineRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,

View File

@@ -21,7 +21,7 @@
//! .with_writer(std::io::stderr);
//!
//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces
//! let otlp_layer = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default()).await;
//! let otlp_layer = tracing_utils::init_tracing("my_application").await;
//!
//! // Put it all together
//! tracing_subscriber::registry()
@@ -38,12 +38,8 @@ pub mod http;
use opentelemetry::KeyValue;
use opentelemetry::trace::TracerProvider;
use opentelemetry_otlp::WithExportConfig;
pub use opentelemetry_otlp::{ExportConfig, Protocol};
use tracing::level_filters::LevelFilter;
use tracing::{Dispatch, Subscriber};
use tracing::Subscriber;
use tracing_subscriber::Layer;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::registry::LookupSpan;
/// Set up OpenTelemetry exporter, using configuration from environment variables.
@@ -73,28 +69,19 @@ use tracing_subscriber::registry::LookupSpan;
///
/// This doesn't block, but is marked as 'async' to hint that this must be called in
/// asynchronous execution context.
pub async fn init_tracing<S>(
service_name: &str,
export_config: ExportConfig,
) -> Option<impl Layer<S>>
pub async fn init_tracing<S>(service_name: &str) -> Option<impl Layer<S>>
where
S: Subscriber + for<'span> LookupSpan<'span>,
{
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
return None;
};
Some(init_tracing_internal(
service_name.to_string(),
export_config,
))
Some(init_tracing_internal(service_name.to_string()))
}
/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing
/// tasks.
pub fn init_tracing_without_runtime<S>(
service_name: &str,
export_config: ExportConfig,
) -> Option<impl Layer<S>>
pub fn init_tracing_without_runtime<S>(service_name: &str) -> Option<impl Layer<S>>
where
S: Subscriber + for<'span> LookupSpan<'span>,
{
@@ -125,22 +112,16 @@ where
));
let _guard = runtime.enter();
Some(init_tracing_internal(
service_name.to_string(),
export_config,
))
Some(init_tracing_internal(service_name.to_string()))
}
fn init_tracing_internal<S>(service_name: String, export_config: ExportConfig) -> impl Layer<S>
fn init_tracing_internal<S>(service_name: String) -> impl Layer<S>
where
S: Subscriber + for<'span> LookupSpan<'span>,
{
// Sets up exporter from the provided [`ExportConfig`] parameter.
// If the endpoint is not specified, it is loaded from the
// OTEL_EXPORTER_OTLP_ENDPOINT environment variable.
// Sets up exporter from the OTEL_EXPORTER_* environment variables.
let exporter = opentelemetry_otlp::SpanExporter::builder()
.with_http()
.with_export_config(export_config)
.build()
.expect("could not initialize opentelemetry exporter");
@@ -170,51 +151,3 @@ where
pub fn shutdown_tracing() {
opentelemetry::global::shutdown_tracer_provider();
}
pub enum OtelEnablement {
Disabled,
Enabled {
service_name: String,
export_config: ExportConfig,
runtime: &'static tokio::runtime::Runtime,
},
}
pub struct OtelGuard {
pub dispatch: Dispatch,
}
impl Drop for OtelGuard {
fn drop(&mut self) {
shutdown_tracing();
}
}
/// Initializes OTEL infrastructure for performance tracing according to the provided configuration
///
/// Performance tracing is handled by a different [`tracing::Subscriber`]. This functions returns
/// an [`OtelGuard`] containing a [`tracing::Dispatch`] associated with a newly created subscriber.
/// Applications should use this dispatch for their performance traces.
///
/// The lifetime of the guard should match taht of the application. On drop, it tears down the
/// OTEL infra.
pub fn init_performance_tracing(otel_enablement: OtelEnablement) -> Option<OtelGuard> {
let otel_subscriber = match otel_enablement {
OtelEnablement::Disabled => None,
OtelEnablement::Enabled {
service_name,
export_config,
runtime,
} => {
let otel_layer = runtime
.block_on(init_tracing(&service_name, export_config))
.with_filter(LevelFilter::INFO);
let otel_subscriber = tracing_subscriber::registry().with(otel_layer);
let otel_dispatch = Dispatch::new(otel_subscriber);
Some(otel_dispatch)
}
};
otel_subscriber.map(|dispatch| OtelGuard { dispatch })
}

View File

@@ -42,7 +42,6 @@ toml_edit = { workspace = true, features = ["serde"] }
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
tracing-utils.workspace = true
rand.workspace = true
scopeguard.workspace = true
strum.workspace = true

View File

@@ -49,13 +49,7 @@ pub fn bench_log_slow(c: &mut Criterion) {
// performance too. Use a simple noop future that yields once, to avoid any scheduler fast
// paths for a ready future.
if enabled {
b.iter(|| {
runtime.block_on(log_slow(
"ready",
THRESHOLD,
std::pin::pin!(tokio::task::yield_now()),
))
});
b.iter(|| runtime.block_on(log_slow("ready", THRESHOLD, tokio::task::yield_now())));
} else {
b.iter(|| runtime.block_on(tokio::task::yield_now()));
}

View File

@@ -165,7 +165,6 @@ pub fn init(
};
log_layer.with_filter(rust_log_env_filter())
});
let r = r.with(
TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
);
@@ -331,90 +330,37 @@ impl std::fmt::Debug for SecretString {
///
/// TODO: consider upgrading this to a warning, but currently it fires too often.
#[inline]
pub async fn log_slow<F, O>(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O
where
F: Future<Output = O>,
{
monitor_slow_future(
threshold,
threshold, // period = threshold
f,
|MonitorSlowFutureCallback {
ready,
is_slow,
elapsed_total,
elapsed_since_last_callback: _,
}| {
if !is_slow {
return;
}
if ready {
info!(
"slow {name} completed after {:.3}s",
elapsed_total.as_secs_f64()
);
} else {
info!(
"slow {name} still running after {:.3}s",
elapsed_total.as_secs_f64()
);
}
},
)
.await
}
pub async fn log_slow<O>(name: &str, threshold: Duration, f: impl Future<Output = O>) -> O {
// TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
// won't fit on the stack.
let mut f = Box::pin(f);
/// Poll future `fut` to completion, invoking callback `cb` at the given `threshold` and every
/// `period` afterwards, and also unconditionally when the future completes.
#[inline]
pub async fn monitor_slow_future<F, O>(
threshold: Duration,
period: Duration,
mut fut: std::pin::Pin<&mut F>,
mut cb: impl FnMut(MonitorSlowFutureCallback),
) -> O
where
F: Future<Output = O>,
{
let started = Instant::now();
let mut attempt = 1;
let mut last_cb = started;
loop {
// NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common
// case where the timeout doesn't fire.
let deadline = started + threshold + (attempt - 1) * period;
// TODO: still call the callback if the future panics? Copy how we do it for the page_service flush_in_progress counter.
let res = tokio::time::timeout_at(deadline, &mut fut).await;
let now = Instant::now();
let elapsed_total = now - started;
cb(MonitorSlowFutureCallback {
ready: res.is_ok(),
is_slow: elapsed_total >= threshold,
elapsed_total,
elapsed_since_last_callback: now - last_cb,
});
last_cb = now;
if let Ok(output) = res {
let deadline = started + attempt * threshold;
if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await {
// NB: we check if we exceeded the threshold even if the timeout never fired, because
// scheduling or execution delays may cause the future to succeed even if it exceeds the
// timeout. This costs an extra unconditional clock reading, but seems worth it to avoid
// false negatives.
let elapsed = started.elapsed();
if elapsed >= threshold {
info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64());
}
return output;
}
let elapsed = started.elapsed().as_secs_f64();
info!("slow {name} still running after {elapsed:.3}s",);
attempt += 1;
}
}
/// See [`monitor_slow_future`].
pub struct MonitorSlowFutureCallback {
/// Whether the future completed. If true, there will be no more callbacks.
pub ready: bool,
/// Whether the future is taking `>=` the specififed threshold duration to complete.
/// Monotonic: if true in one callback invocation, true in all subsequent onces.
pub is_slow: bool,
/// The time elapsed since the [`monitor_slow_future`] was first polled.
pub elapsed_total: Duration,
/// The time elapsed since the last callback invocation.
/// For the initial callback invocation, the time elapsed since the [`monitor_slow_future`] was first polled.
pub elapsed_since_last_callback: Duration,
}
#[cfg(test)]
mod tests {
use metrics::IntCounterVec;

View File

@@ -48,7 +48,6 @@ pprof.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true
rustls.workspace = true
scopeguard.workspace = true
send-future.workspace = true
serde.workspace = true
@@ -63,12 +62,10 @@ tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util"
tokio-epoll-uring.workspace = true
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
tokio-rustls.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = [ "serde" ] }
tracing.workspace = true
tracing-utils.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true
@@ -101,7 +98,6 @@ criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
indoc.workspace = true
uuid.workspace = true
[[bench]]
name = "bench_layer_map"
@@ -119,10 +115,6 @@ harness = false
name = "upload_queue"
harness = false
[[bench]]
name = "bench_metrics"
harness = false
[[bin]]
name = "test_helper_slow_client_reads"
required-features = [ "testing" ]

View File

@@ -1,366 +0,0 @@
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use utils::id::{TenantId, TimelineId};
//
// Demonstrates that repeat label values lookup is a multicore scalability bottleneck
// that is worth avoiding.
//
criterion_group!(
label_values,
label_values::bench_naive_usage,
label_values::bench_cache_label_values_lookup
);
mod label_values {
use super::*;
pub fn bench_naive_usage(c: &mut Criterion) {
let mut g = c.benchmark_group("label_values__naive_usage");
for ntimelines in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("ntimelines", ntimelines),
&ntimelines,
|b, ntimelines| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*ntimelines + 1);
let timelines = (0..*ntimelines)
.map(|_| {
(
TenantId::generate().to_string(),
"0000".to_string(),
TimelineId::generate().to_string(),
)
})
.collect::<Vec<_>>();
let metric_vec = metrics::UIntGaugeVec::new(
metrics::opts!("testmetric", "testhelp"),
&["tenant_id", "shard_id", "timeline_id"],
)
.unwrap();
std::thread::scope(|s| {
for (tenant_id, shard_id, timeline_id) in &timelines {
s.spawn(|| {
barrier.wait();
for _ in 0..iters {
metric_vec
.with_label_values(&[tenant_id, shard_id, timeline_id])
.inc();
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
pub fn bench_cache_label_values_lookup(c: &mut Criterion) {
let mut g = c.benchmark_group("label_values__cache_label_values_lookup");
for ntimelines in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("ntimelines", ntimelines),
&ntimelines,
|b, ntimelines| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*ntimelines + 1);
let timelines = (0..*ntimelines)
.map(|_| {
(
TenantId::generate().to_string(),
"0000".to_string(),
TimelineId::generate().to_string(),
)
})
.collect::<Vec<_>>();
let metric_vec = metrics::UIntGaugeVec::new(
metrics::opts!("testmetric", "testhelp"),
&["tenant_id", "shard_id", "timeline_id"],
)
.unwrap();
std::thread::scope(|s| {
for (tenant_id, shard_id, timeline_id) in &timelines {
s.spawn(|| {
let metric = metric_vec.with_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
barrier.wait();
for _ in 0..iters {
metric.inc();
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
}
//
// Demonstrates that even a single metric can be a scalability bottleneck
// if multiple threads in it concurrently but there's nothing we can do
// about it without changing the metrics framework to use e.g. sharded counte atomics.
//
criterion_group!(
single_metric_multicore_scalability,
single_metric_multicore_scalability::bench,
);
mod single_metric_multicore_scalability {
use super::*;
pub fn bench(c: &mut Criterion) {
let mut g = c.benchmark_group("single_metric_multicore_scalability");
for nthreads in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("nthreads", nthreads),
&nthreads,
|b, nthreads| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*nthreads + 1);
let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
std::thread::scope(|s| {
for _ in 0..*nthreads {
s.spawn(|| {
barrier.wait();
for _ in 0..iters {
metric.inc();
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
}
//
// Demonstrates that even if we cache label value, the propagation of such a cached metric value
// by Clone'ing it is a scalability bottleneck.
// The reason is that it's an Arc internally and thus there's contention on the reference count atomics.
//
// We can avoid that by having long-lived references per thread (= indirection).
//
criterion_group!(
propagation_of_cached_label_value,
propagation_of_cached_label_value::bench_naive,
propagation_of_cached_label_value::bench_long_lived_reference_per_thread,
);
mod propagation_of_cached_label_value {
use std::sync::Arc;
use super::*;
pub fn bench_naive(c: &mut Criterion) {
let mut g = c.benchmark_group("propagation_of_cached_label_value__naive");
for nthreads in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("nthreads", nthreads),
&nthreads,
|b, nthreads| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*nthreads + 1);
let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
std::thread::scope(|s| {
for _ in 0..*nthreads {
s.spawn(|| {
barrier.wait();
for _ in 0..iters {
// propagating the metric means we'd clone it into the child RequestContext
let propagated = metric.clone();
// simulate some work
criterion::black_box(propagated);
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
pub fn bench_long_lived_reference_per_thread(c: &mut Criterion) {
let mut g =
c.benchmark_group("propagation_of_cached_label_value__long_lived_reference_per_thread");
for nthreads in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("nthreads", nthreads),
&nthreads,
|b, nthreads| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*nthreads + 1);
let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
std::thread::scope(|s| {
for _ in 0..*nthreads {
s.spawn(|| {
// This is the technique.
let this_threads_metric_reference = Arc::new(metric.clone());
barrier.wait();
for _ in 0..iters {
// propagating the metric means we'd clone it into the child RequestContext
let propagated = Arc::clone(&this_threads_metric_reference);
// simulate some work (include the pointer chase!)
criterion::black_box(&*propagated);
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
}
}
criterion_main!(
label_values,
single_metric_multicore_scalability,
propagation_of_cached_label_value
);
/*
RUST_BACKTRACE=full cargo bench --bench bench_metrics -- --discard-baseline --noplot
Results on an im4gn.2xlarge instance
label_values__naive_usage/ntimelines/1 time: [178.71 ns 178.74 ns 178.76 ns]
label_values__naive_usage/ntimelines/4 time: [532.94 ns 539.59 ns 546.31 ns]
label_values__naive_usage/ntimelines/8 time: [1.1082 µs 1.1109 µs 1.1135 µs]
label_values__cache_label_values_lookup/ntimelines/1 time: [6.4116 ns 6.4119 ns 6.4123 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [6.3482 ns 6.3819 ns 6.4079 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [6.4213 ns 6.5279 ns 6.6293 ns]
single_metric_multicore_scalability/nthreads/1 time: [6.0102 ns 6.0104 ns 6.0106 ns]
single_metric_multicore_scalability/nthreads/4 time: [38.127 ns 38.275 ns 38.416 ns]
single_metric_multicore_scalability/nthreads/8 time: [73.698 ns 74.882 ns 75.864 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [14.424 ns 14.425 ns 14.426 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [100.71 ns 102.53 ns 104.35 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [211.50 ns 214.44 ns 216.87 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.135 ns 14.147 ns 14.160 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.243 ns 14.255 ns 14.268 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [14.470 ns 14.682 ns 14.895 ns]
Results on an i3en.3xlarge instance
label_values__naive_usage/ntimelines/1 time: [117.32 ns 117.53 ns 117.74 ns]
label_values__naive_usage/ntimelines/4 time: [736.58 ns 741.12 ns 745.61 ns]
label_values__naive_usage/ntimelines/8 time: [1.4513 µs 1.4596 µs 1.4665 µs]
label_values__cache_label_values_lookup/ntimelines/1 time: [8.0964 ns 8.0979 ns 8.0995 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [8.1620 ns 8.2912 ns 8.4491 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [14.148 ns 14.237 ns 14.324 ns]
single_metric_multicore_scalability/nthreads/1 time: [8.0993 ns 8.1013 ns 8.1046 ns]
single_metric_multicore_scalability/nthreads/4 time: [80.039 ns 80.672 ns 81.297 ns]
single_metric_multicore_scalability/nthreads/8 time: [153.58 ns 154.23 ns 154.90 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [13.924 ns 13.926 ns 13.928 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [143.66 ns 145.27 ns 146.59 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [296.51 ns 297.90 ns 299.30 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.013 ns 14.149 ns 14.308 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.311 ns 14.625 ns 14.984 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [25.981 ns 26.227 ns 26.476 ns]
Results on an Standard L16s v3 (16 vcpus, 128 GiB memory) Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
label_values__naive_usage/ntimelines/1 time: [101.63 ns 101.84 ns 102.06 ns]
label_values__naive_usage/ntimelines/4 time: [417.55 ns 424.73 ns 432.63 ns]
label_values__naive_usage/ntimelines/8 time: [874.91 ns 889.51 ns 904.25 ns]
label_values__cache_label_values_lookup/ntimelines/1 time: [5.7724 ns 5.7760 ns 5.7804 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [7.8878 ns 7.9401 ns 8.0034 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [7.2621 ns 7.6354 ns 8.0337 ns]
single_metric_multicore_scalability/nthreads/1 time: [5.7710 ns 5.7744 ns 5.7785 ns]
single_metric_multicore_scalability/nthreads/4 time: [66.629 ns 66.994 ns 67.336 ns]
single_metric_multicore_scalability/nthreads/8 time: [130.85 ns 131.98 ns 132.91 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [11.540 ns 11.546 ns 11.553 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [131.22 ns 131.90 ns 132.56 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [260.99 ns 262.75 ns 264.26 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [11.544 ns 11.550 ns 11.557 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [11.568 ns 11.642 ns 11.763 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [13.416 ns 14.121 ns 14.886 ns
Results on an M4 MAX MacBook Pro Total Number of Cores: 14 (10 performance and 4 efficiency)
label_values__naive_usage/ntimelines/1 time: [52.711 ns 53.026 ns 53.381 ns]
label_values__naive_usage/ntimelines/4 time: [323.99 ns 330.40 ns 337.53 ns]
label_values__naive_usage/ntimelines/8 time: [1.1615 µs 1.1998 µs 1.2399 µs]
label_values__cache_label_values_lookup/ntimelines/1 time: [1.6635 ns 1.6715 ns 1.6809 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [1.7786 ns 1.7876 ns 1.8028 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [1.8195 ns 1.8371 ns 1.8665 ns]
single_metric_multicore_scalability/nthreads/1 time: [1.7764 ns 1.7909 ns 1.8079 ns]
single_metric_multicore_scalability/nthreads/4 time: [33.875 ns 34.868 ns 35.923 ns]
single_metric_multicore_scalability/nthreads/8 time: [226.85 ns 235.30 ns 244.18 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [3.4337 ns 3.4491 ns 3.4660 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [69.486 ns 71.937 ns 74.472 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [434.87 ns 456.47 ns 477.84 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [3.3767 ns 3.3974 ns 3.4220 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [3.6105 ns 4.2355 ns 5.1463 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [4.0889 ns 4.9714 ns 6.0779 ns]
Results on a Hetzner AX102 AMD Ryzen 9 7950X3D 16-Core Processor
label_values__naive_usage/ntimelines/1 time: [64.510 ns 64.559 ns 64.610 ns]
label_values__naive_usage/ntimelines/4 time: [309.71 ns 326.09 ns 342.32 ns]
label_values__naive_usage/ntimelines/8 time: [776.92 ns 819.35 ns 856.93 ns]
label_values__cache_label_values_lookup/ntimelines/1 time: [1.2855 ns 1.2943 ns 1.3021 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [1.3865 ns 1.4139 ns 1.4441 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [1.5311 ns 1.5669 ns 1.6046 ns]
single_metric_multicore_scalability/nthreads/1 time: [1.1927 ns 1.1981 ns 1.2049 ns]
single_metric_multicore_scalability/nthreads/4 time: [24.346 ns 25.439 ns 26.634 ns]
single_metric_multicore_scalability/nthreads/8 time: [58.666 ns 60.137 ns 61.486 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [2.7067 ns 2.7238 ns 2.7402 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [62.723 ns 66.214 ns 69.787 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [164.24 ns 170.10 ns 175.68 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [2.2915 ns 2.2960 ns 2.3012 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [2.5726 ns 2.6158 ns 2.6624 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [2.7068 ns 2.8243 ns 2.9824 ns]
*/

View File

@@ -7,7 +7,7 @@ use http_utils::error::HttpErrorBody;
use pageserver_api::models::*;
use pageserver_api::shard::TenantShardId;
pub use reqwest::Body as ReqwestBody;
use reqwest::{Certificate, IntoUrl, Method, StatusCode, Url};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -38,9 +38,6 @@ pub enum Error {
#[error("Cancelled")]
Cancelled,
#[error("create client: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
CreateClient(reqwest::Error),
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -72,17 +69,8 @@ pub enum ForceAwaitLogicalSize {
}
impl Client {
pub fn new(
mgmt_api_endpoint: String,
jwt: Option<&str>,
ssl_ca_cert: Option<Certificate>,
) -> Result<Self> {
let mut http_client = reqwest::Client::builder();
if let Some(ssl_ca_cert) = ssl_ca_cert {
http_client = http_client.add_root_certificate(ssl_ca_cert);
}
let http_client = http_client.build().map_err(Error::CreateClient)?;
Ok(Self::from_client(http_client, mgmt_api_endpoint, jwt))
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
}
pub fn from_client(
@@ -113,10 +101,12 @@ impl Client {
debug_assert!(path.starts_with('/'));
let uri = format!("{}{}", self.mgmt_api_endpoint, path);
let mut req = self.client.request(Method::GET, uri);
if let Some(value) = &self.authorization_header {
req = req.header(reqwest::header::AUTHORIZATION, value);
}
let req = self.client.request(Method::GET, uri);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
req.send().await.map_err(Error::ReceiveBody)
}
@@ -458,21 +448,13 @@ impl Client {
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
behavior: Option<DetachBehavior>,
) -> Result<AncestorDetached> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
self.mgmt_api_endpoint
);
let mut uri = Url::parse(&uri)
.map_err(|e| Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")))?;
if let Some(behavior) = behavior {
uri.query_pairs_mut()
.append_pair("detach_behavior", &behavior.to_string());
}
self.request(Method::PUT, uri, ())
self.request(Method::PUT, &uri, ())
.await?
.json()
.await

View File

@@ -12,7 +12,7 @@ pub(crate) fn setup_logging() {
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)
.expect("Failed to init test logging");
.expect("Failed to init test logging")
});
}

View File

@@ -15,7 +15,6 @@ hdrhistogram.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
rand.workspace = true
reqwest.workspace=true
serde.workspace = true
serde_json.workspace = true
tracing.workspace = true

View File

@@ -36,8 +36,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
None, // TODO: support ssl_ca_file for https APIs in pagebench.
)?);
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(

View File

@@ -77,8 +77,7 @@ async fn main_impl(
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
None, // TODO: support ssl_ca_file for https APIs in pagebench.
)?);
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(

View File

@@ -125,8 +125,7 @@ async fn main_impl(
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
None, // TODO: support ssl_ca_file for https APIs in pagebench.
)?);
));
if let Some(engine_str) = &args.set_io_engine {
mgmt_api_client.put_io_engine(engine_str).await?;

Some files were not shown because too many files have changed in this diff Show More