Compare commits

..

3 Commits

Author SHA1 Message Date
Jan Christian Grünhage
4e3bdc5984 nomerge: simulate run-kind=compute-rc-pr 2025-02-28 11:25:20 +01:00
Alexey Masterov
04a33e8af0 Change TEST_EXTENSIONS_TAG variable 2025-02-28 11:13:15 +01:00
Alexey Masterov
71f0235600 TEst extensions upgrade should work correctly if some neon images of compute tag are not accessible 2025-02-28 11:09:06 +01:00
300 changed files with 4458 additions and 13319 deletions

View File

@@ -0,0 +1,21 @@
## Release 202Y-MM-DD
**NB: this PR must be merged only by 'Create a merge commit'!**
### Checklist when preparing for release
- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b)
- [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers?
- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?
<!-- List everything that should be done **before** release, any issues / setting changes / etc -->
### Checklist after release
- [ ] Make sure instructions from PRs included in this release and labeled `manual_release_instructions` are executed (either by you or by people who wrote them).
- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files))
- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1)
- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time)
<!-- List everything that should be done **after** release, any admin UI configuration / Grafana dashboard / alert changes / setting changes / etc -->

View File

@@ -32,6 +32,3 @@ config-variables:
- NEON_DEV_AWS_ACCOUNT_ID
- NEON_PROD_AWS_ACCOUNT_ID
- AWS_ECR_REGION
- BENCHMARK_LARGE_OLTP_PROJECTID
- SLACK_ON_CALL_DEVPROD_STREAM
- SLACK_RUST_CHANNEL_ID

View File

@@ -84,13 +84,7 @@ runs:
--header "Authorization: Bearer ${API_KEY}"
)
role_name=$(echo "$roles" | jq --raw-output '
(.roles | map(select(.protected == false))) as $roles |
if any($roles[]; .name == "neondb_owner")
then "neondb_owner"
else $roles[0].name
end
')
role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
@@ -113,13 +107,13 @@ runs:
)
if [ -z "${reset_password}" ]; then
sleep $i
sleep 1
continue
fi
password=$(echo $reset_password | jq --raw-output '.role.password')
if [ "${password}" == "null" ]; then
sleep $i # increasing backoff
sleep 1
continue
fi

View File

@@ -44,11 +44,6 @@ inputs:
description: 'Postgres version to use for tests'
required: false
default: 'v16'
sanitizers:
description: 'enabled or disabled'
required: false
default: 'disabled'
type: string
benchmark_durations:
description: 'benchmark durations JSON'
required: false
@@ -64,7 +59,7 @@ runs:
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}
@@ -117,7 +112,6 @@ runs:
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FAILED: ${{ inputs.rerun_failed }}
PG_VERSION: ${{ inputs.pg_version }}
SANITIZERS: ${{ inputs.sanitizers }}
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report

View File

@@ -1,16 +1,14 @@
import itertools
import json
import os
import sys
source_tag = os.getenv("SOURCE_TAG")
target_tag = os.getenv("TARGET_TAG")
branch = os.getenv("BRANCH")
dev_acr = os.getenv("DEV_ACR")
prod_acr = os.getenv("PROD_ACR")
dev_aws = os.getenv("DEV_AWS")
prod_aws = os.getenv("PROD_AWS")
aws_region = os.getenv("AWS_REGION")
build_tag = os.environ["BUILD_TAG"]
branch = os.environ["BRANCH"]
dev_acr = os.environ["DEV_ACR"]
prod_acr = os.environ["PROD_ACR"]
dev_aws = os.environ["DEV_AWS"]
prod_aws = os.environ["PROD_AWS"]
aws_region = os.environ["AWS_REGION"]
components = {
"neon": ["neon"],
@@ -41,23 +39,24 @@ registries = {
outputs: dict[str, dict[str, list[str]]] = {}
target_tags = [target_tag, "latest"] if branch == "main" else [target_tag]
target_stages = (
["dev", "prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"]
)
target_tags = [build_tag, "latest"] if branch == "main" else [build_tag]
target_stages = ["dev", "prod"] if branch.startswith("release") else ["dev"]
for component_name, component_images in components.items():
for stage in target_stages:
outputs[f"{component_name}-{stage}"] = {
f"docker.io/neondatabase/{component_image}:{source_tag}": [
f"{registry}/{component_image}:{tag}"
for registry, tag in itertools.product(registries[stage], target_tags)
if not (registry == "docker.io/neondatabase" and tag == source_tag)
outputs[f"{component_name}-{stage}"] = dict(
[
(
f"docker.io/neondatabase/{component_image}:{build_tag}",
[
f"{combo[0]}/{component_image}:{combo[1]}"
for combo in itertools.product(registries[stage], target_tags)
],
)
for component_image in component_images
]
for component_image in component_images
}
)
with open(os.getenv("GITHUB_OUTPUT", "/dev/null"), "a") as f:
with open(os.environ["GITHUB_OUTPUT"], "a") as f:
for key, value in outputs.items():
f.write(f"{key}={json.dumps(value)}\n")
print(f"Image map for {key}:\n{json.dumps(value, indent=2)}\n\n", file=sys.stderr)

View File

@@ -1,110 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
DOCS_URL="https://docs.neon.build/overview/repositories/neon.html"
message() {
if [[ -n "${GITHUB_PR_NUMBER:-}" ]]; then
gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --edit-last --body "$1" \
|| gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --body "$1"
fi
echo "$1"
}
report_error() {
message "$1
For more details, see the documentation: ${DOCS_URL}"
exit 1
}
case "$RELEASE_BRANCH" in
"release") COMPONENT="Storage" ;;
"release-proxy") COMPONENT="Proxy" ;;
"release-compute") COMPONENT="Compute" ;;
*)
report_error "Unknown release branch: ${RELEASE_BRANCH}"
;;
esac
# Identify main and release branches
MAIN_BRANCH="origin/main"
REMOTE_RELEASE_BRANCH="origin/${RELEASE_BRANCH}"
# Find merge base
MERGE_BASE=$(git merge-base "${MAIN_BRANCH}" "${REMOTE_RELEASE_BRANCH}")
echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}"
# Get the HEAD commit (last commit in PR, expected to be the merge commit)
LAST_COMMIT=$(git rev-parse HEAD)
MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}")
EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$"
if ! [[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then
report_error "Merge commit message does not match expected pattern: '<component> release YYYY-MM-DD'
Expected component: ${COMPONENT}
Found: '${MERGE_COMMIT_MESSAGE}'"
fi
echo "✅ Merge commit message is correctly formatted: '${MERGE_COMMIT_MESSAGE}'"
LAST_COMMIT_PARENTS=$(git cat-file -p "${LAST_COMMIT}" | jq -sR '[capture("parent (?<parent>[0-9a-f]{40})"; "g") | .parent]')
if [[ "$(echo "${LAST_COMMIT_PARENTS}" | jq 'length')" -ne 2 ]]; then
report_error "Last commit must be a merge commit with exactly two parents"
fi
EXPECTED_RELEASE_HEAD=$(git rev-parse "${REMOTE_RELEASE_BRANCH}")
if echo "${LAST_COMMIT_PARENTS}" | jq -e --arg rel "${EXPECTED_RELEASE_HEAD}" 'index($rel) != null' > /dev/null; then
LINEAR_HEAD=$(echo "${LAST_COMMIT_PARENTS}" | jq -r '[.[] | select(. != $rel)][0]' --arg rel "${EXPECTED_RELEASE_HEAD}")
else
report_error "Last commit must merge the release branch (${RELEASE_BRANCH})"
fi
echo "✅ Last commit correctly merges the previous commit and the release branch"
echo "Top commit of linear history: ${LINEAR_HEAD}"
MERGE_COMMIT_TREE=$(git rev-parse "${LAST_COMMIT}^{tree}")
LINEAR_HEAD_TREE=$(git rev-parse "${LINEAR_HEAD}^{tree}")
if [[ "${MERGE_COMMIT_TREE}" != "${LINEAR_HEAD_TREE}" ]]; then
report_error "Tree of merge commit (${MERGE_COMMIT_TREE}) does not match tree of linear history head (${LINEAR_HEAD_TREE})
This indicates that the merge of ${RELEASE_BRANCH} into this branch was not performed using the merge strategy 'ours'"
fi
echo "✅ Merge commit tree matches the linear history head"
EXPECTED_PREVIOUS_COMMIT="${LINEAR_HEAD}"
# Now traverse down the history, ensuring each commit has exactly one parent
CURRENT_COMMIT="${EXPECTED_PREVIOUS_COMMIT}"
while [[ "${CURRENT_COMMIT}" != "${MERGE_BASE}" && "${CURRENT_COMMIT}" != "${EXPECTED_RELEASE_HEAD}" ]]; do
CURRENT_COMMIT_PARENTS=$(git cat-file -p "${CURRENT_COMMIT}" | jq -sR '[capture("parent (?<parent>[0-9a-f]{40})"; "g") | .parent]')
if [[ "$(echo "${CURRENT_COMMIT_PARENTS}" | jq 'length')" -ne 1 ]]; then
report_error "Commit ${CURRENT_COMMIT} must have exactly one parent"
fi
NEXT_COMMIT=$(echo "${CURRENT_COMMIT_PARENTS}" | jq -r '.[0]')
if [[ "${NEXT_COMMIT}" == "${MERGE_BASE}" ]]; then
echo "✅ Reached merge base (${MERGE_BASE})"
PR_BASE="${MERGE_BASE}"
elif [[ "${NEXT_COMMIT}" == "${EXPECTED_RELEASE_HEAD}" ]]; then
echo "✅ Reached release branch (${EXPECTED_RELEASE_HEAD})"
PR_BASE="${EXPECTED_RELEASE_HEAD}"
elif [[ -z "${NEXT_COMMIT}" ]]; then
report_error "Unexpected end of commit history before reaching merge base"
fi
# Move to the next commit in the chain
CURRENT_COMMIT="${NEXT_COMMIT}"
done
echo "✅ All commits are properly ordered and linear"
echo "✅ Release PR structure is valid"
echo
message "Commits that are part of this release:
$(git log --oneline "${PR_BASE}..${LINEAR_HEAD}")"

View File

@@ -17,12 +17,6 @@
({};
.[$entry.component] |= (if . == null or $entry.version > .version then $entry else . end))
# Ensure that each component exists, or fail
| (["storage", "compute", "proxy"] - (keys)) as $missing
| if ($missing | length) > 0 then
"Error: Found no release for \($missing | join(", "))!\n" | halt_error(1)
else . end
# Convert the resulting object into an array of formatted strings
| to_entries
| map("\(.key)=\(.value.full)")

View File

@@ -280,7 +280,7 @@ jobs:
- name: Upload Neon artifact
uses: ./.github/actions/upload
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
@@ -347,7 +347,6 @@ jobs:
real_s3_region: eu-central-1
rerun_failed: true
pg_version: ${{ matrix.pg_version }}
sanitizers: ${{ inputs.sanitizers }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
# `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
# Attempt to stop tests gracefully to generate test reports
@@ -360,6 +359,7 @@ jobs:
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
SANITIZERS: ${{ inputs.sanitizers }}
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540

View File

@@ -7,8 +7,8 @@ on:
description: 'Component name'
required: true
type: string
source-branch:
description: 'Source branch'
release-branch:
description: 'Release branch'
required: true
type: string
secrets:
@@ -30,25 +30,17 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.source-branch }}
fetch-depth: 0
ref: main
- name: Set variables
id: vars
env:
COMPONENT_NAME: ${{ inputs.component-name }}
RELEASE_BRANCH: >-
${{
false
|| inputs.component-name == 'Storage' && 'release'
|| inputs.component-name == 'Proxy' && 'release-proxy'
|| inputs.component-name == 'Compute' && 'release-compute'
}}
RELEASE_BRANCH: ${{ inputs.release-branch }}
run: |
today=$(date +'%Y-%m-%d')
echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT}
echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT}
echo "release-branch=${RELEASE_BRANCH}" | tee -a ${GITHUB_OUTPUT}
- name: Configure git
run: |
@@ -57,36 +49,31 @@ jobs:
- name: Create RC branch
env:
RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
TITLE: ${{ steps.vars.outputs.title }}
run: |
git switch -c "${RC_BRANCH}"
git checkout -b "${RC_BRANCH}"
# Manually create a merge commit on the current branch, keeping the
# tree and setting the parents to the current HEAD and the HEAD of the
# release branch. This commit is what we'll fast-forward the release
# branch to when merging the release branch.
# For details on why, look at
# https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs
current_tree=$(git rev-parse 'HEAD^{tree}')
release_head=$(git rev-parse "origin/${RELEASE_BRANCH}")
current_head=$(git rev-parse HEAD)
merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}")
# Fast-forward the current branch to the newly created merge_commit
git merge --ff-only ${merge_commit}
# create an empty commit to distinguish workflow runs
# from other possible releases from the same commit
git commit --allow-empty -m "${TITLE}"
git push origin "${RC_BRANCH}"
- name: Create a PR into ${{ steps.vars.outputs.release-branch }}
- name: Create a PR into ${{ inputs.release-branch }}
env:
GH_TOKEN: ${{ secrets.ci-access-token }}
RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
RELEASE_BRANCH: ${{ inputs.release-branch }}
TITLE: ${{ steps.vars.outputs.title }}
run: |
cat << EOF > body.md
## ${TITLE}
**Please merge this Pull Request using 'Create a merge commit' button**
EOF
gh pr create --title "${TITLE}" \
--body "" \
--body-file "body.md" \
--head "${RC_BRANCH}" \
--base "${RELEASE_BRANCH}"

View File

@@ -19,18 +19,11 @@ on:
description: "Tag of the last compute release"
value: ${{ jobs.tags.outputs.compute }}
run-kind:
description: "The kind of run we're currently in. Will be one of `push-main`, `storage-release`, `compute-release`, `proxy-release`, `storage-rc-pr`, `compute-rc-pr`, `proxy-rc-pr`, `pr`, or `workflow-dispatch`"
description: "The kind of run we're currently in. Will be one of `pr`, `push-main`, `storage-rc`, `storage-release`, `proxy-rc`, `proxy-release`, `compute-rc`, `compute-release` or `merge_queue`"
value: ${{ jobs.tags.outputs.run-kind }}
release-pr-run-id:
description: "Only available if `run-kind in [storage-release, proxy-release, compute-release]`. Contains the run ID of the `Build and Test` workflow, assuming one with the current commit can be found."
value: ${{ jobs.tags.outputs.release-pr-run-id }}
permissions: {}
defaults:
run:
shell: bash -euo pipefail {0}
jobs:
tags:
runs-on: ubuntu-22.04
@@ -40,7 +33,6 @@ jobs:
proxy: ${{ steps.previous-releases.outputs.proxy }}
storage: ${{ steps.previous-releases.outputs.storage }}
run-kind: ${{ steps.run-kind.outputs.run-kind }}
release-pr-run-id: ${{ steps.release-pr-run-id.outputs.release-pr-run-id }}
permissions:
contents: read
steps:
@@ -54,7 +46,7 @@ jobs:
env:
RUN_KIND: >-
${{
false
'compute-rc-pr'
|| (inputs.github-event-name == 'push' && github.ref_name == 'main') && 'push-main'
|| (inputs.github-event-name == 'push' && github.ref_name == 'release') && 'storage-release'
|| (inputs.github-event-name == 'push' && github.ref_name == 'release-compute') && 'compute-release'
@@ -63,7 +55,6 @@ jobs:
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-compute') && 'compute-rc-pr'
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr'
|| (inputs.github-event-name == 'pull_request') && 'pr'
|| (inputs.github-event-name == 'workflow_dispatch') && 'workflow-dispatch'
|| 'unknown'
}}
run: |
@@ -91,16 +82,9 @@ jobs:
echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
;;
pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr)
BUILD_AND_TEST_RUN_ID=$(gh api --paginate \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=${CURRENT_SHA}&branch=${CURRENT_BRANCH}" \
| jq '[.workflow_runs[] | select(.name == "Build and Test")][0].id // ("Error: No matching workflow run found." | halt_error(1))')
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
;;
workflow-dispatch)
echo "tag=$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT
;;
*)
echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!"
exit 1
@@ -117,13 +101,3 @@ jobs:
"/repos/${GITHUB_REPOSITORY}/releases" \
| jq -f .github/scripts/previous-releases.jq -r \
| tee -a "${GITHUB_OUTPUT}"
- name: Get the release PR run ID
id: release-pr-run-id
if: ${{ contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }}
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy)|(compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Falied to find Build and Test run from RC PR!" | halt_error(1))')
echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT

View File

@@ -141,8 +141,6 @@ jobs:
--ignore test_runner/performance/test_physical_replication.py
--ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py
--ignore test_runner/performance/test_cumulative_statistics_persistence.py
--ignore test_runner/performance/test_perf_many_relations.py
--ignore test_runner/performance/test_perf_oltp_large_tenant.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"

View File

@@ -476,7 +476,7 @@ jobs:
(
!github.event.pull_request.draft
|| contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft')
|| needs.meta.outputs.run-kind == 'push-main'
|| contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind)
) && !failure() && !cancelled()
}}
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ]
@@ -487,7 +487,7 @@ jobs:
neon-image-arch:
needs: [ check-permissions, build-build-tools-image, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
strategy:
matrix:
arch: [ x64, arm64 ]
@@ -537,7 +537,7 @@ jobs:
neon-image:
needs: [ neon-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ubuntu-22.04
permissions:
id-token: write # aws-actions/configure-aws-credentials
@@ -559,7 +559,7 @@ jobs:
compute-node-image-arch:
needs: [ check-permissions, build-build-tools-image, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -651,7 +651,7 @@ jobs:
compute-node-image:
needs: [ compute-node-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -692,15 +692,15 @@ jobs:
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
vm-compute-node-image-arch:
vm-compute-node-image:
needs: [ check-permissions, meta, compute-node-image ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: [ self-hosted, large ]
strategy:
fail-fast: false
matrix:
arch: [ amd64, arm64 ]
version:
# see the comment for `compute-node-image-arch` job
- pg: v14
debian: bullseye
- pg: v15
@@ -717,7 +717,7 @@ jobs:
- name: Downloading vm-builder
run: |
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-amd64 -o vm-builder
chmod +x vm-builder
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
@@ -738,47 +738,17 @@ jobs:
-size=2G \
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \
-target-arch=linux/${{ matrix.arch }}
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-target-arch=linux/amd64
- name: Pushing vm-compute-node image
run: |
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }}
vm-compute-node-image:
needs: [ vm-compute-node-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ubuntu-22.04
strategy:
matrix:
version:
# see the comment for `compute-node-image-arch` job
- pg: v14
- pg: v15
- pg: v16
- pg: v17
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch compute-node image
run: |
docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}
test-images:
needs: [ check-permissions, meta, neon-image, compute-node-image ]
# Depends on jobs that can get skipped
if: >-
${{
!failure()
&& !cancelled()
&& contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)
}}
if: "!failure() && !cancelled()"
strategy:
fail-fast: false
matrix:
@@ -805,7 +775,7 @@ jobs:
# Ensure that we don't have bad versions.
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
run: |
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
@@ -826,19 +796,19 @@ jobs:
env:
TAG: >-
${{
needs.meta.outputs.run-kind == 'compute-rc-pr'
contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.previous-storage-release
|| needs.meta.outputs.build-tag
}}
COMPUTE_TAG: >-
${{
contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.previous-compute-release
|| needs.meta.outputs.build-tag
}}
TEST_EXTENSIONS_TAG: >-
${{
contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
&& 'latest'
|| needs.meta.outputs.build-tag
}}
@@ -890,13 +860,7 @@ jobs:
id: generate
run: python3 .github/scripts/generate_image_maps.py
env:
SOURCE_TAG: >-
${{
contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.release-pr-run-id
|| needs.meta.outputs.build-tag
}}
TARGET_TAG: ${{ needs.meta.outputs.build-tag }}
BUILD_TAG: "${{ needs.meta.outputs.build-tag }}"
BRANCH: "${{ github.ref_name }}"
DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}"
PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}"
@@ -906,7 +870,7 @@ jobs:
push-neon-image-dev:
needs: [ meta, generate-image-maps, neon-image ]
if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
uses: ./.github/workflows/_push-to-container-registry.yml
permissions:
id-token: write # Required for aws/azure login
@@ -924,7 +888,7 @@ jobs:
push-compute-image-dev:
needs: [ meta, generate-image-maps, vm-compute-node-image ]
if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
uses: ./.github/workflows/_push-to-container-registry.yml
permissions:
id-token: write # Required for aws/azure login
@@ -978,55 +942,16 @@ jobs:
acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
secrets: inherit
push-neon-test-extensions-image-ghcr:
if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
# This is a bit of a special case so we're not using a generated image map.
add-latest-tag-to-neon-extensions-test-image:
if: github.ref_name == 'main'
needs: [ meta, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [
"ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}"
],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [
"ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}"
]
}
secrets: inherit
add-latest-tag-to-neon-test-extensions-image:
if: ${{ needs.meta.outputs.run-kind == 'push-main' }}
needs: [ meta, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [
"docker.io/neondatabase/neon-test-extensions-v16:latest",
"ghcr.io/neondatabase/neon-test-extensions-v16:latest"
],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [
"docker.io/neondatabase/neon-test-extensions-v17:latest",
"ghcr.io/neondatabase/neon-test-extensions-v17:latest"
]
}
secrets: inherit
add-release-tag-to-neon-test-extensions-image:
if: ${{ needs.meta.outputs.run-kind == 'compute-release' }}
needs: [ meta, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.release-pr-run-id }}": [
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}",
"ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}"
],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.release-pr-run-id }}": [
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}",
"ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}"
]
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"]
}
secrets: inherit
@@ -1111,7 +1036,7 @@ jobs:
exit 1
deploy:
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
permissions:
@@ -1225,7 +1150,7 @@ jobs:
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=false \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}} \
@@ -1233,7 +1158,7 @@ jobs:
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
-f deployStorage=true \
-f deployStorageBroker=false \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}}
@@ -1281,11 +1206,11 @@ jobs:
payload: |
channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }}
text: |
🔴 <!subteam^S06CJ87UMNY|@oncall-storage>: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
promote-compatibility-data:
needs: [ meta, deploy ]
needs: [ deploy ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -1295,6 +1220,37 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR
id: fetch-last-release-pr-info
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
branch_name_and_pr_number=$(gh pr list \
--repo "${GITHUB_REPOSITORY}" \
--base release \
--state merged \
--limit 10 \
--json mergeCommit,headRefName,number \
--jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }")
branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name')
pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number')
run_id=$(gh run list \
--repo "${GITHUB_REPOSITORY}" \
--workflow build_and_test.yml \
--branch "${branch_name}" \
--json databaseId \
--limit 1 \
--jq '.[].databaseId')
last_commit_sha=$(gh pr view "${pr_number}" \
--repo "${GITHUB_REPOSITORY}" \
--json commits \
--jq '.commits[-1].oid')
echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}
- uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
@@ -1305,8 +1261,8 @@ jobs:
env:
BUCKET: neon-github-public-dev
AWS_REGION: eu-central-1
COMMIT_SHA: ${{ github.sha }}
RUN_ID: ${{ needs.meta.outputs.release-pr-run-id }}
COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }}
RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }}
run: |
old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}"
new_prefix="artifacts/latest"
@@ -1395,5 +1351,5 @@ jobs:
|| needs.files-changed.result == 'skipped'
|| (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|| (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind))
|| (needs.test-images.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|| needs.test-images.result == 'skipped'
|| (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))

View File

@@ -7,7 +7,7 @@ on:
required: false
type: string
schedule:
- cron: '0 10 * * *'
- cron: '0 0 * * *'
jobs:
cargo-deny:
@@ -50,9 +50,8 @@ jobs:
method: chat.postMessage
token: ${{ secrets.SLACK_BOT_TOKEN }}
payload: |
channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
channel: ${{ vars.SLACK_CICD_CHANNEL_ID }}
text: |
Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
Fixing the problem should be fairly straight forward from the logs. If not, <#${{ vars.SLACK_RUST_CHANNEL_ID }}> is there to help.
Pinging <!subteam^S0838JPSH32|@oncall-devprod>.
Pinging @oncall-devprod.

View File

@@ -1,36 +0,0 @@
name: Fast forward merge
on:
pull_request:
types: [labeled]
branches:
- release
- release-proxy
- release-compute
jobs:
fast-forward:
if: ${{ github.event.label.name == 'fast-forward' }}
runs-on: ubuntu-22.04
steps:
- name: Remove fast-forward label to PR
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
gh pr edit ${{ github.event.pull_request.number }} --repo "${GITHUB_REPOSITORY}" --remove-label "fast-forward"
- name: Fast forwarding
uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979
# See https://docs.github.com/en/graphql/reference/enums#mergestatestatus
if: ${{ github.event.pull_request.mergeable_state == 'clean' }}
with:
merge: true
comment: on-error
github_token: ${{ secrets.CI_ACCESS_TOKEN }}
- name: Comment if mergeable_state is not clean
if: ${{ github.event.pull_request.mergeable_state != 'clean' }}
run: |
gh pr comment ${{ github.event.pull_request.number }} \
--repo "${GITHUB_REPOSITORY}" \
--body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`."

View File

@@ -52,9 +52,8 @@ jobs:
- name: Test extension upgrade
timeout-minutes: 20
env:
NEW_COMPUTE_TAG: latest
OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
NEWTAG: latest
OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
PG_VERSION: ${{ matrix.pg-version }}
FORCE_ALL_UPGRADE_TESTS: true
run: ./docker-compose/test_extensions_upgrade.sh

View File

@@ -1,147 +0,0 @@
name: large oltp benchmark
on:
# uncomment to run on push for debugging your PR
push:
branches: [ bodobolero/synthetic_oltp_workload ]
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
workflow_dispatch: # adds ability to run this manually
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow globally because we need dedicated resources which only exist once
group: large-oltp-bench-workflow
cancel-in-progress: true
jobs:
oltp:
strategy:
fail-fast: false # allow other variants to continue even if one fails
matrix:
include:
- target: new_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
- target: reuse_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h
TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
PG_VERSION: 16 # pre-determined by pre-determined project
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
PLATFORM: ${{ matrix.target }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
# Increase timeout to 8h, default timeout is 6h
timeout-minutes: 480
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials # necessary to download artefacts
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create Neon Branch for large tenant
if: ${{ matrix.target == 'new_branch' }}
id: create-neon-branch-oltp-target
uses: ./.github/actions/neon-branch-create
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Set up Connection String
id: set-up-connstr
run: |
case "${{ matrix.target }}" in
new_branch)
CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
;;
reuse_branch)
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
;;
*)
echo >&2 "Unknown target=${{ matrix.target }}"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- name: Benchmark pgbench with custom-scripts
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
pg_version: ${{ env.PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Delete Neon Branch for large tenant
if: ${{ always() && matrix.target == 'new_branch' }}
uses: ./.github/actions/neon-branch-delete
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
with:
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Periodic large oltp perf testing: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -1,24 +0,0 @@
name: Lint Release PR
on:
pull_request:
branches:
- release
- release-proxy
- release-compute
jobs:
lint-release-pr:
runs-on: ubuntu-22.04
steps:
- name: Checkout PR branch
uses: actions/checkout@v4
with:
fetch-depth: 0 # Fetch full history for git operations
ref: ${{ github.event.pull_request.head.ref }}
- name: Run lint script
env:
RELEASE_BRANCH: ${{ github.base_ref }}
run: |
./.github/scripts/lint-release-pr.sh

View File

@@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# ┌───────────── hour (0 - 23)
# │ ┌───────────── day of the month (1 - 31)
# │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 */3 * * *' # Runs every 3 hours
# ┌───────────── minute (0 - 59)
# ┌───────────── hour (0 - 23)
# │ ┌───────────── day of the month (1 - 31)
# │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
workflow_dispatch: # Allows manual triggering of the workflow
inputs:
commit_hash:
@@ -78,10 +78,8 @@ jobs:
run: |
if [ -z "$INPUT_COMMIT_HASH" ]; then
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
else
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
fi
- name: Start Bench with run_id
@@ -91,7 +89,7 @@ jobs:
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer $API_KEY" \
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
- name: Poll Test Status
id: poll_step

View File

@@ -8,6 +8,8 @@ on:
- .github/workflows/build-build-tools-image.yml
- .github/workflows/pre-merge-checks.yml
merge_group:
branches:
- main
defaults:
run:
@@ -17,13 +19,11 @@ defaults:
permissions: {}
jobs:
meta:
get-changed-files:
runs-on: ubuntu-22.04
outputs:
python-changed: ${{ steps.python-src.outputs.any_changed }}
rust-changed: ${{ steps.rust-src.outputs.any_changed }}
branch: ${{ steps.group-metadata.outputs.branch }}
pr-number: ${{ steps.group-metadata.outputs.pr-number }}
steps:
- uses: actions/checkout@v4
@@ -58,20 +58,12 @@ jobs:
echo "${PYTHON_CHANGED_FILES}"
echo "${RUST_CHANGED_FILES}"
- name: Merge group metadata
if: ${{ github.event_name == 'merge_group' }}
id: group-metadata
env:
MERGE_QUEUE_REF: ${{ github.event.merge_group.head_ref }}
run: |
echo $MERGE_QUEUE_REF | jq -Rr 'capture("refs/heads/gh-readonly-queue/(?<branch>.*)/pr-(?<pr_number>[0-9]+)-[0-9a-f]{40}") | ["branch=" + .branch, "pr-number=" + .pr_number] | .[]' | tee -a "${GITHUB_OUTPUT}"
build-build-tools-image:
if: |
false
|| needs.meta.outputs.python-changed == 'true'
|| needs.meta.outputs.rust-changed == 'true'
needs: [ meta ]
|| needs.get-changed-files.outputs.python-changed == 'true'
|| needs.get-changed-files.outputs.rust-changed == 'true'
needs: [ get-changed-files ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
# Build only one combination to save time
@@ -80,8 +72,8 @@ jobs:
secrets: inherit
check-codestyle-python:
if: needs.meta.outputs.python-changed == 'true'
needs: [ meta, build-build-tools-image ]
if: needs.get-changed-files.outputs.python-changed == 'true'
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-python.yml
with:
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
@@ -89,8 +81,8 @@ jobs:
secrets: inherit
check-codestyle-rust:
if: needs.meta.outputs.rust-changed == 'true'
needs: [ meta, build-build-tools-image ]
if: needs.get-changed-files.outputs.rust-changed == 'true'
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-rust.yml
with:
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
@@ -109,7 +101,7 @@ jobs:
statuses: write # for `github.repos.createCommitStatus(...)`
contents: write
needs:
- meta
- get-changed-files
- check-codestyle-python
- check-codestyle-rust
runs-on: ubuntu-22.04
@@ -137,20 +129,7 @@ jobs:
run: exit 1
if: |
false
|| (github.event_name == 'merge_group' && needs.meta.outputs.branch != 'main')
|| (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.python-changed == 'true')
|| (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.rust-changed == 'true')
|| (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true')
|| (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true')
|| contains(needs.*.result, 'failure')
|| contains(needs.*.result, 'cancelled')
- name: Add fast-forward label to PR to trigger fast-forward merge
if: >-
${{
always()
&& github.event_name == 'merge_group'
&& contains(fromJson('["release", "release-proxy", "release-compute"]'), github.base_ref)
}}
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: >-
gh pr edit ${{ needs.meta.outputs.pr-number }} --repo "${GITHUB_REPOSITORY}" --add-label "fast-forward"

View File

@@ -38,7 +38,7 @@ jobs:
uses: ./.github/workflows/_create-release-pr.yml
with:
component-name: 'Storage'
source-branch: ${{ github.ref_name }}
release-branch: 'release'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
uses: ./.github/workflows/_create-release-pr.yml
with:
component-name: 'Proxy'
source-branch: ${{ github.ref_name }}
release-branch: 'release-proxy'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -64,6 +64,6 @@ jobs:
uses: ./.github/workflows/_create-release-pr.yml
with:
component-name: 'Compute'
source-branch: ${{ github.ref_name }}
release-branch: 'release-compute'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}

View File

@@ -1,8 +1,8 @@
# Autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling
# DevProd & PerfCorr
/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness
# DevProd
/.github/ @neondatabase/developer-productivity
# Compute
/pgxn/ @neondatabase/compute

600
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -53,6 +53,7 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
backtrace = "0.3.74"
flate2 = "1.0.26"
assert-json-diff = "2"
async-stream = "0.3"
@@ -67,7 +68,6 @@ aws-credential-types = "1.2.0"
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
aws-types = "1.3"
axum = { version = "0.8.1", features = ["ws"] }
axum-extra = { version = "0.10.0", features = ["typed-header"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.71"
@@ -95,7 +95,6 @@ futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
git-version = "0.3"
governor = "0.8"
hashbrown = "0.14"
hashlink = "0.9.1"
hdrhistogram = "7.5.2"
@@ -106,18 +105,19 @@ hostname = "0.4"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
http-body-util = "0.1.2"
humantime = "2.2"
humantime = "2.1"
humantime-serde = "1.1.1"
hyper0 = { package = "hyper", version = "0.14" }
hyper = "1.4"
hyper-util = "0.1"
tokio-tungstenite = "0.21.0"
indexmap = { version = "2", features = ["serde"] }
indexmap = "2"
indoc = "2"
inferno = "0.12.0"
ipnet = "2.10.0"
itertools = "0.10"
itoa = "1.0.11"
jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
jemalloc_pprof = "0.6"
jsonwebtoken = "9"
lasso = "0.7"
libc = "0.2"
@@ -126,9 +126,7 @@ measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.9"
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
# Do not update to >= 7.0.0, at least. The update will have a significant impact
# on compute startup metrics (start_postgres_ms), >= 25% degradation.
notify = "6.0.0"
notify = "8.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
@@ -141,7 +139,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "protobuf", "protobuf-codec"] }
procfs = "0.16"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
prost = "0.13"
@@ -157,7 +155,6 @@ rpds = "0.13"
rustc-hash = "1.1.0"
rustls = { version = "0.23.16", default-features = false }
rustls-pemfile = "2"
rustls-pki-types = "1.11"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
@@ -195,7 +192,7 @@ toml = "0.8"
toml_edit = "0.22"
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
tower = { version = "0.5.2", default-features = false }
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
# This revision uses opentelemetry 0.27. There's no tag for it.
tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" }
@@ -221,7 +218,7 @@ zerocopy = { version = "0.7", features = ["derive"] }
json-structural-diff = { version = "0.2.0" }
## TODO replace this with tracing
env_logger = "0.11"
env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed

View File

@@ -11,16 +11,15 @@ ICU_PREFIX_DIR := /usr/local/icu
#
BUILD_TYPE ?= debug
WITH_SANITIZERS ?= no
PG_CFLAGS = -fsigned-char
ifeq ($(BUILD_TYPE),release)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
PG_CFLAGS += -O2 -g3 $(CFLAGS)
PG_CFLAGS = -O2 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS += -O0 -g3 $(CFLAGS)
PG_CFLAGS = -O0 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
@@ -160,8 +159,6 @@ postgres-%: postgres-configure-% \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install
+@echo "Compiling pageinspect $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+@echo "Compiling pg_trgm $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
+@echo "Compiling test_decoding $*"

View File

@@ -162,7 +162,7 @@ FROM build-deps AS pg-build
ARG PG_VERSION
COPY vendor/postgres-${PG_VERSION:?} postgres
RUN cd postgres && \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
--with-icu --with-libxml --with-libxslt --with-lz4" && \
if [ "${PG_VERSION:?}" != "v14" ]; then \
# zstd is available only from PG15
@@ -1484,7 +1484,7 @@ WORKDIR /ext-src
COPY compute/patches/pg_duckdb_v031.patch .
COPY compute/patches/duckdb_v120.patch .
# pg_duckdb build requires source dir to be a git repo to get submodules
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# - extension management function duckdb.install_extension()
# - access to duckdb.extensions table and its sequence
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
@@ -1499,8 +1499,8 @@ ARG PG_VERSION
COPY --from=pg_duckdb-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_duckdb-src
RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
#########################################################################################
#
# Layer "pg_repack"
@@ -1735,8 +1735,6 @@ RUN set -e \
libevent-dev \
libtool \
pkg-config \
libcurl4-openssl-dev \
libssl-dev \
&& apt clean && rm -rf /var/lib/apt/lists/*
# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
@@ -1745,7 +1743,7 @@ RUN set -e \
&& git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
&& cd pgbouncer \
&& ./autogen.sh \
&& ./configure --prefix=/usr/local/pgbouncer \
&& ./configure --prefix=/usr/local/pgbouncer --without-openssl \
&& make -j $(nproc) dist_man_MANS= \
&& make install dist_man_MANS=
@@ -1760,15 +1758,15 @@ ARG TARGETARCH
# test_runner/regress/test_compute_metrics.py
# See comment on the top of the file regading `echo`, `-e` and `\n`
RUN if [ "$TARGETARCH" = "amd64" ]; then\
postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
else\
postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\
pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\
fi\
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\
| tar xzf - --strip-components=1 -C.\
&& curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
| tar xzf - --strip-components=1 -C.\
@@ -1935,7 +1933,6 @@ RUN apt update && \
locales \
procps \
ca-certificates \
rsyslog \
$VERSION_INSTALLS && \
apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
@@ -1981,13 +1978,6 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo
# Make the libraries we built available
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
# rsyslog config permissions
# directory for rsyslogd pid file
RUN mkdir /var/run/rsyslogd && \
chown -R postgres:postgres /var/run/rsyslogd && \
chown -R postgres:postgres /etc/rsyslog.d/
ENV LANG=en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -29,7 +29,6 @@
import 'sql_exporter/lfc_approximate_working_set_size.libsonnet',
import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet',
import 'sql_exporter/lfc_cache_size_limit.libsonnet',
import 'sql_exporter/lfc_chunk_size.libsonnet',
import 'sql_exporter/lfc_hits.libsonnet',
import 'sql_exporter/lfc_misses.libsonnet',
import 'sql_exporter/lfc_used.libsonnet',

View File

@@ -1,5 +1 @@
SELECT sum(pg_database_size(datname)) AS total
FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2;
SELECT sum(pg_database_size(datname)) AS total FROM pg_database;

View File

@@ -1,10 +0,0 @@
{
metric_name: 'lfc_chunk_size',
type: 'gauge',
help: 'LFC chunk size, measured in 8KiB pages',
key_labels: null,
values: [
'lfc_chunk_size_pages',
],
query: importstr 'sql_exporter/lfc_chunk_size.sql',
}

View File

@@ -1 +0,0 @@
SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages';

View File

@@ -1,20 +1,10 @@
-- We export stats for 10 non-system databases. Without this limit it is too
-- easy to abuse the system by creating lots of databases.
SELECT pg_database_size(datname) AS db_size,
deadlocks,
tup_inserted AS inserted,
tup_updated AS updated,
tup_deleted AS deleted,
datname
SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted,
tup_updated AS updated, tup_deleted AS deleted, datname
FROM pg_stat_database
WHERE datname IN (
SELECT datname FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2
AND datname <> 'postgres'
AND NOT datistemplate
ORDER BY oid
LIMIT 10
WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10
);

View File

@@ -39,10 +39,6 @@ commands:
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
- name: rsyslogd
user: postgres
sysvInitAction: respawn
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
@@ -58,7 +54,7 @@ files:
# regardless of hostname (ALL)
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -73,12 +69,6 @@ files:
}
memory {}
}
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
# compute_ctl will rewrite this file with the actual configuration, if needed.
- filename: compute_rsyslog.conf
content: |
*.* /dev/null
$IncludeConfig /etc/rsyslog.d/*.conf
build: |
# Build cgroup-tools
#
@@ -142,12 +132,6 @@ merge: |
RUN set -e \
&& chmod 0644 /etc/cgconfig.conf
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/

View File

@@ -39,10 +39,6 @@ commands:
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
- name: rsyslogd
user: postgres
sysvInitAction: respawn
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
@@ -58,7 +54,7 @@ files:
# regardless of hostname (ALL)
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -73,12 +69,6 @@ files:
}
memory {}
}
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
# compute_ctl will rewrite this file with the actual configuration, if needed.
- filename: compute_rsyslog.conf
content: |
*.* /dev/null
$IncludeConfig /etc/rsyslog.d/*.conf
build: |
# Build cgroup-tools
#
@@ -138,11 +128,6 @@ merge: |
RUN set -e \
&& chmod 0644 /etc/cgconfig.conf
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/

View File

@@ -17,7 +17,6 @@ aws-sdk-kms.workspace = true
aws-smithy-types.workspace = true
anyhow.workspace = true
axum = { workspace = true, features = [] }
axum-extra.workspace = true
camino.workspace = true
chrono.workspace = true
cfg-if.workspace = true
@@ -26,8 +25,6 @@ fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
indexmap.workspace = true
jsonwebtoken.workspace = true
metrics.workspace = true
nix.workspace = true
notify.workspace = true
@@ -35,19 +32,16 @@ num_cpus.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
opentelemetry_sdk.workspace = true
p256 = { version = "0.13", features = ["pem"] }
postgres.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["json"] }
ring = "0.17"
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
spki = { version = "0.7.3", features = ["std"] }
tar.workspace = true
tower.workspace = true
tower-http.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-util.workspace = true
@@ -61,7 +55,6 @@ thiserror.workspace = true
url.workspace = true
uuid.workspace = true
walkdir.workspace = true
x509-cert = { version = "0.2.5" }
postgres_initdb.workspace = true
compute_api.workspace = true

View File

@@ -33,27 +33,39 @@
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! ```
use std::collections::HashMap;
use std::ffi::OsString;
use std::fs::File;
use std::path::Path;
use std::process::exit;
use std::sync::mpsc;
use std::str::FromStr;
use std::sync::atomic::Ordering;
use std::sync::{Arc, Condvar, Mutex, RwLock, mpsc};
use std::thread;
use std::time::Duration;
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Parser;
use compute_api::responses::ComputeCtlConfig;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal};
use compute_tools::compute::{
ComputeNode, ComputeState, PG_PID, ParsedSpec, forward_termination_signal,
};
use compute_tools::configurator::launch_configurator;
use compute_tools::disk_quota::set_disk_quota;
use compute_tools::extension_server::get_pg_version_string;
use compute_tools::http::server::Server;
use compute_tools::logger::*;
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
use compute_tools::swap::resize_swap;
use rlimit::{Resource, setrlimit};
use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
use signal_hook::iterator::Signals;
use tracing::{error, info};
use tracing::{error, info, warn};
use url::Url;
use utils::failpoint_support;
@@ -152,41 +164,29 @@ fn main() -> Result<()> {
// enable core dumping for all child processes
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
let (pg_handle, start_pg_result) = {
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();
let cli_spec = try_spec_from_cli(&cli)?;
let cli_spec = try_spec_from_cli(&cli)?;
let compute_node = ComputeNode::new(
ComputeNodeParams {
compute_id: cli.compute_id,
connstr,
pgdata: cli.pgdata.clone(),
pgbin: cli.pgbin.clone(),
pgversion: get_pg_version_string(&cli.pgbin),
external_http_port: cli.external_http_port,
internal_http_port: cli.internal_http_port,
ext_remote_storage: cli.remote_ext_config.clone(),
resize_swap_on_bind: cli.resize_swap_on_bind,
set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
#[cfg(target_os = "linux")]
filecache_connstr: cli.filecache_connstr,
#[cfg(target_os = "linux")]
cgroup: cli.cgroup,
#[cfg(target_os = "linux")]
vm_monitor_addr: cli.vm_monitor_addr,
build_tag,
let compute = wait_spec(build_tag, &cli, cli_spec)?;
live_config_allowed: cli_spec.live_config_allowed,
},
cli_spec.spec,
cli_spec.compute_ctl_config,
)?;
start_postgres(&cli, compute)?
let exit_code = compute_node.run()?;
// Startup is finished, exit the startup tracing span
};
// PostgreSQL is now running, if startup was successful. Wait until it exits.
let wait_pg_result = wait_postgres(pg_handle)?;
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
maybe_delay_exit(delay_exit);
scenario.teardown();
deinit_and_exit(exit_code);
deinit_and_exit(wait_pg_result);
}
async fn init() -> Result<String> {
@@ -207,6 +207,56 @@ async fn init() -> Result<String> {
Ok(build_tag)
}
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
//
// This is used to propagate the context for the 'start_compute' operation
// from the neon control plane. This allows linking together the wider
// 'start_compute' operation that creates the compute container, with the
// startup actions here within the container.
//
// There is no standard for passing context in env variables, but a lot of
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
//
// Switch to the startup context here, and exit it once the startup has
// completed and Postgres is up and running.
//
// If this pod is pre-created without binding it to any particular endpoint
// yet, this isn't the right place to enter the startup context. In that
// case, the control plane should pass the tracing context as part of the
// /configure API call.
//
// NOTE: This is supposed to only cover the *startup* actions. Once
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
//
// XXX: If the pod is restarted, we perform the startup actions in the same
// context as the original startup actions, which probably doesn't make
// sense.
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
if let Ok(val) = std::env::var("TRACEPARENT") {
startup_tracing_carrier.insert("traceparent".to_string(), val);
}
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry_sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new()
.extract(&startup_tracing_carrier)
.attach();
info!("startup tracing context attached");
Some(guard)
} else {
None
}
}
fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
// First, try to get cluster spec from the cli argument
if let Some(ref spec_json) = cli.spec_json {
@@ -257,7 +307,357 @@ struct CliSpecParams {
live_config_allowed: bool,
}
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
fn wait_spec(
build_tag: String,
cli: &Cli,
CliSpecParams {
spec,
live_config_allowed,
compute_ctl_config: _,
}: CliSpecParams,
) -> Result<Arc<ComputeNode>> {
let mut new_state = ComputeState::new();
let spec_set;
if let Some(spec) = spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
info!("new pspec.spec: {:?}", pspec.spec);
new_state.pspec = Some(pspec);
spec_set = true;
} else {
spec_set = false;
}
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
let conn_conf = postgres::config::Config::from_str(connstr.as_str())
.context("cannot build postgres config from connstr")?;
let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
.context("cannot build tokio postgres config from connstr")?;
let compute_node = ComputeNode {
compute_id: cli.compute_id.clone(),
connstr,
conn_conf,
tokio_conn_conf,
pgdata: cli.pgdata.clone(),
pgbin: cli.pgbin.clone(),
pgversion: get_pg_version_string(&cli.pgbin),
external_http_port: cli.external_http_port,
internal_http_port: cli.internal_http_port,
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_remote_storage: cli.remote_ext_config.clone(),
ext_download_progress: RwLock::new(HashMap::new()),
build_tag,
};
let compute = Arc::new(compute_node);
// If this is a pooled VM, prewarm before starting HTTP server and becoming
// available for binding. Prewarming helps Postgres start quicker later,
// because QEMU will already have its memory allocated from the host, and
// the necessary binaries will already be cached.
if !spec_set {
compute.prewarm_postgres()?;
}
// Launch the external HTTP server first, so that we can serve control plane
// requests while configuration is still in progress.
Server::External(cli.external_http_port).launch(&compute);
// The internal HTTP server could be launched later, but there isn't much
// sense in waiting.
Server::Internal(cli.internal_http_port).launch(&compute);
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();
if state.status == ComputeStatus::ConfigurationPending {
info!("got spec, continue configuration");
// Spec is already set by the http server handler.
break;
}
}
// Record for how long we slept waiting for the spec.
let now = Utc::now();
state.metrics.wait_for_spec_ms = now
.signed_duration_since(state.start_time)
.to_std()
.unwrap()
.as_millis() as u64;
// Reset start time, so that the total startup time that is calculated later will
// not include the time that we waited for the spec.
state.start_time = now;
}
launch_lsn_lease_bg_task_for_static(&compute);
Ok(compute)
}
fn start_postgres(
cli: &Cli,
compute: Arc<ComputeNode>,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
let mut state = compute.state.lock().unwrap();
// Create a tracing span for the startup operation.
//
// We could otherwise just annotate the function with #[instrument], but if
// we're being configured from a /configure HTTP request, we want the
// startup to be considered part of the /configure request.
let _this_entered = {
// Temporarily enter the /configure request's span, so that the new span
// becomes its child.
let _parent_entered = state.startup_span.take().map(|p| p.entered());
tracing::info_span!("start_postgres")
}
.entered();
state.set_status(ComputeStatus::Init, &compute.state_changed);
info!(
"running compute with features: {:?}",
state.pspec.as_ref().unwrap().spec.features
);
// before we release the mutex, fetch some parameters for later.
let &ComputeSpec {
swap_size_bytes,
disk_quota_bytes,
#[cfg(target_os = "linux")]
disable_lfc_resizing,
..
} = &state.pspec.as_ref().unwrap().spec;
drop(state);
// Launch remaining service threads
let _monitor_handle = launch_monitor(&compute);
let _configurator_handle = launch_configurator(&compute);
let mut prestartup_failed = false;
let mut delay_exit = false;
// Resize swap to the desired size if the compute spec says so
if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) {
// To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
// *before* starting postgres.
//
// In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
// carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
// OOM-killed during startup because swap wasn't available yet.
match resize_swap(size_bytes) {
Ok(()) => {
let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%size_bytes, %size_mib, "resized swap");
}
Err(err) => {
let err = err.context("failed to resize swap");
error!("{err:#}");
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
compute.set_failed_status(err);
delay_exit = true;
}
}
}
// Set disk quota if the compute spec says so
if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
(disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref())
{
match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) {
Ok(()) => {
let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%disk_quota_bytes, %size_mib, "set disk quota");
}
Err(err) => {
let err = err.context("failed to set disk quota");
error!("{err:#}");
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
compute.set_failed_status(err);
delay_exit = true;
}
}
}
// Start Postgres
let mut pg = None;
if !prestartup_failed {
pg = match compute.start_compute() {
Ok(pg) => {
info!(postmaster_pid = %pg.0.id(), "Postgres was started");
Some(pg)
}
Err(err) => {
error!("could not start the compute node: {:#}", err);
compute.set_failed_status(err);
delay_exit = true;
None
}
};
} else {
warn!("skipping postgres startup because pre-startup step failed");
}
// Start the vm-monitor if directed to. The vm-monitor only runs on linux
// because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
use std::env;
use tokio_util::sync::CancellationToken;
// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();
// don't pass postgres connection string to vm-monitor if we don't want it to resize LFC
let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
None
} else {
Some(cli.filecache_connstr.clone())
};
let vm_monitor = if env::var_os("AUTOSCALING").is_some() {
let vm_monitor = tokio::spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: Some(cli.cgroup.clone()),
pgconnstr,
addr: cli.vm_monitor_addr.clone(),
})),
token.clone(),
));
Some(vm_monitor)
} else {
None
};
}
}
Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
type PostgresHandle = (std::process::Child, tokio::task::JoinHandle<Result<()>>);
struct StartPostgresResult {
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
PG_PID.store(0, Ordering::SeqCst);
// Process has exited. Wait for the log collecting task to finish.
let _ = tokio::runtime::Handle::current()
.block_on(logs_handle)
.map_err(|e| tracing::error!("log task panicked: {:?}", e));
info!("Postgres exited with code {}, shutting down", ecode);
exit_code = ecode.code()
}
Ok(WaitPostgresResult { exit_code })
}
struct WaitPostgresResult {
exit_code: Option<i32>,
}
fn cleanup_after_postgres_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
}: StartPostgresResult,
) -> Result<bool> {
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
if let Some(handle) = vm_monitor {
// Kills all threads spawned by the monitor
token.cancel();
// Kills the actual task running the monitor
handle.abort();
}
}
}
// Maybe sync safekeepers again, to speed up next startup
let compute_state = compute.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
info!("syncing safekeepers on shutdown");
let storage_auth_token = pspec.storage_auth_token.clone();
let lsn = compute.sync_safekeepers(storage_auth_token)?;
info!("synced safekeepers at lsn {lsn}");
}
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::TerminationPending {
state.status = ComputeStatus::Terminated;
compute.state_changed.notify_all();
// we were asked to terminate gracefully, don't exit to avoid restart
delay_exit = true
}
drop(state);
if let Err(err) = compute.check_for_core_dumps() {
error!("error while checking for core dumps: {err:?}");
}
Ok(delay_exit)
}
fn maybe_delay_exit(delay_exit: bool) {
// If launch failed, keep serving HTTP requests for a while, so the cloud
// control plane can get the actual error.
if delay_exit {
info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30));
}
}
fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may
// hang for quite some time, see, for example:

View File

@@ -58,14 +58,14 @@ pub async fn get_database_schema(
compute: &Arc<ComputeNode>,
dbname: &str,
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>> + use<>, SchemaDumpError> {
let pgbin = &compute.params.pgbin;
let pgbin = &compute.pgbin;
let basepath = Path::new(pgbin).parent().unwrap();
let pgdump = basepath.join("pg_dump");
// Replace the DB in the connection string and disable it to parts.
// This is the only option to handle DBs with special characters.
let conf = postgres_conf_for_db(&compute.params.connstr, dbname)
.map_err(|_| SchemaDumpError::Unexpected)?;
let conf =
postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?;
let host = conf
.get_hosts()
.first()

File diff suppressed because it is too large Load Diff

View File

@@ -1,18 +1,12 @@
use anyhow::Result;
use std::fmt::Write as FmtWrite;
use std::fs::{File, OpenOptions};
use std::io;
use std::io::Write;
use std::io::prelude::*;
use std::path::Path;
use compute_api::responses::TlsConfig;
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
use anyhow::Result;
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
use crate::pg_helpers::{
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
};
use crate::tls::{self, SERVER_CRT, SERVER_KEY};
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize, escape_conf_value};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -40,12 +34,10 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
/// Create or completely rewrite configuration file specified by `path`
pub fn write_postgres_conf(
pgdata_path: &Path,
path: &Path,
spec: &ComputeSpec,
extension_server_port: u16,
tls_config: &Option<TlsConfig>,
) -> Result<()> {
let path = pgdata_path.join("postgresql.conf");
// File::create() destroys the file content if it exists.
let mut file = File::create(path)?;
@@ -63,20 +55,10 @@ pub fn write_postgres_conf(
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if !spec.safekeeper_connstrings.is_empty() {
let mut neon_safekeepers_value = String::new();
tracing::info!(
"safekeepers_connstrings is not zero, gen: {:?}",
spec.safekeepers_generation
);
// If generation is given, prepend sk list with g#number:
if let Some(generation) = spec.safekeepers_generation {
write!(neon_safekeepers_value, "g#{}:", generation)?;
}
neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(","));
writeln!(
file,
"neon.safekeepers={}",
escape_conf_value(&neon_safekeepers_value)
escape_conf_value(&spec.safekeeper_connstrings.join(","))
)?;
}
if let Some(s) = &spec.tenant_id {
@@ -90,20 +72,6 @@ pub fn write_postgres_conf(
)?;
}
// tls
if let Some(tls_config) = tls_config {
writeln!(file, "ssl = on")?;
// postgres requires the keyfile to be in a secure file,
// currently too complicated to ensure that at the VM level,
// so we just copy them to another file instead. :shrug:
tls::update_key_path_blocking(pgdata_path, tls_config);
// these are the default, but good to be explicit.
writeln!(file, "ssl_cert_file = '{}'", SERVER_CRT)?;
writeln!(file, "ssl_key_file = '{}'", SERVER_KEY)?;
}
// Locales
if cfg!(target_os = "macos") {
writeln!(file, "lc_messages='C'")?;
@@ -158,55 +126,6 @@ pub fn write_postgres_conf(
writeln!(file, "# Managed by compute_ctl: end")?;
}
// If audit logging is enabled, configure pgaudit.
//
// Note, that this is called after the settings from spec are written.
// This way we always override the settings from the spec
// and don't allow the user or the control plane admin to change them.
if let ComputeAudit::Hipaa = spec.audit_log_level {
writeln!(file, "# Managed by compute_ctl audit settings: begin")?;
// This log level is very verbose
// but this is necessary for HIPAA compliance.
// Exclude 'misc' category, because it doesn't contain anythig relevant.
writeln!(file, "pgaudit.log='all, -misc'")?;
writeln!(file, "pgaudit.log_parameter=on")?;
// Disable logging of catalog queries
// The catalog doesn't contain sensitive data, so we don't need to audit it.
writeln!(file, "pgaudit.log_catalog=off")?;
// Set log rotation to 5 minutes
// TODO: tune this after performance testing
writeln!(file, "pgaudit.log_rotation_age=5")?;
// Add audit shared_preload_libraries, if they are not present.
//
// The caller who sets the flag is responsible for ensuring that the necessary
// shared_preload_libraries are present in the compute image,
// otherwise the compute start will fail.
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
let mut extra_shared_preload_libraries = String::new();
if !libs.contains("pgaudit") {
extra_shared_preload_libraries.push_str(",pgaudit");
}
if !libs.contains("pgauditlogtofile") {
extra_shared_preload_libraries.push_str(",pgauditlogtofile");
}
writeln!(
file,
"shared_preload_libraries='{}{}'",
libs, extra_shared_preload_libraries
)?;
} else {
// Typically, this should be unreacheable,
// because we always set at least some shared_preload_libraries in the spec
// but let's handle it explicitly anyway.
writeln!(
file,
"shared_preload_libraries='neon,pgaudit,pgauditlogtofile'"
)?;
}
writeln!(file, "# Managed by compute_ctl audit settings: end")?;
}
writeln!(file, "neon.extension_server_port={}", extension_server_port)?;
if spec.drop_subscriptions_before_start {

View File

@@ -1,11 +0,0 @@
# Load imfile module to read log files
module(load="imfile")
# Input configuration for log files in the specified directory
# Replace {log_directory} with the directory containing the log files
input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
# the directory to store rsyslog state files
global(workDirectory="/var/log/rsyslog")
# Forward logs to remote syslog server
*.* @@{remote_endpoint}

View File

@@ -202,24 +202,8 @@ pub async fn download_extension(
// move contents of the libdir / sharedir in unzipped archive to the correct local paths
for paths in [sharedir_paths, libdir_paths] {
let (zip_dir, real_dir) = paths;
let dir = match std::fs::read_dir(&zip_dir) {
Ok(dir) => dir,
Err(e) => match e.kind() {
// In the event of a SQL-only extension, there would be nothing
// to move from the lib/ directory, so note that in the log and
// move on.
std::io::ErrorKind::NotFound => {
info!("nothing to move from {}", zip_dir);
continue;
}
_ => return Err(anyhow::anyhow!(e)),
},
};
info!("mv {zip_dir:?}/* {real_dir:?}");
for file in dir {
for file in std::fs::read_dir(zip_dir)? {
let old_file = file?.path();
let new_file =
Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
@@ -269,31 +253,27 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
}
}
// Do request to extension storage proxy, e.g.,
// Do request to extension storage proxy, i.e.
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
// using HTTP GET and return the response body as bytes.
// using HHTP GET
// and return the response body as bytes
//
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
let uri = format!("{}/{}", ext_remote_storage, ext_path);
let filename = Path::new(ext_path)
.file_name()
.unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
.to_str()
.unwrap_or("unknown")
.to_string();
info!("Downloading extension file '{}' from uri {}", filename, uri);
info!("Download extension {} from uri {}", ext_path, uri);
match do_extension_server_request(&uri).await {
Ok(resp) => {
info!("Successfully downloaded remote extension data {}", ext_path);
REMOTE_EXT_REQUESTS_TOTAL
.with_label_values(&[&StatusCode::OK.to_string(), &filename])
.with_label_values(&[&StatusCode::OK.to_string()])
.inc();
Ok(resp)
}
Err((msg, status)) => {
REMOTE_EXT_REQUESTS_TOTAL
.with_label_values(&[&status, &filename])
.with_label_values(&[&status])
.inc();
bail!(msg);
}

View File

@@ -1,9 +1,7 @@
pub(crate) mod json;
pub(crate) mod path;
pub(crate) mod query;
pub(crate) mod request_id;
pub(crate) use json::Json;
pub(crate) use path::Path;
pub(crate) use query::Query;
pub(crate) use request_id::RequestId;

View File

@@ -1,86 +0,0 @@
use std::{
fmt::Display,
ops::{Deref, DerefMut},
};
use axum::{extract::FromRequestParts, response::IntoResponse};
use http::{StatusCode, request::Parts};
use crate::http::{JsonResponse, headers::X_REQUEST_ID};
/// Extract the request ID from the `X-Request-Id` header.
#[derive(Debug, Clone, Default)]
pub(crate) struct RequestId(pub String);
#[derive(Debug)]
/// Rejection used for [`RequestId`].
///
/// Contains one variant for each way the [`RequestId`] extractor can
/// fail.
pub(crate) enum RequestIdRejection {
/// The request is missing the header.
MissingRequestId,
/// The value of the header is invalid UTF-8.
InvalidUtf8,
}
impl RequestIdRejection {
pub fn status(&self) -> StatusCode {
match self {
RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR,
RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST,
}
}
pub fn message(&self) -> String {
match self {
RequestIdRejection::MissingRequestId => "request ID is missing",
RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8",
}
.to_string()
}
}
impl IntoResponse for RequestIdRejection {
fn into_response(self) -> axum::response::Response {
JsonResponse::error(self.status(), self.message())
}
}
impl<S> FromRequestParts<S> for RequestId
where
S: Send + Sync,
{
type Rejection = RequestIdRejection;
async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
match parts.headers.get(X_REQUEST_ID) {
Some(value) => match value.to_str() {
Ok(request_id) => Ok(Self(request_id.to_string())),
Err(_) => Err(RequestIdRejection::InvalidUtf8),
},
None => Err(RequestIdRejection::MissingRequestId),
}
}
}
impl Deref for RequestId {
type Target = String;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl DerefMut for RequestId {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl Display for RequestId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.0)
}
}

View File

@@ -1,2 +0,0 @@
/// Constant for `X-Request-Id` header.
pub const X_REQUEST_ID: &str = "x-request-id";

View File

@@ -1,145 +0,0 @@
use std::{collections::HashSet, net::SocketAddr};
use anyhow::{Result, anyhow};
use axum::{RequestExt, body::Body, extract::ConnectInfo};
use axum_extra::{
TypedHeader,
headers::{Authorization, authorization::Bearer},
};
use futures::future::BoxFuture;
use http::{Request, Response, StatusCode};
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
use serde::Deserialize;
use tower_http::auth::AsyncAuthorizeRequest;
use tracing::warn;
use crate::http::{JsonResponse, extract::RequestId};
#[derive(Clone, Debug, Deserialize)]
pub(in crate::http) struct Claims {
compute_id: String,
}
#[derive(Clone, Debug)]
pub(in crate::http) struct Authorize {
compute_id: String,
jwks: JwkSet,
validation: Validation,
}
impl Authorize {
pub fn new(compute_id: String, jwks: JwkSet) -> Self {
let mut validation = Validation::new(Algorithm::EdDSA);
// Nothing is currently required
validation.required_spec_claims = HashSet::new();
validation.validate_exp = true;
// Unused by the control plane
validation.validate_aud = false;
// Unused by the control plane
validation.validate_nbf = false;
Self {
compute_id,
jwks,
validation,
}
}
}
impl AsyncAuthorizeRequest<Body> for Authorize {
type RequestBody = Body;
type ResponseBody = Body;
type Future = BoxFuture<'static, Result<Request<Body>, Response<Self::ResponseBody>>>;
fn authorize(&mut self, mut request: Request<Body>) -> Self::Future {
let compute_id = self.compute_id.clone();
let jwks = self.jwks.clone();
let validation = self.validation.clone();
Box::pin(async move {
let request_id = request.extract_parts::<RequestId>().await.unwrap();
// TODO: Remove this check after a successful rollout
if jwks.keys.is_empty() {
warn!(%request_id, "Authorization has not been configured");
return Ok(request);
}
let connect_info = request
.extract_parts::<ConnectInfo<SocketAddr>>()
.await
.unwrap();
// In the event the request is coming from the loopback interface,
// allow all requests
if connect_info.ip().is_loopback() {
warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
return Ok(request);
}
let TypedHeader(Authorization(bearer)) = request
.extract_parts::<TypedHeader<Authorization<Bearer>>>()
.await
.map_err(|_| {
JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token")
})?;
let data = match Self::verify(&jwks, bearer.token(), &validation) {
Ok(claims) => claims,
Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)),
};
if data.claims.compute_id != compute_id {
return Err(JsonResponse::error(
StatusCode::UNAUTHORIZED,
"invalid claims in authorization token",
));
}
// Make claims available to any subsequent middleware or request
// handlers
request.extensions_mut().insert(data.claims);
Ok(request)
})
}
}
impl Authorize {
/// Verify the token using the JSON Web Key set and return the token data.
fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
debug_assert!(!jwks.keys.is_empty());
for jwk in jwks.keys.iter() {
let decoding_key = match DecodingKey::from_jwk(jwk) {
Ok(key) => key,
Err(e) => {
warn!(
"Failed to construct decoding key from {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
continue;
}
};
match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
Ok(data) => return Ok(data),
Err(e) => {
warn!(
"Failed to decode authorization token using {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
continue;
}
}
}
Err(anyhow!("Failed to verify authorization token"))
}
}

View File

@@ -1,2 +0,0 @@
pub(in crate::http) mod authorize;
pub(in crate::http) mod request_id;

View File

@@ -1,16 +0,0 @@
use axum::{extract::Request, middleware::Next, response::Response};
use uuid::Uuid;
use crate::http::headers::X_REQUEST_ID;
/// This middleware function allows compute_ctl to generate its own request ID
/// if one isn't supplied. The control plane will always send one as a UUID. The
/// neon Postgres extension on the other hand does not send one.
pub async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
let headers = request.headers_mut();
if !headers.contains_key(X_REQUEST_ID) {
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
}
next.run(request).await
}

View File

@@ -7,8 +7,6 @@ use serde::Serialize;
use tracing::error;
mod extract;
mod headers;
mod middleware;
mod routes;
pub mod server;

View File

@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigurationRequest>,
) -> Response {
if !compute.params.live_config_allowed {
if !compute.live_config_allowed {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"live configuration is not allowed for this compute node".to_string(),

View File

@@ -18,11 +18,11 @@ pub(in crate::http) struct ExtensionServerParams {
/// Download a remote extension.
pub(in crate::http) async fn download_extension(
Path(filename): Path<String>,
ext_server_params: Query<ExtensionServerParams>,
params: Query<ExtensionServerParams>,
State(compute): State<Arc<ComputeNode>>,
) -> Response {
// Don't even try to download extensions if no remote storage is configured
if compute.params.ext_remote_storage.is_none() {
if compute.ext_remote_storage.is_none() {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"remote storage is not configured",
@@ -46,9 +46,9 @@ pub(in crate::http) async fn download_extension(
remote_extensions.get_ext(
&filename,
ext_server_params.is_library,
&compute.params.build_tag,
&compute.params.pgversion,
params.is_library,
&compute.build_tag,
&compute.pgversion,
)
};

View File

@@ -5,62 +5,53 @@ use std::time::Duration;
use anyhow::Result;
use axum::Router;
use axum::middleware::{self};
use axum::response::IntoResponse;
use axum::extract::Request;
use axum::middleware::{self, Next};
use axum::response::{IntoResponse, Response};
use axum::routing::{get, post};
use compute_api::responses::ComputeCtlConfig;
use http::StatusCode;
use tokio::net::TcpListener;
use tower::ServiceBuilder;
use tower_http::{
auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer,
};
use tracing::{Span, error, info};
use tower_http::request_id::PropagateRequestIdLayer;
use tower_http::trace::TraceLayer;
use tracing::{Span, debug, error, info};
use uuid::Uuid;
use super::middleware::request_id::maybe_add_request_id_header;
use super::{
headers::X_REQUEST_ID,
middleware::authorize::Authorize,
routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, insights, metrics, metrics_json, status, terminate,
},
use super::routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, insights, metrics, metrics_json, status, terminate,
};
use crate::compute::ComputeNode;
const X_REQUEST_ID: &str = "x-request-id";
/// `compute_ctl` has two servers: internal and external. The internal server
/// binds to the loopback interface and handles communication from clients on
/// the compute. The external server is what receives communication from the
/// control plane, the metrics scraper, etc. We make the distinction because
/// certain routes in `compute_ctl` only need to be exposed to local processes
/// like Postgres via the neon extension and local_proxy.
#[derive(Clone, Debug)]
#[derive(Clone, Copy, Debug)]
pub enum Server {
Internal {
port: u16,
},
External {
port: u16,
config: ComputeCtlConfig,
compute_id: String,
},
Internal(u16),
External(u16),
}
impl Display for Server {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Server::Internal { .. } => f.write_str("internal"),
Server::External { .. } => f.write_str("external"),
Server::Internal(_) => f.write_str("internal"),
Server::External(_) => f.write_str("external"),
}
}
}
impl From<&Server> for Router<Arc<ComputeNode>> {
fn from(server: &Server) -> Self {
impl From<Server> for Router<Arc<ComputeNode>> {
fn from(server: Server) -> Self {
let mut router = Router::<Arc<ComputeNode>>::new();
router = match server {
Server::Internal { .. } => {
Server::Internal(_) => {
router = router
.route(
"/extension_server/{*filename}",
@@ -78,71 +69,59 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
router
}
Server::External {
config, compute_id, ..
} => {
let unauthenticated_router =
Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate))
.layer(AsyncRequireAuthorizationLayer::new(Authorize::new(
compute_id.clone(),
config.jwks.clone(),
)));
router
.merge(unauthenticated_router)
.merge(authenticated_router)
}
Server::External(_) => router
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))
.route("/metrics", get(metrics::get_metrics))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate)),
};
router
.fallback(Server::handle_404)
.method_not_allowed_fallback(Server::handle_405)
.layer(
ServiceBuilder::new()
.layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
// Add this middleware since we assume the request ID exists
.layer(middleware::from_fn(maybe_add_request_id_header))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {
let request_id = request
router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
ServiceBuilder::new()
// Add this middleware since we assume the request ID exists
.layer(middleware::from_fn(maybe_add_request_id_header))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {
let request_id = request
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
match request.uri().path() {
"/metrics" => {
debug!(%request_id, "{} {}", request.method(), request.uri())
}
_ => info!(%request_id, "{} {}", request.method(), request.uri()),
};
})
.on_response(
|response: &http::Response<_>, latency: Duration, _span: &Span| {
let request_id = response
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
info!(%request_id, "{} {}", request.method(), request.uri());
})
.on_response(
|response: &http::Response<_>, latency: Duration, _span: &Span| {
let request_id = response
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
info!(
%request_id,
code = response.status().as_u16(),
latency = latency.as_millis()
);
},
),
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
info!(
%request_id,
code = response.status().as_u16(),
latency = latency.as_millis()
)
},
),
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
.layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
}
}
@@ -166,15 +145,15 @@ impl Server {
match self {
// TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
// allow binding to localhost
Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
}
}
fn port(&self) -> u16 {
fn port(self) -> u16 {
match self {
Server::Internal { port, .. } => *port,
Server::External { port, .. } => *port,
Server::Internal(port) => port,
Server::External(port) => port,
}
}
@@ -201,9 +180,7 @@ impl Server {
);
}
let router = Router::from(&self)
.with_state(compute)
.into_make_service_with_connect_info::<SocketAddr>();
let router = Router::from(self).with_state(compute);
if let Err(e) = axum::serve(listener, router).await {
error!("compute_ctl {} HTTP server error: {}", self, e);
@@ -218,3 +195,15 @@ impl Server {
tokio::spawn(self.serve(state));
}
}
/// This middleware function allows compute_ctl to generate its own request ID
/// if one isn't supplied. The control plane will always send one as a UUID. The
/// neon Postgres extension on the other hand does not send one.
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
let headers = request.headers_mut();
if headers.get(X_REQUEST_ID).is_none() {
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
}
next.run(request).await
}

View File

@@ -2,7 +2,7 @@ use std::collections::HashMap;
use anyhow::Result;
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use tokio_postgres::{Client, Config, NoTls};
use postgres::{Client, NoTls};
use crate::metrics::INSTALLED_EXTENSIONS;
@@ -10,7 +10,7 @@ use crate::metrics::INSTALLED_EXTENSIONS;
/// and to make database listing query here more explicit.
///
/// Limit the number of databases to 500 to avoid excessive load.
async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
// `pg_database.datconnlimit = -2` means that the database is in the
// invalid state
let databases = client
@@ -20,8 +20,7 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
AND datconnlimit <> - 2
LIMIT 500",
&[],
)
.await?
)?
.iter()
.map(|row| {
let db: String = row.get("datname");
@@ -37,36 +36,20 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
/// Same extension can be installed in multiple databases with different versions,
/// so we report a separate metric (number of databases where it is installed)
/// for each extension version.
pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> {
pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
conf.application_name("compute_ctl:get_installed_extensions");
let databases: Vec<String> = {
let (mut client, connection) = conf.connect(NoTls).await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
list_dbs(&mut client).await?
};
let mut client = conf.connect(NoTls)?;
let databases: Vec<String> = list_dbs(&mut client)?;
let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new();
for db in databases.iter() {
conf.dbname(db);
let (client, connection) = conf.connect(NoTls).await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
let extensions: Vec<(String, String, i32)> = client
let mut db_client = conf.connect(NoTls)?;
let extensions: Vec<(String, String, i32)> = db_client
.query(
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
&[],
)
.await?
)?
.iter()
.map(|row| {
(

View File

@@ -21,9 +21,7 @@ mod migration;
pub mod monitor;
pub mod params;
pub mod pg_helpers;
pub mod rsyslog;
pub mod spec;
mod spec_apply;
pub mod swap;
pub mod sync_sk;
pub mod tls;

View File

@@ -1,5 +1,3 @@
use std::collections::HashMap;
use tracing::info;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
@@ -24,8 +22,7 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
.with_writer(std::io::stderr);
// Initialize OpenTelemetry
let otlp_layer =
tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default()).await;
let otlp_layer = tracing_utils::init_tracing("compute_ctl").await;
// Put it all together
tracing_subscriber::registry()
@@ -45,50 +42,3 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
pub fn inlinify(s: &str) -> String {
s.replace('\n', "\u{200B}")
}
pub fn startup_context_from_env() -> Option<opentelemetry::Context> {
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
//
// This is used to propagate the context for the 'start_compute' operation
// from the neon control plane. This allows linking together the wider
// 'start_compute' operation that creates the compute container, with the
// startup actions here within the container.
//
// There is no standard for passing context in env variables, but a lot of
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
//
// Switch to the startup context here, and exit it once the startup has
// completed and Postgres is up and running.
//
// If this pod is pre-created without binding it to any particular endpoint
// yet, this isn't the right place to enter the startup context. In that
// case, the control plane should pass the tracing context as part of the
// /configure API call.
//
// NOTE: This is supposed to only cover the *startup* actions. Once
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
//
// XXX: If the pod is restarted, we perform the startup actions in the same
// context as the original startup actions, which probably doesn't make
// sense.
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
if let Ok(val) = std::env::var("TRACEPARENT") {
startup_tracing_carrier.insert("traceparent".to_string(), val);
}
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry_sdk::propagation::TraceContextPropagator;
info!("got startup tracing context from env variables");
Some(TraceContextPropagator::new().extract(&startup_tracing_carrier))
} else {
None
}
}

View File

@@ -1,8 +1,6 @@
use metrics::core::{AtomicF64, Collector, GenericGauge};
use metrics::core::Collector;
use metrics::proto::MetricFamily;
use metrics::{
IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec,
};
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
use once_cell::sync::Lazy;
pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
@@ -56,16 +54,9 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(||
register_int_counter_vec!(
"compute_ctl_remote_ext_requests_total",
"Total number of requests made by compute_ctl to download extensions from S3 proxy by status",
&["http_status", "filename"]
)
.expect("failed to define a metric")
});
// Size of audit log directory in bytes
pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy<GenericGauge<AtomicF64>> = Lazy::new(|| {
register_gauge!(
"compute_audit_log_dir_size",
"Size of audit log directory in bytes",
// Do not use any labels like extension name yet.
// We can add them later if needed.
&["http_status"]
)
.expect("failed to define a metric")
});
@@ -75,6 +66,5 @@ pub fn collect() -> Vec<MetricFamily> {
metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
metrics.extend(DB_MIGRATION_FAILED.collect());
metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
metrics
}

View File

@@ -18,7 +18,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
// should be handled gracefully.
fn watch_compute_activity(compute: &ComputeNode) {
// Suppose that `connstr` doesn't change
let connstr = compute.params.connstr.clone();
let connstr = compute.connstr.clone();
let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));
// During startup and configuration we connect to every Postgres database,

View File

@@ -10,10 +10,8 @@ use std::str::FromStr;
use std::time::{Duration, Instant};
use anyhow::{Result, bail};
use compute_api::responses::TlsConfig;
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
use futures::StreamExt;
use indexmap::IndexMap;
use ini::Ini;
use notify::{RecursiveMode, Watcher};
use postgres::config::Config;
@@ -188,40 +186,15 @@ impl DatabaseExt for Database {
/// Postgres SQL queries and DATABASE_URL.
pub trait Escaping {
fn pg_quote(&self) -> String;
fn pg_quote_dollar(&self) -> (String, String);
}
impl Escaping for PgIdent {
/// This is intended to mimic Postgres quote_ident(), but for simplicity it
/// always quotes provided string with `""` and escapes every `"`.
/// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
/// N.B. it's not useful for escaping identifiers that are used inside WHERE
/// clause, use `escape_literal()` instead.
fn pg_quote(&self) -> String {
format!("\"{}\"", self.replace('"', "\"\""))
}
/// This helper is intended to be used for dollar-escaping strings for usage
/// inside PL/pgSQL procedures. In addition to dollar-escaping the string,
/// it also returns a tag that is intended to be used inside the outer
/// PL/pgSQL procedure. If you do not need an outer tag, just discard it.
/// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`,
/// <https://github.com/postgres/postgres/blob/8b49392b270b4ac0b9f5c210e2a503546841e832/src/backend/utils/adt/ruleutils.c#L2924>
fn pg_quote_dollar(&self) -> (String, String) {
let mut tag: String = "".to_string();
let mut outer_tag = "x".to_string();
// Find the first suitable tag that is not present in the string.
// Postgres' max role/DB name length is 63 bytes, so even in the
// worst case it won't take long.
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
tag += "x";
outer_tag = tag.clone() + "x";
}
let escaped = format!("${tag}${self}${tag}$");
(escaped, outer_tag)
let result = format!("\"{}\"", self.replace('"', "\"\""));
result
}
}
@@ -253,13 +226,10 @@ pub async fn get_existing_dbs_async(
// invalid state. See:
// https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
let rowstream = client
// We use a subquery instead of a fancy `datdba::regrole::text AS owner`,
// because the latter automatically wraps the result in double quotes,
// if the role name contains special characters.
.query_raw::<str, &String, &[String; 0]>(
"SELECT
datname AS name,
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
datdba::regrole::text AS owner,
NOT datallowconn AS restrict_conn,
datconnlimit = - 2 AS invalid
FROM
@@ -408,7 +378,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
/// Update pgbouncer.ini with provided options
fn update_pgbouncer_ini(
pgbouncer_config: IndexMap<String, String>,
pgbouncer_config: HashMap<String, String>,
pgbouncer_ini_path: &str,
) -> Result<()> {
let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
@@ -429,10 +399,7 @@ fn update_pgbouncer_ini(
/// Tune pgbouncer.
/// 1. Apply new config using pgbouncer admin console
/// 2. Add new values to pgbouncer.ini to preserve them after restart
pub async fn tune_pgbouncer(
mut pgbouncer_config: IndexMap<String, String>,
tls_config: Option<TlsConfig>,
) -> Result<()> {
pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
// for VMs use pgbouncer specific way to connect to
// pgbouncer admin console without password
@@ -478,21 +445,19 @@ pub async fn tune_pgbouncer(
}
};
if let Some(tls_config) = tls_config {
// pgbouncer starts in a half-ok state if it cannot find these files.
// It will default to client_tls_sslmode=deny, which causes proxy to error.
// There is a small window at startup where these files don't yet exist in the VM.
// Best to wait until it exists.
loop {
if let Ok(true) = tokio::fs::try_exists(&tls_config.key_path).await {
break;
}
tokio::time::sleep(Duration::from_millis(500)).await
}
// Apply new config
for (option_name, value) in pgbouncer_config.iter() {
let query = format!("SET {}={}", option_name, value);
// keep this log line for debugging purposes
info!("Applying pgbouncer setting change: {}", query);
pgbouncer_config.insert("client_tls_cert_file".to_string(), tls_config.cert_path);
pgbouncer_config.insert("client_tls_key_file".to_string(), tls_config.key_path);
pgbouncer_config.insert("client_tls_sslmode".to_string(), "allow".to_string());
if let Err(err) = client.simple_query(&query).await {
// Don't fail on error, just print it into log
error!(
"Failed to apply pgbouncer setting change: {}, {}",
query, err
);
};
}
// save values to pgbouncer.ini
@@ -508,13 +473,6 @@ pub async fn tune_pgbouncer(
};
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
info!("Applying pgbouncer setting change");
if let Err(err) = client.simple_query("RELOAD").await {
// Don't fail on error, just print it into log
error!("Failed to apply pgbouncer setting change, {err}",);
};
Ok(())
}

View File

@@ -1,138 +0,0 @@
use std::fs;
use std::path::Path;
use std::process::Command;
use std::time::Duration;
use std::{fs::OpenOptions, io::Write};
use anyhow::{Context, Result};
use tracing::{error, info, instrument, warn};
fn get_rsyslog_pid() -> Option<String> {
let output = Command::new("pgrep")
.arg("rsyslogd")
.output()
.expect("Failed to execute pgrep");
if !output.stdout.is_empty() {
let pid = std::str::from_utf8(&output.stdout)
.expect("Invalid UTF-8 in process output")
.trim()
.to_string();
Some(pid)
} else {
None
}
}
// Restart rsyslogd to apply the new configuration.
// This is necessary, because there is no other way to reload the rsyslog configuration.
//
// Rsyslogd shouldn't lose any messages, because of the restart,
// because it tracks the last read position in the log files
// and will continue reading from that position.
// TODO: test it properly
//
fn restart_rsyslog() -> Result<()> {
let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
info!("rsyslogd is running with pid: {}, restart it", old_pid);
// kill it to restart
let _ = Command::new("pkill")
.arg("rsyslogd")
.output()
.context("Failed to stop rsyslogd")?;
Ok(())
}
pub fn configure_audit_rsyslog(
log_directory: String,
tag: &str,
remote_endpoint: &str,
) -> Result<()> {
let config_content: String = format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
log_directory = log_directory,
tag = tag,
remote_endpoint = remote_endpoint
);
info!("rsyslog config_content: {}", config_content);
let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf";
let mut file = OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(rsyslog_conf_path)?;
file.write_all(config_content.as_bytes())?;
info!(
"rsyslog configuration file {} added successfully. Starting rsyslogd",
rsyslog_conf_path
);
// start the service, using the configuration
restart_rsyslog()?;
Ok(())
}
#[instrument(skip_all)]
async fn pgaudit_gc_main_loop(log_directory: String) -> Result<()> {
info!("running pgaudit GC main loop");
loop {
// Check log_directory for old pgaudit logs and delete them.
// New log files are checked every 5 minutes, as set in pgaudit.log_rotation_age
// Find files that were not modified in the last 15 minutes and delete them.
// This should be enough time for rsyslog to process the logs and for us to catch the alerts.
//
// In case of a very high load, we might need to adjust this value and pgaudit.log_rotation_age.
//
// TODO: add some smarter logic to delete the files that are fully streamed according to rsyslog
// imfile-state files, but for now just do a simple GC to avoid filling up the disk.
let _ = Command::new("find")
.arg(&log_directory)
.arg("-name")
.arg("audit*.log")
.arg("-mmin")
.arg("+15")
.arg("-delete")
.output()?;
// also collect the metric for the size of the log directory
async fn get_log_files_size(path: &Path) -> Result<u64> {
let mut total_size = 0;
for entry in fs::read_dir(path)? {
let entry = entry?;
let entry_path = entry.path();
if entry_path.is_file() && entry_path.to_string_lossy().ends_with("log") {
total_size += entry.metadata()?.len();
}
}
Ok(total_size)
}
let log_directory_size = get_log_files_size(Path::new(&log_directory))
.await
.unwrap_or_else(|e| {
warn!("Failed to get log directory size: {}", e);
0
});
crate::metrics::AUDIT_LOG_DIR_SIZE.set(log_directory_size as f64);
tokio::time::sleep(Duration::from_secs(60)).await;
}
}
// launch pgaudit GC thread to clean up the old pgaudit logs stored in the log_directory
pub fn launch_pgaudit_gc(log_directory: String) {
tokio::spawn(async move {
if let Err(e) = pgaudit_gc_main_loop(log_directory).await {
error!("pgaudit GC main loop failed: {}", e);
}
});
}

View File

@@ -6,22 +6,21 @@ use std::sync::Arc;
use anyhow::{Context, Result};
use compute_api::responses::ComputeStatus;
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role};
use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role};
use futures::future::join_all;
use tokio::sync::RwLock;
use tokio_postgres::Client;
use tokio_postgres::error::SqlState;
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
use crate::compute::{ComputeNode, ComputeState};
use crate::compute::{ComputeNode, ComputeState, construct_superuser_query};
use crate::pg_helpers::{
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async,
get_existing_roles_async,
};
use crate::spec_apply::ApplySpecPhase::{
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser,
CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon,
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon,
CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
RunInEachDatabase,
};
@@ -188,7 +187,7 @@ impl ComputeNode {
}
for phase in [
CreateNeonSuperuser,
CreateSuperUser,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -278,19 +277,6 @@ impl ComputeNode {
phases.push(FinalizeDropLogicalSubscriptions);
}
// Keep DisablePostgresDBPgAudit phase at the end,
// so that all config operations are audit logged.
match spec.audit_log_level
{
ComputeAudit::Hipaa => {
phases.push(CreatePgauditExtension);
phases.push(CreatePgauditlogtofileExtension);
phases.push(DisablePostgresDBPgAudit);
}
ComputeAudit::Log => { /* not implemented yet */ }
ComputeAudit::Disabled => {}
}
for phase in phases {
debug!("Applying phase {:?}", &phase);
apply_operations(
@@ -469,7 +455,7 @@ pub enum PerDatabasePhase {
#[derive(Clone, Debug)]
pub enum ApplySpecPhase {
CreateNeonSuperuser,
CreateSuperUser,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -477,9 +463,6 @@ pub enum ApplySpecPhase {
CreateAndAlterDatabases,
CreateSchemaNeon,
RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
CreatePgauditExtension,
CreatePgauditlogtofileExtension,
DisablePostgresDBPgAudit,
HandleOtherExtensions,
HandleNeonExtension,
CreateAvailabilityCheck,
@@ -596,10 +579,14 @@ async fn get_operations<'a>(
apply_spec_phase: &'a ApplySpecPhase,
) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
match apply_spec_phase {
ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation {
query: include_str!("sql/create_neon_superuser.sql").to_string(),
comment: None,
}))),
ApplySpecPhase::CreateSuperUser => {
let query = construct_superuser_query(spec);
Ok(Box::new(once(Operation {
query,
comment: None,
})))
}
ApplySpecPhase::DropInvalidDatabases => {
let mut ctx = ctx.write().await;
let databases = &mut ctx.dbs;
@@ -733,15 +720,14 @@ async fn get_operations<'a>(
// We do not check whether the DB exists or not,
// Postgres will take care of it for us
"delete_db" => {
let (db_name, outer_tag) = op.name.pg_quote_dollar();
// In Postgres we can't drop a database if it is a template.
// So we need to unset the template flag first, but it could
// be a retry, so we could've already dropped the database.
// Check that database exists first to make it idempotent.
let unset_template_query: String = format!(
include_str!("sql/unset_template_for_drop_dbs.sql"),
datname = db_name,
outer_tag = outer_tag,
datname_str = escape_literal(&op.name),
datname = &op.name.pg_quote()
);
// Use FORCE to drop database even if there are active connections.
@@ -848,8 +834,6 @@ async fn get_operations<'a>(
comment: None,
},
Operation {
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
query: format!(
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
db.name.pg_quote()
@@ -909,11 +893,9 @@ async fn get_operations<'a>(
PerDatabasePhase::DropLogicalSubscriptions => {
match &db {
DB::UserDB(db) => {
let (db_name, outer_tag) = db.name.pg_quote_dollar();
let drop_subscription_query: String = format!(
include_str!("sql/drop_subscriptions.sql"),
datname_str = db_name,
outer_tag = outer_tag,
datname_str = escape_literal(&db.name),
);
let operations = vec![Operation {
@@ -952,7 +934,6 @@ async fn get_operations<'a>(
DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
DB::UserDB(db) => db.owner.pg_quote(),
};
let (escaped_role, outer_tag) = op.name.pg_quote_dollar();
Some(vec![
// This will reassign all dependent objects to the db owner
@@ -967,9 +948,7 @@ async fn get_operations<'a>(
Operation {
query: format!(
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
role_name = escaped_role,
outer_tag = outer_tag,
role_name = quoted,
),
comment: None,
},
@@ -994,14 +973,12 @@ async fn get_operations<'a>(
DB::SystemDB => return Ok(Box::new(empty())),
DB::UserDB(db) => db,
};
let (db_owner, outer_tag) = db.owner.pg_quote_dollar();
let operations = vec![
Operation {
query: format!(
include_str!("sql/set_public_schema_owner.sql"),
db_owner = db_owner,
outer_tag = outer_tag,
db_owner = db.owner.pg_quote()
),
comment: None,
},
@@ -1121,25 +1098,6 @@ async fn get_operations<'a>(
}
Ok(Box::new(empty()))
}
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
comment: Some(String::from("create pgaudit extensions")),
}))),
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
comment: Some(String::from("create pgauditlogtofile extensions")),
}))),
// Disable pgaudit logging for postgres database.
// Postgres is neon system database used by monitors
// and compute_ctl tuning functions and thus generates a lot of noise.
// We do not consider data stored in this database as sensitive.
ApplySpecPhase::DisablePostgresDBPgAudit => {
let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'";
Ok(Box::new(once(Operation {
query: query.to_string(),
comment: Some(query.to_string()),
})))
}
ApplySpecPhase::HandleNeonExtension => {
let operations = vec![
Operation {

View File

@@ -1,8 +0,0 @@
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
END IF;
END
$$;

View File

@@ -1,4 +1,4 @@
DO ${outer_tag}$
DO $$
DECLARE
subname TEXT;
BEGIN
@@ -9,4 +9,4 @@ BEGIN
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
${outer_tag}$;
$$;

View File

@@ -17,5 +17,5 @@ BEGIN
INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id'))
ON CONFLICT (id) DO UPDATE
SET timeline_id = current_setting('neon.timeline_id');
END;
END
$$

View File

@@ -1,7 +1,8 @@
DO ${outer_tag}$
SET SESSION ROLE neon_superuser;
DO $$
DECLARE
schema TEXT;
grantor TEXT;
revoke_query TEXT;
BEGIN
FOR schema IN
@@ -14,25 +15,14 @@ BEGIN
-- ii) it's easy to add more schemas to the list if needed.
WHERE schema_name IN ('public')
LOOP
FOR grantor IN EXECUTE
format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
quote_literal({role_name})
)
LOOP
EXECUTE format('SET LOCAL ROLE %I', grantor);
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
schema
);
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
schema,
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
{role_name},
grantor
);
EXECUTE revoke_query;
END LOOP;
EXECUTE revoke_query;
END LOOP;
END;
${outer_tag}$;
$$;
RESET ROLE;

View File

@@ -1,4 +1,5 @@
DO ${outer_tag}$
DO
$$
DECLARE
schema_owner TEXT;
BEGIN
@@ -15,8 +16,8 @@ DO ${outer_tag}$
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
THEN
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
ALTER SCHEMA public OWNER TO {db_owner};
END IF;
END IF;
END
${outer_tag}$;
$$;

View File

@@ -1,12 +1,12 @@
DO ${outer_tag}$
DO $$
BEGIN
IF EXISTS(
SELECT 1
FROM pg_catalog.pg_database
WHERE datname = {datname}
WHERE datname = {datname_str}
)
THEN
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
ALTER DATABASE {datname} is_template false;
END IF;
END
${outer_tag}$;
$$;

View File

@@ -1,118 +0,0 @@
use std::{io::Write, os::unix::fs::OpenOptionsExt, path::Path, time::Duration};
use anyhow::{Context, Result, bail};
use compute_api::responses::TlsConfig;
use ring::digest;
use spki::ObjectIdentifier;
use spki::der::{Decode, PemReader};
use x509_cert::Certificate;
#[derive(Clone, Copy)]
pub struct CertDigest(digest::Digest);
pub async fn watch_cert_for_changes(cert_path: String) -> tokio::sync::watch::Receiver<CertDigest> {
let mut digest = compute_digest(&cert_path).await;
let (tx, rx) = tokio::sync::watch::channel(digest);
tokio::spawn(async move {
while !tx.is_closed() {
let new_digest = compute_digest(&cert_path).await;
if digest.0.as_ref() != new_digest.0.as_ref() {
digest = new_digest;
_ = tx.send(digest);
}
tokio::time::sleep(Duration::from_secs(60)).await
}
});
rx
}
async fn compute_digest(cert_path: &str) -> CertDigest {
loop {
match try_compute_digest(cert_path).await {
Ok(d) => break d,
Err(e) => {
tracing::error!("could not read cert file {e:?}");
tokio::time::sleep(Duration::from_secs(1)).await
}
}
}
}
async fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
let data = tokio::fs::read(cert_path).await?;
// sha256 is extremely collision resistent. can safely assume the digest to be unique
Ok(CertDigest(digest::digest(&digest::SHA256, &data)))
}
pub const SERVER_CRT: &str = "server.crt";
pub const SERVER_KEY: &str = "server.key";
pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) {
loop {
match try_update_key_path_blocking(pg_data, tls_config) {
Ok(()) => break,
Err(e) => {
tracing::error!("could not create key file {e:?}");
std::thread::sleep(Duration::from_secs(1))
}
}
}
}
// Postgres requires the keypath be "secure". This means
// 1. Owned by the postgres user.
// 2. Have permission 600.
fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Result<()> {
let key = std::fs::read_to_string(&tls_config.key_path)?;
let crt = std::fs::read_to_string(&tls_config.cert_path)?;
// to mitigate a race condition during renewal.
verify_key_cert(&key, &crt)?;
let mut key_file = std::fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.mode(0o600)
.open(pg_data.join(SERVER_KEY))?;
let mut crt_file = std::fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.mode(0o600)
.open(pg_data.join(SERVER_CRT))?;
key_file.write_all(key.as_bytes())?;
crt_file.write_all(crt.as_bytes())?;
Ok(())
}
fn verify_key_cert(key: &str, cert: &str) -> Result<()> {
const ECDSA_WITH_SHA256: ObjectIdentifier = ObjectIdentifier::new_unwrap("1.2.840.10045.4.3.2");
let cert = Certificate::decode(&mut PemReader::new(cert.as_bytes()).context("pem reader")?)
.context("decode cert")?;
match cert.signature_algorithm.oid {
ECDSA_WITH_SHA256 => {
let key = p256::SecretKey::from_sec1_pem(key).context("parse key")?;
let a = key.public_key().to_sec1_bytes();
let b = cert
.tbs_certificate
.subject_public_key_info
.subject_public_key
.raw_bytes();
if *a != *b {
bail!("private key file does not match certificate")
}
}
_ => bail!("unknown TLS key type"),
}
Ok(())
}

View File

@@ -61,23 +61,6 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
}
#[test]
fn ident_pg_quote_dollar() {
let test_cases = vec![
("name", ("$$name$$", "x")),
("name$$", ("$x$name$$$x$", "xx")),
("name$$$", ("$x$name$$$$x$", "xx")),
("name$$$$", ("$x$name$$$$$x$", "xx")),
("name$x$", ("$xx$name$x$$xx$", "xxx")),
];
for (input, expected) in test_cases {
let (escaped, tag) = PgIdent::from(input).pg_quote_dollar();
assert_eq!(escaped, expected.0);
assert_eq!(tag, expected.1);
}
}
#[test]
fn generic_options_search() {
let generic_options: GenericOptions = Some(vec![

View File

@@ -36,13 +36,10 @@ use pageserver_api::config::{
use pageserver_api::controller_api::{
NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest,
};
use pageserver_api::models::{
ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo};
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use safekeeper_api::membership::SafekeeperGeneration;
use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -599,15 +596,7 @@ struct EndpointStartCmdArgs {
#[clap(long = "pageserver-id")]
endpoint_pageserver_id: Option<NodeId>,
#[clap(
long,
help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
)]
safekeepers_generation: Option<u32>,
#[clap(
long,
help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
)]
#[clap(long)]
safekeepers: Option<String>,
#[clap(
@@ -628,9 +617,9 @@ struct EndpointStartCmdArgs {
)]
allow_multiple: bool,
#[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
#[arg(default_value = "90s")]
start_timeout: Duration,
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
}
#[derive(clap::Args)]
@@ -965,7 +954,6 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
id: pageserver_id,
listen_pg_addr: format!("127.0.0.1:{pg_port}"),
listen_http_addr: format!("127.0.0.1:{http_port}"),
listen_https_addr: None,
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
other: Default::default(),
@@ -979,8 +967,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
neon_distrib_dir: None,
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
storage_controller: None,
control_plane_hooks_api: None,
generate_local_ssl_certs: false,
control_plane_compute_hook_api: None,
}
};
@@ -1131,16 +1118,12 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
let tenant_id = get_tenant_id(args.tenant_id, env)?;
let tenant_conf: HashMap<_, _> =
args.config.iter().flat_map(|c| c.split_once(':')).collect();
let config = PageServerNode::parse_config(tenant_conf)?;
let req = TenantConfigRequest { tenant_id, config };
let storage_controller = StorageController::from_env(env);
storage_controller
.set_tenant_config(&req)
pageserver
.tenant_config(tenant_id, tenant_conf)
.await
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
println!("tenant {tenant_id} successfully configured via storcon");
println!("tenant {tenant_id} successfully configured on the pageserver");
}
}
Ok(())
@@ -1367,7 +1350,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
let pageserver_id = args.endpoint_pageserver_id;
let remote_ext_config = &args.remote_ext_config;
let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new);
// If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env.
let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? {
@@ -1443,13 +1425,11 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
endpoint
.start(
&auth_token,
safekeepers_generation,
safekeepers,
pageservers,
remote_ext_config.as_ref(),
stripe_size.0 as usize,
args.create_test_user,
args.start_timeout,
)
.await?;
}

View File

@@ -42,19 +42,17 @@ use std::path::PathBuf;
use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use anyhow::{Context, Result, anyhow, bail};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
use compute_api::spec::{
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
RemoteExtSpec, Role,
Cluster, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role,
};
use nix::sys::signal::{Signal, kill};
use pageserver_api::shard::ShardStripeSize;
use reqwest::header::CONTENT_TYPE;
use safekeeper_api::membership::SafekeeperGeneration;
use serde::{Deserialize, Serialize};
use tracing::debug;
use url::Host;
@@ -578,17 +576,14 @@ impl Endpoint {
Ok(safekeeper_connstrings)
}
#[allow(clippy::too_many_arguments)]
pub async fn start(
&self,
auth_token: &Option<String>,
safekeepers_generation: Option<SafekeeperGeneration>,
safekeepers: Vec<NodeId>,
pageservers: Vec<(Host, u16)>,
remote_ext_config: Option<&String>,
shard_stripe_size: usize,
create_test_user: bool,
start_timeout: Duration,
) -> Result<()> {
if self.status() == EndpointStatus::Running {
anyhow::bail!("The endpoint is already running");
@@ -660,7 +655,6 @@ impl Endpoint {
timeline_id: Some(self.timeline_id),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
@@ -669,7 +663,6 @@ impl Endpoint {
local_proxy_config: None,
reconfigure_concurrency: self.reconfigure_concurrency,
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
audit_log_level: ComputeAudit::Disabled,
};
// this strange code is needed to support respec() in tests
@@ -777,18 +770,17 @@ impl Endpoint {
std::fs::write(pidfile_path, pid.to_string())?;
// Wait for it to start
let mut attempt = 0;
const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
let start_at = Instant::now();
const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min
loop {
attempt += 1;
match self.get_status().await {
Ok(state) => {
match state.status {
ComputeStatus::Init => {
if Instant::now().duration_since(start_at) > start_timeout {
bail!(
"compute startup timed out {:?}; still in Init state",
start_timeout
);
if attempt == MAX_ATTEMPTS {
bail!("compute startup timed out; still in Init state");
}
// keep retrying
}
@@ -815,11 +807,8 @@ impl Endpoint {
}
}
Err(e) => {
if Instant::now().duration_since(start_at) > start_timeout {
return Err(e).context(format!(
"timed out {:?} waiting to connect to compute_ctl HTTP",
start_timeout,
));
if attempt == MAX_ATTEMPTS {
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
}
}
}

View File

@@ -72,19 +72,15 @@ pub struct LocalEnv {
// be propagated into each pageserver's configuration.
pub control_plane_api: Url,
// Control plane upcall APIs for storage controller. If set, this will be propagated into the
// Control plane upcall API for storage controller. If set, this will be propagated into the
// storage controller's configuration.
pub control_plane_hooks_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
/// Flag to generate SSL certificates for components that need it.
/// Also generates root CA certificate that is used to sign all other certificates.
pub generate_local_ssl_certs: bool,
}
/// On-disk state stored in `.neon/config`.
@@ -104,13 +100,8 @@ pub struct OnDiskConfig {
pub pageservers: Vec<PageServerConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
// Note: skip serializing because in compat tests old storage controller fails
// to load new config file. May be removed after this field is in release branch.
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub generate_local_ssl_certs: bool,
}
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
@@ -137,8 +128,7 @@ pub struct NeonLocalInitConf {
pub pageservers: Vec<NeonLocalInitPageserverConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub generate_local_ssl_certs: bool,
pub control_plane_compute_hook_api: Option<Option<Url>>,
}
/// Broker config for cluster internal communication.
@@ -149,7 +139,7 @@ pub struct NeonBroker {
pub listen_addr: SocketAddr,
}
/// A part of storage controller's config the neon_local knows about.
/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct NeonStorageControllerConf {
@@ -175,12 +165,6 @@ pub struct NeonStorageControllerConf {
#[serde(with = "humantime_serde")]
pub long_reconcile_threshold: Option<Duration>,
pub use_https_pageserver_api: bool,
pub timelines_onto_safekeepers: bool,
pub use_https_safekeeper_api: bool,
}
impl NeonStorageControllerConf {
@@ -204,9 +188,6 @@ impl Default for NeonStorageControllerConf {
max_secondary_lag_bytes: None,
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
long_reconcile_threshold: None,
use_https_pageserver_api: false,
timelines_onto_safekeepers: false,
use_https_safekeeper_api: false,
}
}
}
@@ -236,7 +217,6 @@ pub struct PageServerConf {
pub id: NodeId,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub no_sync: bool,
@@ -248,7 +228,6 @@ impl Default for PageServerConf {
id: NodeId(0),
listen_pg_addr: String::new(),
listen_http_addr: String::new(),
listen_https_addr: None,
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
no_sync: false,
@@ -264,7 +243,6 @@ pub struct NeonLocalInitPageserverConf {
pub id: NodeId,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
@@ -279,7 +257,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
id,
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -289,7 +266,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
id: *id,
listen_pg_addr: listen_pg_addr.clone(),
listen_http_addr: listen_http_addr.clone(),
listen_https_addr: listen_https_addr.clone(),
pg_auth_type: *pg_auth_type,
http_auth_type: *http_auth_type,
no_sync: *no_sync,
@@ -304,7 +280,6 @@ pub struct SafekeeperConf {
pub pg_port: u16,
pub pg_tenant_only_port: Option<u16>,
pub http_port: u16,
pub https_port: Option<u16>,
pub sync: bool,
pub remote_storage: Option<String>,
pub backup_threads: Option<u32>,
@@ -319,7 +294,6 @@ impl Default for SafekeeperConf {
pg_port: 0,
pg_tenant_only_port: None,
http_port: 0,
https_port: None,
sync: true,
remote_storage: None,
backup_threads: None,
@@ -436,41 +410,6 @@ impl LocalEnv {
}
}
pub fn ssl_ca_cert_path(&self) -> Option<PathBuf> {
if self.generate_local_ssl_certs {
Some(self.base_data_dir.join("rootCA.crt"))
} else {
None
}
}
pub fn ssl_ca_key_path(&self) -> Option<PathBuf> {
if self.generate_local_ssl_certs {
Some(self.base_data_dir.join("rootCA.key"))
} else {
None
}
}
pub fn generate_ssl_ca_cert(&self) -> anyhow::Result<()> {
let cert_path = self.ssl_ca_cert_path().unwrap();
let key_path = self.ssl_ca_key_path().unwrap();
if !fs::exists(cert_path.as_path())? {
generate_ssl_ca_cert(cert_path.as_path(), key_path.as_path())?;
}
Ok(())
}
pub fn generate_ssl_cert(&self, cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
self.generate_ssl_ca_cert()?;
generate_ssl_cert(
cert_path,
key_path,
self.ssl_ca_cert_path().unwrap().as_path(),
self.ssl_ca_key_path().unwrap().as_path(),
)
}
/// Inspect the base data directory and extract the instance id and instance directory path
/// for all storage controller instances
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
@@ -578,10 +517,8 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api,
control_plane_hooks_api,
control_plane_compute_hook_api: _,
control_plane_compute_hook_api,
branch_name_mappings,
generate_local_ssl_certs,
} = on_disk_config;
LocalEnv {
base_data_dir: repopath.to_owned(),
@@ -594,9 +531,8 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api: control_plane_api.unwrap(),
control_plane_hooks_api,
control_plane_compute_hook_api,
branch_name_mappings,
generate_local_ssl_certs,
}
};
@@ -632,7 +568,6 @@ impl LocalEnv {
struct PageserverConfigTomlSubset {
listen_pg_addr: String,
listen_http_addr: String,
listen_https_addr: Option<String>,
pg_auth_type: AuthType,
http_auth_type: AuthType,
#[serde(default)]
@@ -657,7 +592,6 @@ impl LocalEnv {
let PageserverConfigTomlSubset {
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -675,7 +609,6 @@ impl LocalEnv {
},
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -701,10 +634,8 @@ impl LocalEnv {
pageservers: vec![], // it's skip_serializing anyway
safekeepers: self.safekeepers.clone(),
control_plane_api: Some(self.control_plane_api.clone()),
control_plane_hooks_api: self.control_plane_hooks_api.clone(),
control_plane_compute_hook_api: None,
control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
branch_name_mappings: self.branch_name_mappings.clone(),
generate_local_ssl_certs: self.generate_local_ssl_certs,
},
)
}
@@ -786,8 +717,7 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api,
generate_local_ssl_certs,
control_plane_hooks_api,
control_plane_compute_hook_api,
} = conf;
// Find postgres binaries.
@@ -834,24 +764,16 @@ impl LocalEnv {
pageservers: pageservers.iter().map(Into::into).collect(),
safekeepers,
control_plane_api: control_plane_api.unwrap(),
control_plane_hooks_api,
control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
branch_name_mappings: Default::default(),
generate_local_ssl_certs,
};
if generate_local_ssl_certs {
env.generate_ssl_ca_cert()?;
}
// create endpoints dir
fs::create_dir_all(env.endpoints_path())?;
// create safekeeper dirs
for safekeeper in &env.safekeepers {
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
SafekeeperNode::from_env(&env, safekeeper)
.initialize()
.context("safekeeper init failed")?;
}
// initialize pageserver state
@@ -929,80 +851,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
}
Ok(())
}
fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
// openssl req -x509 -newkey rsa:2048 -nodes -subj "/CN=Neon Local CA" -days 36500 \
// -out rootCA.crt -keyout rootCA.key
let keygen_output = Command::new("openssl")
.args([
"req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
])
.args(["-subj", "/CN=Neon Local CA"])
.args(["-out", cert_path.to_str().unwrap()])
.args(["-keyout", key_path.to_str().unwrap()])
.output()
.context("failed to generate CA certificate")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
Ok(())
}
fn generate_ssl_cert(
cert_path: &Path,
key_path: &Path,
ca_cert_path: &Path,
ca_key_path: &Path,
) -> anyhow::Result<()> {
// Generate Certificate Signing Request (CSR).
let mut csr_path = cert_path.to_path_buf();
csr_path.set_extension(".csr");
// openssl req -new -nodes -newkey rsa:2048 -keyout server.key -out server.csr \
// -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
let keygen_output = Command::new("openssl")
.args(["req", "-new", "-nodes"])
.args(["-newkey", "rsa:2048"])
.args(["-subj", "/CN=localhost"])
.args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
.args(["-keyout", key_path.to_str().unwrap()])
.args(["-out", csr_path.to_str().unwrap()])
.output()
.context("failed to generate CSR")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
// Sign CSR with CA key.
//
// openssl x509 -req -in server.csr -CA rootCA.crt -CAkey rootCA.key -CAcreateserial \
// -out server.crt -days 36500 -copy_extensions copyall
let keygen_output = Command::new("openssl")
.args(["x509", "-req"])
.args(["-in", csr_path.to_str().unwrap()])
.args(["-CA", ca_cert_path.to_str().unwrap()])
.args(["-CAkey", ca_key_path.to_str().unwrap()])
.arg("-CAcreateserial")
.args(["-out", cert_path.to_str().unwrap()])
.args(["-days", "36500"])
.args(["-copy_extensions", "copyall"])
.output()
.context("failed to sign CSR")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
// Remove CSR file as it's not needed anymore.
fs::remove_file(csr_path)?;
Ok(())
}

View File

@@ -21,7 +21,6 @@ use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
use postgres_connection::{PgConnectionConfig, parse_host_port};
use reqwest::Certificate;
use utils::auth::{Claims, Scope};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -50,29 +49,12 @@ impl PageServerNode {
let (host, port) =
parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let port = port.unwrap_or(5432);
let ssl_ca_cert = env.ssl_ca_cert_path().map(|ssl_ca_file| {
let buf = std::fs::read(ssl_ca_file).expect("SSL root CA file should exist");
Certificate::from_pem(&buf).expect("CA certificate should be valid")
});
let endpoint = if env.storage_controller.use_https_pageserver_api {
format!(
"https://{}",
conf.listen_https_addr.as_ref().expect(
"listen https address should be specified if use_https_pageserver_api is on"
)
)
} else {
format!("http://{}", conf.listen_http_addr)
};
Self {
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
conf: conf.clone(),
env: env.clone(),
http_client: mgmt_api::Client::new(
endpoint,
format!("http://{}", conf.listen_http_addr),
{
match conf.http_auth_type {
AuthType::Trust => None,
@@ -83,9 +65,7 @@ impl PageServerNode {
}
}
.as_deref(),
ssl_ca_cert,
)
.expect("Client constructs with no errors"),
),
}
}
@@ -240,13 +220,6 @@ impl PageServerNode {
.context("write identity toml")?;
drop(identity_toml);
if self.env.generate_local_ssl_certs {
self.env.generate_ssl_cert(
datadir.join("server.crt").as_path(),
datadir.join("server.key").as_path(),
)?;
}
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
// Write metadata file, used by pageserver on startup to register itself with
@@ -257,15 +230,6 @@ impl PageServerNode {
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
let http_port = http_port.unwrap_or(9898);
let https_port = match self.conf.listen_https_addr.as_ref() {
Some(https_addr) => {
let (_https_host, https_port) =
parse_host_port(https_addr).expect("Unable to parse listen_https_addr");
Some(https_port.unwrap_or(9899))
}
None => None,
};
// Intentionally hand-craft JSON: this acts as an implicit format compat test
// in case the pageserver-side structure is edited, and reflects the real life
// situation: the metadata is written by some other script.
@@ -276,7 +240,6 @@ impl PageServerNode {
postgres_port: self.pg_connection_config.port(),
http_host: "localhost".to_string(),
http_port,
https_port,
other: HashMap::from([(
"availability_zone_id".to_string(),
serde_json::json!(az_id),

View File

@@ -111,18 +111,6 @@ impl SafekeeperNode {
.expect("non-Unicode path")
}
/// Initializes a safekeeper node by creating all necessary files,
/// e.g. SSL certificates.
pub fn initialize(&self) -> anyhow::Result<()> {
if self.env.generate_local_ssl_certs {
self.env.generate_ssl_cert(
&self.datadir_path().join("server.crt"),
&self.datadir_path().join("server.key"),
)?;
}
Ok(())
}
pub async fn start(
&self,
extra_opts: &[String],
@@ -208,16 +196,6 @@ impl SafekeeperNode {
]);
}
if let Some(https_port) = self.conf.https_port {
args.extend([
"--listen-https".to_owned(),
format!("{}:{}", self.listen_addr, https_port),
]);
}
if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() {
args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
}
args.extend_from_slice(extra_opts);
background_process::start_process(

View File

@@ -12,10 +12,13 @@ use hyper0::Uri;
use nix::unistd::Pid;
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
TenantCreateResponse, TenantLocateResponse,
TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest,
TenantShardMigrateResponse,
};
use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
use pageserver_api::shard::TenantShardId;
use pageserver_api::models::{
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use reqwest::Method;
@@ -534,18 +537,6 @@ impl StorageController {
args.push("--start-as-candidate".to_string());
}
if self.config.use_https_pageserver_api {
args.push("--use-https-pageserver-api".to_string());
}
if self.config.use_https_safekeeper_api {
args.push("--use-https-safekeeper-api".to_string());
}
if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() {
args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
}
if let Some(private_key) = &self.private_key {
let claims = Claims::new(None, Scope::PageServerApi);
let jwt_token =
@@ -562,8 +553,10 @@ impl StorageController {
args.push(format!("--public-key=\"{public_key}\""));
}
if let Some(control_plane_hooks_api) = &self.env.control_plane_hooks_api {
args.push(format!("--control-plane-url={control_plane_hooks_api}"));
if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api {
args.push(format!(
"--compute-hook-url={control_plane_compute_hook_api}"
));
}
if let Some(split_threshold) = self.config.split_threshold.as_ref() {
@@ -586,10 +579,6 @@ impl StorageController {
self.env.base_data_dir.display()
));
if self.config.timelines_onto_safekeepers {
args.push("--timelines-onto-safekeepers".to_string());
}
background_process::start_process(
COMMAND,
&instance_dir,
@@ -836,6 +825,41 @@ impl StorageController {
.await
}
#[instrument(skip(self))]
pub async fn tenant_migrate(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
) -> anyhow::Result<TenantShardMigrateResponse> {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest {
node_id,
migration_config: None,
}),
)
.await
}
#[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
pub async fn tenant_split(
&self,
tenant_id: TenantId,
new_shard_count: u8,
new_stripe_size: Option<ShardStripeSize>,
) -> anyhow::Result<TenantShardSplitResponse> {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/shard_split"),
Some(TenantShardSplitRequest {
new_shard_count,
new_stripe_size,
}),
)
.await
}
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
@@ -880,9 +904,4 @@ impl StorageController {
)
.await
}
pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> anyhow::Result<()> {
self.dispatch(Method::PUT, "v1/tenant/config".to_string(), Some(req))
.await
}
}

View File

@@ -1,21 +1,20 @@
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
use clap::{Parser, Subcommand};
use futures::StreamExt;
use pageserver_api::controller_api::{
AvailabilityZone, MigrationConfig, NodeAvailabilityWrapper, NodeConfigureRequest,
NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse,
PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
TenantShardMigrateRequest, TenantShardMigrateResponse,
AvailabilityZone, NodeAvailabilityWrapper, NodeConfigureRequest, NodeDescribeResponse,
NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse, PlacementPolicy,
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest,
TenantDescribeResponse, TenantPolicyRequest, TenantShardMigrateRequest,
TenantShardMigrateResponse,
};
use pageserver_api::models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest,
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, ShardParameters,
TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest,
TenantShardSplitResponse,
};
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
@@ -113,15 +112,6 @@ enum Command {
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
#[arg(long, default_value_t = true, action = clap::ArgAction::Set)]
prewarm: bool,
#[arg(long, default_value_t = false, action = clap::ArgAction::Set)]
override_scheduler: bool,
},
/// Watch the location of a tenant shard evolve, e.g. while expecting it to migrate
TenantShardWatch {
#[arg(long)]
tenant_shard_id: TenantShardId,
},
/// Migrate the secondary location for a tenant shard to a specific pageserver.
TenantShardMigrateSecondary {
@@ -158,6 +148,12 @@ enum Command {
#[arg(long)]
tenant_id: TenantId,
},
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
/// mode so that it can warm up content on a pageserver.
TenantWarmup {
#[arg(long)]
tenant_id: TenantId,
},
TenantSetPreferredAz {
#[arg(long)]
tenant_id: TenantId,
@@ -273,10 +269,6 @@ struct Cli {
/// a token with both scopes to use with this tool.
jwt: Option<String>,
#[arg(long)]
/// Trusted root CA certificate to use in https APIs.
ssl_ca_file: Option<PathBuf>,
#[command(subcommand)]
command: Command,
}
@@ -387,17 +379,9 @@ async fn main() -> anyhow::Result<()> {
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
let ssl_ca_cert = match &cli.ssl_ca_file {
Some(ssl_ca_file) => {
let buf = tokio::fs::read(ssl_ca_file).await?;
Some(reqwest::Certificate::from_pem(&buf)?)
}
None => None,
};
let mut trimmed = cli.api.to_string();
trimmed.pop();
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref(), ssl_ca_cert)?;
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
match cli.command {
Command::NodeRegister {
@@ -635,43 +619,19 @@ async fn main() -> anyhow::Result<()> {
Command::TenantShardMigrate {
tenant_shard_id,
node,
prewarm,
override_scheduler,
} => {
let migration_config = MigrationConfig {
prewarm,
override_scheduler,
..Default::default()
};
let req = TenantShardMigrateRequest {
node_id: node,
origin_node_id: None,
migration_config,
migration_config: None,
};
match storcon_client
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(req),
)
.await
{
Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) => {
anyhow::bail!(
"Migration to {node} rejected, may require `--force` ({}) ",
msg
);
}
Err(e) => return Err(e.into()),
Ok(_) => {}
}
watch_tenant_shard(storcon_client, tenant_shard_id, Some(node)).await?;
}
Command::TenantShardWatch { tenant_shard_id } => {
watch_tenant_shard(storcon_client, tenant_shard_id, None).await?;
.await?;
}
Command::TenantShardMigrateSecondary {
tenant_shard_id,
@@ -679,8 +639,7 @@ async fn main() -> anyhow::Result<()> {
} => {
let req = TenantShardMigrateRequest {
node_id: node,
origin_node_id: None,
migration_config: MigrationConfig::default(),
migration_config: None,
};
storcon_client
@@ -865,6 +824,94 @@ async fn main() -> anyhow::Result<()> {
)
.await?;
}
Command::TenantWarmup { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await;
match describe_response {
Ok(describe) => {
if matches!(describe.policy, PlacementPolicy::Secondary) {
// Fine: it's already known to controller in secondary mode: calling
// again to put it into secondary mode won't cause problems.
} else {
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
}
}
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
// Fine: this tenant isn't know to the storage controller yet.
}
Err(e) => {
// Unexpected API error
return Err(e.into());
}
}
vps_client
.location_config(
TenantShardId::unsharded(tenant_id),
pageserver_api::models::LocationConfig {
mode: pageserver_api::models::LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: 0,
shard_count: 0,
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
tenant_conf: TenantConfig::default(),
},
None,
true,
)
.await?;
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let secondary_ps_id = describe_response
.shards
.first()
.unwrap()
.node_secondary
.first()
.unwrap();
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
loop {
let (status, progress) = vps_client
.tenant_secondary_download(
TenantShardId::unsharded(tenant_id),
Some(Duration::from_secs(10)),
)
.await?;
println!(
"Progress: {}/{} layers, {}/{} bytes",
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
match status {
StatusCode::OK => {
println!("Download complete");
break;
}
StatusCode::ACCEPTED => {
// Loop
}
_ => {
anyhow::bail!("Unexpected download status: {status}");
}
}
}
}
Command::TenantDrop { tenant_id, unclean } => {
if !unclean {
anyhow::bail!(
@@ -1058,8 +1105,7 @@ async fn main() -> anyhow::Result<()> {
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest {
node_id: mv.to,
origin_node_id: Some(mv.from),
migration_config: MigrationConfig::default(),
migration_config: None,
}),
)
.await
@@ -1238,68 +1284,3 @@ async fn main() -> anyhow::Result<()> {
Ok(())
}
static WATCH_INTERVAL: Duration = Duration::from_secs(5);
async fn watch_tenant_shard(
storcon_client: Client,
tenant_shard_id: TenantShardId,
until_migrated_to: Option<NodeId>,
) -> anyhow::Result<()> {
if let Some(until_migrated_to) = until_migrated_to {
println!(
"Waiting for tenant shard {} to be migrated to node {}",
tenant_shard_id, until_migrated_to
);
}
loop {
let desc = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{}", tenant_shard_id.tenant_id),
None,
)
.await?;
// Output the current state of the tenant shard
let shard = desc
.shards
.iter()
.find(|s| s.tenant_shard_id == tenant_shard_id)
.ok_or(anyhow::anyhow!("Tenant shard not found"))?;
let summary = format!(
"attached: {} secondary: {} {}",
shard
.node_attached
.map(|n| format!("{}", n))
.unwrap_or("none".to_string()),
shard
.node_secondary
.iter()
.map(|n| n.to_string())
.collect::<Vec<_>>()
.join(","),
if shard.is_reconciling {
"(reconciler active)"
} else {
"(reconciler idle)"
}
);
println!("{}", summary);
// Maybe drop out if we finished migration
if let Some(until_migrated_to) = until_migrated_to {
if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling {
println!(
"Tenant shard {} is now on node {}",
tenant_shard_id, until_migrated_to
);
break;
}
}
tokio::time::sleep(WATCH_INTERVAL).await;
}
Ok(())
}

View File

@@ -27,10 +27,6 @@ yanked = "warn"
id = "RUSTSEC-2023-0071"
reason = "the marvin attack only affects private key decryption, not public key signature verification"
[[advisories.ignore]]
id = "RUSTSEC-2024-0436"
reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact."
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html

View File

@@ -6,11 +6,8 @@ generate_id() {
local -n resvar=$1
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
}
echo "${OLD_COMPUTE_TAG}"
echo "${NEW_COMPUTE_TAG}"
echo "${TEST_EXTENSIONS_TAG}"
if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then
echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set
if [ -z ${OLD_COMPUTE_TAG+x} ] || [ -z ${NEW_COMPUTE_TAG+x} ] || [ -z "${OLD_COMPUTE_TAG}" ] || [ -z "${NEW_COMPUTE_TAG}" ]; then
echo OLD_COMPUTE_TAG and NEW_COMPUTE_TAG must be defined
exit 1
fi
export PG_VERSION=${PG_VERSION:-16}
@@ -85,7 +82,7 @@ EXTENSIONS='[
{"extname": "pg_repack", "extdir": "pg_repack-src"}
]'
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
COMPUTE_TAG=${NEW_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
wait_for_ready
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
@@ -93,7 +90,7 @@ create_extensions "${EXTNAMES}"
query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')"
new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
docker compose --profile test-extensions down
COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
COMPUTE_TAG=${OLD_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
wait_for_ready
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"

View File

@@ -1,201 +0,0 @@
# Sparse Keyspace for Relation Directories
## Summary
This is an RFC describing a new storage strategy for storing relation directories.
## Motivation
Postgres maintains a directory structure for databases and relations. In Neon, we store these information
by serializing the directory data in a single key (see `pgdatadir_mapping.rs`).
```rust
// DbDir:
// 00 00000000 00000000 00000000 00 00000000
// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
```
We have a dedicated structure on the ingestion path to serialize the relation directory into this single key.
```rust
#[derive(Debug, Serialize, Deserialize, Default)]
pub(crate) struct RelDirectory {
// Set of relations that exist. (relfilenode, forknum)
//
// TODO: Store it as a btree or radix tree or something else that spans multiple
// key-value pairs, if you have a lot of relations
pub(crate) rels: HashSet<(Oid, u8)>,
}
```
The current codebase has the following three access patterns for the relation directory.
1. Check if a relation exists.
2. List all relations.
3. Create/drop a relation.
For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the
hash set. For (2), we get the reldir key and the hash set. For (3), we need first to get
and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back.
If we have 100k relations in a database, we would have a 100k-large hash set. Then, every
relation created and dropped would have deserialized and serialized this 100k-large hash set. This makes the
relation create/drop process to be quadratic. When we check if a relation exists in the ingestion path,
we would have to deserialize this super big 100k-large key before checking if a single relation exists.
In this RFC, we will propose a new way to store the reldir data in the sparse keyspace and propose how
to seamlessly migrate users to use the new keyspace.
The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316).
## Key Mapping
We will use the recently introduced sparse keyspace to store actual data. Sparse keyspace was proposed in
[038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>`
for each of the databases (identified as `spcnode, dbnode`). We encode the `Oid` (`relnode, forknum`),
into the key.
```plain
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists
```
Assume all reldir data are stored in this new keyspace; the 3 reldir operations we mentioned before can be
implemented as follows.
1. Check if a relation exists: check if the key maps to "exists".
2. List all relations: scan the sprase keyspace over the `rel_dir_key_prefix`. Extract relnode and forknum from the key.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will
be removed during image layer generation upon compaction.
Note that "exists" and "deleted" will be encoded as a single byte as two variants of an enum.
The mapping is implemented as `rel_tag_sparse_key` in the PoC patch.
## Changes to Sparse Keyspace
Previously, we only used sparse keyspaces for the aux files, which did not carry over when branching. The reldir
information needs to be preserved from the parent branch to the child branch. Therefore, the read path needs
to be updated accordingly to accommodate such "inherited sparse keys". This is done in
[PR#10313](https://github.com/neondatabase/neon/pull/10313).
## Coexistence of the Old and New Keyspaces
Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the
ingestion path will start to write to the new keyspace and the old reldir data will be kept in the old one. The read
path needs to combine the data from both keyspaces.
Theoretically, we could do a rewrite at the startup time that scans all relation directories and copies that data into the
new keyspace. However, this could take a long time, especially if we have thousands of tenants doing the migration
process simultaneously after the pageserver restarts. Therefore, we propose the coexistence strategy so that the
migration can happen seamlessly and imposes no potential downtime for the user.
With the coexistence assumption, the 3 reldir operations will be implemented as follows:
1. Check if a relation exists
- Check the new keyspace if the key maps to any value. If it maps to "exists" or "deleted", directly
return it to the user.
- Otherwise, deserialize the old reldir key and get the result.
2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key.
Combine them to obtain the final result.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation into the new keyspace.
- We assume no overwrite of relations will happen (i.e., the user won't create a relation at the same Oid). This will be implemented as a runtime check.
- For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace.
- For relation drop, we first check if the relation is recorded in the old keyspace. If yes, we deserialize the old reldir key,
remove the relation, and then write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` to the keyspace.
- The delete tombstone will be removed during image layer generation upon compaction.
This process ensures that the transition will not introduce any downtime and all new updates are written to the new keyspace. The total
amount of data in the storage would be `O(relations_modifications)` and we can guarantee `O(current_relations)` after compaction.
There could be some relations that exist in the old reldir key for a long time. Refer to the "Full Migration" section on how to deal
with them. Plus, for relation modifications, it will have `O(old_relations)` complexity until we do the full migration, which gives
us `O(1)` complexity after fully opt-in the sparse keyspace.
The process also implies that a relation will only exists either in the old reldir key or in the new sparse keyspace. It is not possible
to have a table to be recorded in the old reldir key while later having a delete tombstone for it in the sparse keyspace at any LSN.
We will introduce a config item and an index_part record to record the current status of the migration process.
- Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace.
- `index_part.json` field `reldir_v2_status`: whether the timeline has written any key into the new reldir keyspace.
If `enable_reldir_v2` is set to `true` and the timeline ingests the first key into the new reldir keyspace, it will update
`index_part.json` to set `reldir_v2_status` to `Status::Migrating`. Even if `enable_reldir_v2` gets flipped back to
`false` (i.e., when the pageserver restarts and such config isn't persisted), the read/write path will still
read/write to the new keyspace to avoid data inconsistency. This also indicates that the migration is one-way only:
once v2 is enabled, the user cannot go back to v1.
## Next Steps
### Full Migration
This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and
v2 existing in the system would force us to keep the code to deserialize the old reldir key forever. To entirely deprecate this
code path, we must ensure the timeline has no old reldir data.
We can trigger a special image layer generation process at the gc-horizon. The generated image layers will cover several keyspaces:
the old reldir key in each of the databases, and the new reldir sparse keyspace. It will remove the old reldir key while
copying them into the corresponding keys in the sparse keyspace in the resulting image. This special process happens in
the background during compaction. For example, assume this special process is triggered at LSN 0/180. The `create_image_layers`
process discovers the following keys at this LSN.
```plain
db1/reldir_key -> (table 1, table 2, table 3)
...db1 rel keys
db2/reldir_key -> (table 4, table 5, table 6)
...db2 rel keys
sparse_reldir_db2_table7 -> exists
sparse_reldir_db1_table8 -> deleted
```
It will generate the following keys:
```plain
db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`.
...db1 rel keys
db2/reldir_key -> ()
...db2 rel keys
-- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180
sparse_reldir_db1_table1 -> exists
sparse_reldir_db1_table2 -> exists
sparse_reldir_db1_table3 -> exists
sparse_reldir_db2_table4 -> exists
sparse_reldir_db2_table5 -> exists
sparse_reldir_db2_table6 -> exists
sparse_reldir_db2_table7 -> exists
-- end image layer for the sparse keyspace at sparse_reldir_prefix+1
# The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace.
# Note that the read path will stop reading if a key is not found in the image layer covering the key range so there
# are no correctness issue.
```
We must verify that no pending modifications to the old reldir exists in the delta/image layers above the gc-horizon before
we start this process (We can do a vectored read to get the full key history of the old reldir key and ensure there are no more images
above the gc-horizon). Otherwise, it will violate the property that "a relation will only exists either in the old reldir key or
in the new sparse keyspace". After we run this migration process, we can mark `reldir_v2_status` in the `index_part.json` to
`Status::Migrated`, and the read path won't need to read from the old reldir anymore. Once the status is set to `Migrated`, we
don't need to add the key into `collect_keyspace` and therefore all of them will be removed from all future image layers.
The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code.
### Consolidate Relation Size Keys
We have relsize at the end of all relation nodes.
```plain
// RelSize:
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
```
This means that computing logical size requires us to do several single-key gets across the keyspace,
potentially requiring downloading many layer files. We could consolidate them into a single
keyspace, improving logical size calculation performance.
### Migrate DBDir Keys
We assume the number of databases created by the users will be small, and therefore, the current way
of storing the database directory would be acceptable. In the future, we could also migrate DBDir keys into
the sparse keyspace to support large amount of databases.

View File

@@ -101,25 +101,15 @@ changes such as a pageserver node becoming unavailable, or the tenant's shard co
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.
The hook is configured using the storage controller's `--control-plane-url` CLI option, from which the hook URL is computed.
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
Currently, there is two hooks, each computed by appending the name to the provided control plane URL prefix:
- `notify-attach`, called whenever attachment for pageservers changes
- `notify-safekeepers`, called whenever attachment for safekeepers changes
If the hooks require JWT auth, the token may be provided with `--control-plane-jwt-token`.
The hooks will be invoked with a `PUT` request.
In the Neon cloud service, these hooks are implemented by Neon's internal cloud control plane. In `neon_local` systems,
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hooks. This is not complicated.
### `notify-attach` body
The `notify-attach` request body follows the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
```
struct ComputeHookNotifyRequestShard {
@@ -138,15 +128,15 @@ When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.pageserver_connstring` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
shards identified by `NodeId` must be converted to the address+port of the node.
- if stripe_size is not None, set `neon.shard_stripe_size` to this value
- if stripe_size is not None, set `neon.stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds..
Example body:
### Example notification body
```
{
@@ -158,34 +148,3 @@ Example body:
],
}
```
### `notify-safekeepers` body
The `notify-safekeepers` request body forllows the format of the `SafekeepersNotifyRequest` structure, provided below for convenience.
```
pub struct SafekeeperInfo {
pub id: NodeId,
pub hostname: String,
}
pub struct SafekeepersNotifyRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub generation: u32,
pub safekeepers: Vec<SafekeeperInfo>,
}
```
When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.safekeeper_connstrings` to an array of postgres connection strings to safekeepers according to the `safekeepers` list. The
safekeepers identified by `NodeId` must be converted to the address+port of the respective safekeeper.
The hostname is provided for debugging purposes, so we reserve changes to how we pass it.
- set `neon.safekeepers_generation` to the provided `generation` value.
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds..

View File

@@ -7,7 +7,6 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
chrono.workspace = true
indexmap.workspace = true
jsonwebtoken.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -134,12 +134,9 @@ pub struct CatalogObjects {
pub databases: Vec<Database>,
}
#[derive(Clone, Debug, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize)]
pub struct ComputeCtlConfig {
/// Set of JSON web keys that the compute can use to authenticate
/// communication from the control plane.
pub jwks: JwkSet,
pub tls: Option<TlsConfig>,
}
impl Default for ComputeCtlConfig {
@@ -148,17 +145,10 @@ impl Default for ComputeCtlConfig {
jwks: JwkSet {
keys: Vec::default(),
},
tls: None,
}
}
}
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct TlsConfig {
pub key_path: String,
pub cert_path: String,
}
/// Response of the `/computes/{compute_id}/spec` control-plane API.
#[derive(Deserialize, Debug)]
pub struct ControlPlaneSpecResponse {

View File

@@ -5,15 +5,12 @@
//! and connect it to the storage nodes.
use std::collections::HashMap;
use indexmap::IndexMap;
use regex::Regex;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use crate::responses::TlsConfig;
/// String type alias representing Postgres identifier and
/// intended to be used for DB / role names.
pub type PgIdent = String;
@@ -104,17 +101,6 @@ pub struct ComputeSpec {
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,
/// Safekeeper membership config generation. It is put in
/// neon.safekeepers GUC and serves two purposes:
/// 1) Non zero value forces walproposer to use membership configurations.
/// 2) If walproposer wants to update list of safekeepers to connect to
/// taking them from some safekeeper mconf, it should check what value
/// is newer by comparing the generation.
///
/// Note: it could be SafekeeperGeneration, but this needs linking
/// compute_ctl with postgres_ffi.
#[serde(default)]
pub safekeepers_generation: Option<u32>,
#[serde(default)]
pub safekeeper_connstrings: Vec<String>,
@@ -128,7 +114,7 @@ pub struct ComputeSpec {
// information about available remote extensions
pub remote_extensions: Option<RemoteExtSpec>,
pub pgbouncer_settings: Option<IndexMap<String, String>>,
pub pgbouncer_settings: Option<HashMap<String, String>>,
// Stripe size for pageserver sharding, in pages
#[serde(default)]
@@ -158,16 +144,6 @@ pub struct ComputeSpec {
/// over the same replication content from publisher.
#[serde(default)] // Default false
pub drop_subscriptions_before_start: bool,
/// Log level for audit logging:
///
/// Disabled - no audit logging. This is the default.
/// log - log masked statements to the postgres log using pgaudit extension
/// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
///
/// Extensions should be present in shared_preload_libraries
#[serde(default)]
pub audit_log_level: ComputeAudit,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -275,17 +251,6 @@ pub enum ComputeMode {
Replica,
}
/// Log level for audit logging
/// Disabled, log, hipaa
/// Default is Disabled
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
#[default]
Disabled,
Log,
Hipaa,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
pub struct Cluster {
pub cluster_id: Option<String>,
@@ -360,9 +325,6 @@ pub struct LocalProxySpec {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub jwks: Option<Vec<JwksSettings>>,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub tls: Option<TlsConfig>,
}
#[derive(Clone, Debug, Deserialize, Serialize)]

View File

@@ -6,10 +6,11 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
backtrace.workspace = true
bytes.workspace = true
camino.workspace = true
inferno.workspace = true
fail.workspace = true
futures.workspace = true
flate2.workspace = true
hyper0.workspace = true
itertools.workspace = true
jemalloc_pprof.workspace = true
@@ -17,14 +18,12 @@ once_cell.workspace = true
pprof.workspace = true
regex.workspace = true
routerify.workspace = true
rustls-pemfile.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_path_to_error.workspace = true
thiserror.workspace = true
tracing.workspace = true
tokio.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
url.workspace = true
uuid.workspace = true

View File

@@ -3,6 +3,8 @@ use std::io::Write as _;
use std::str::FromStr;
use std::time::Duration;
use ::pprof::ProfilerGuardBuilder;
use ::pprof::protos::Message as _;
use anyhow::{Context, anyhow};
use bytes::{Bytes, BytesMut};
use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName};
@@ -10,8 +12,7 @@ use hyper::http::HeaderValue;
use hyper::{Body, Method, Request, Response};
use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter};
use once_cell::sync::Lazy;
use pprof::ProfilerGuardBuilder;
use pprof::protos::Message as _;
use regex::Regex;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tokio::sync::{Mutex, Notify, mpsc};
@@ -21,6 +22,7 @@ use tracing::{Instrument, debug, info, info_span, warn};
use utils::auth::{AuthError, Claims, SwappableJwtAuth};
use crate::error::{ApiError, api_error_handler, route_error_handler};
use crate::pprof;
use crate::request::{get_query_param, parse_query_param};
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -399,10 +401,12 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
// Return the report in the requested format.
match format {
Format::Pprof => {
let body = report
let mut body = Vec::new();
report
.pprof()
.map_err(|err| ApiError::InternalServerError(err.into()))?
.encode_to_vec();
.write_to_vec(&mut body)
.map_err(|err| ApiError::InternalServerError(err.into()))?;
Response::builder()
.status(200)
@@ -445,6 +449,20 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
};
// Functions and mappings to strip when symbolizing pprof profiles. If true,
// also remove child frames.
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
vec![
(Regex::new("^__rust").unwrap(), false),
(Regex::new("^_start$").unwrap(), false),
(Regex::new("^irallocx_prof").unwrap(), true),
(Regex::new("^prof_alloc_prep").unwrap(), true),
(Regex::new("^std::rt::lang_start").unwrap(), false),
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
]
});
const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
// Obtain profiler handle.
let mut prof_ctl = jemalloc_pprof::PROF_CTL
.as_ref()
@@ -477,27 +495,45 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
}
Format::Pprof => {
let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof())
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
let data = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
// Symbolize the profile.
// TODO: consider moving this upstream to jemalloc_pprof and avoiding the
// serialization roundtrip.
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
pprof::encode(&profile)
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "application/octet-stream")
.header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb.gz\"")
.header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"")
.body(Body::from(data))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
Format::Svg => {
let svg = tokio::task::spawn_blocking(move || prof_ctl.dump_flamegraph())
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
let body = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
let mut opts = inferno::flamegraph::Options::default();
opts.title = "Heap inuse".to_string();
opts.count_name = "bytes".to_string();
pprof::flamegraph(profile, &mut opts)
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "image/svg+xml")
.body(Body::from(svg))
.body(Body::from(body))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
}

View File

@@ -2,12 +2,11 @@ pub mod endpoint;
pub mod error;
pub mod failpoints;
pub mod json;
pub mod pprof;
pub mod request;
pub mod server;
pub mod tls_certs;
extern crate hyper0 as hyper;
/// Current fast way to apply simple http routing in various Neon binaries.
/// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
pub use routerify::{RequestServiceBuilder, RouterBuilder, RouterService, ext::RequestExt};
pub use routerify::{RouterBuilder, RouterService, ext::RequestExt};

View File

@@ -0,0 +1,238 @@
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::ffi::c_void;
use std::io::Write as _;
use anyhow::bail;
use flate2::Compression;
use flate2::write::{GzDecoder, GzEncoder};
use itertools::Itertools as _;
use pprof::protos::{Function, Line, Location, Message as _, Profile};
use regex::Regex;
/// Decodes a gzip-compressed Protobuf-encoded pprof profile.
pub fn decode(bytes: &[u8]) -> anyhow::Result<Profile> {
let mut gz = GzDecoder::new(Vec::new());
gz.write_all(bytes)?;
Ok(Profile::parse_from_bytes(&gz.finish()?)?)
}
/// Encodes a pprof profile as gzip-compressed Protobuf.
pub fn encode(profile: &Profile) -> anyhow::Result<Vec<u8>> {
let mut gz = GzEncoder::new(Vec::new(), Compression::default());
profile.write_to_writer(&mut gz)?;
Ok(gz.finish()?)
}
/// Symbolizes a pprof profile using the current binary.
pub fn symbolize(mut profile: Profile) -> anyhow::Result<Profile> {
if !profile.function.is_empty() {
return Ok(profile); // already symbolized
}
// Collect function names.
let mut functions: HashMap<String, Function> = HashMap::new();
let mut strings: HashMap<String, i64> = profile
.string_table
.into_iter()
.enumerate()
.map(|(i, s)| (s, i as i64))
.collect();
// Helper to look up or register a string.
let mut string_id = |s: &str| -> i64 {
// Don't use .entry() to avoid unnecessary allocations.
if let Some(id) = strings.get(s) {
return *id;
}
let id = strings.len() as i64;
strings.insert(s.to_string(), id);
id
};
for loc in &mut profile.location {
if !loc.line.is_empty() {
continue;
}
// Resolve the line and function for each location.
backtrace::resolve(loc.address as *mut c_void, |symbol| {
let Some(symbol_name) = symbol.name() else {
return;
};
let function_name = format!("{symbol_name:#}");
let functions_len = functions.len();
let function_id = functions
.entry(function_name)
.or_insert_with_key(|function_name| {
let function_id = functions_len as u64 + 1;
let system_name = String::from_utf8_lossy(symbol_name.as_bytes());
let filename = symbol
.filename()
.map(|path| path.to_string_lossy())
.unwrap_or(Cow::Borrowed(""));
Function {
id: function_id,
name: string_id(function_name),
system_name: string_id(&system_name),
filename: string_id(&filename),
..Default::default()
}
})
.id;
loc.line.push(Line {
function_id,
line: symbol.lineno().unwrap_or(0) as i64,
..Default::default()
});
});
}
// Store the resolved functions, and mark the mapping as resolved.
profile.function = functions.into_values().sorted_by_key(|f| f.id).collect();
profile.string_table = strings
.into_iter()
.sorted_by_key(|(_, i)| *i)
.map(|(s, _)| s)
.collect();
for mapping in &mut profile.mapping {
mapping.has_functions = true;
mapping.has_filenames = true;
}
Ok(profile)
}
/// Strips locations (stack frames) matching the given mappings (substring) or function names
/// (regex). The function bool specifies whether child frames should be stripped as well.
///
/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all
/// string references.
pub fn strip_locations(
mut profile: Profile,
mappings: &[&str],
functions: &[(Regex, bool)],
) -> Profile {
// Strip mappings.
let mut strip_mappings: HashSet<u64> = HashSet::new();
profile.mapping.retain(|mapping| {
let Some(name) = profile.string_table.get(mapping.filename as usize) else {
return true;
};
if mappings.iter().any(|substr| name.contains(substr)) {
strip_mappings.insert(mapping.id);
return false;
}
true
});
// Strip functions.
let mut strip_functions: HashMap<u64, bool> = HashMap::new();
profile.function.retain(|function| {
let Some(name) = profile.string_table.get(function.name as usize) else {
return true;
};
for (regex, strip_children) in functions {
if regex.is_match(name) {
strip_functions.insert(function.id, *strip_children);
return false;
}
}
true
});
// Strip locations. The bool specifies whether child frames should be stripped too.
let mut strip_locations: HashMap<u64, bool> = HashMap::new();
profile.location.retain(|location| {
for line in &location.line {
if let Some(strip_children) = strip_functions.get(&line.function_id) {
strip_locations.insert(location.id, *strip_children);
return false;
}
}
if strip_mappings.contains(&location.mapping_id) {
strip_locations.insert(location.id, false);
return false;
}
true
});
// Strip sample locations.
for sample in &mut profile.sample {
// First, find the uppermost function with child removal and truncate the stack.
if let Some(truncate) = sample
.location_id
.iter()
.rposition(|id| strip_locations.get(id) == Some(&true))
{
sample.location_id.drain(..=truncate);
}
// Next, strip any individual frames without child removal.
sample
.location_id
.retain(|id| !strip_locations.contains_key(id));
}
profile
}
/// Generates an SVG flamegraph from a symbolized pprof profile.
pub fn flamegraph(
profile: Profile,
opts: &mut inferno::flamegraph::Options,
) -> anyhow::Result<Vec<u8>> {
if profile.mapping.iter().any(|m| !m.has_functions) {
bail!("profile not symbolized");
}
// Index locations, functions, and strings.
let locations: HashMap<u64, Location> =
profile.location.into_iter().map(|l| (l.id, l)).collect();
let functions: HashMap<u64, Function> =
profile.function.into_iter().map(|f| (f.id, f)).collect();
let strings = profile.string_table;
// Resolve stacks as function names, and sum sample values per stack. Also reverse the stack,
// since inferno expects it bottom-up.
let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
for sample in profile.sample {
let mut stack = Vec::with_capacity(sample.location_id.len());
for location in sample.location_id.into_iter().rev() {
let Some(location) = locations.get(&location) else {
bail!("missing location {location}");
};
for line in location.line.iter().rev() {
let Some(function) = functions.get(&line.function_id) else {
bail!("missing function {}", line.function_id);
};
let Some(name) = strings.get(function.name as usize) else {
bail!("missing string {}", function.name);
};
stack.push(name.as_str());
}
}
let Some(&value) = sample.value.first() else {
bail!("missing value");
};
*stacks.entry(stack).or_default() += value;
}
// Construct stack lines for inferno.
let lines = stacks
.into_iter()
.map(|(stack, value)| (stack.into_iter().join(";"), value))
.map(|(stack, value)| format!("{stack} {value}"))
.sorted()
.collect_vec();
// Construct the flamegraph.
let mut bytes = Vec::new();
let lines = lines.iter().map(|line| line.as_str());
inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
Ok(bytes)
}

View File

@@ -1,155 +0,0 @@
use std::{error::Error, sync::Arc};
use futures::StreamExt;
use futures::stream::FuturesUnordered;
use hyper0::Body;
use hyper0::server::conn::Http;
use routerify::{RequestService, RequestServiceBuilder};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
use tokio_util::sync::CancellationToken;
use tracing::{error, info};
use crate::error::ApiError;
/// A simple HTTP server over hyper library.
/// You may want to use it instead of [`hyper0::server::Server`] because:
/// 1. hyper0's Server was removed from hyper v1.
/// It's recommended to replace hyepr0's Server with a manual loop, which is done here.
/// 2. hyper0's Server doesn't support TLS out of the box, and there is no way
/// to support it efficiently with the Accept trait that hyper0's Server uses.
/// That's one of the reasons why it was removed from v1.
/// <https://github.com/hyperium/hyper/blob/115339d3df50f20c8717680aa35f48858e9a6205/docs/ROADMAP.md#higher-level-client-and-server-problems>
pub struct Server {
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
listener: tokio::net::TcpListener,
tls_acceptor: Option<TlsAcceptor>,
}
impl Server {
pub fn new(
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
listener: std::net::TcpListener,
tls_acceptor: Option<TlsAcceptor>,
) -> anyhow::Result<Self> {
// Note: caller of from_std is responsible for setting nonblocking mode.
listener.set_nonblocking(true)?;
let listener = tokio::net::TcpListener::from_std(listener)?;
Ok(Self {
request_service,
listener,
tls_acceptor,
})
}
pub async fn serve(self, cancel: CancellationToken) -> anyhow::Result<()> {
fn suppress_io_error(err: &std::io::Error) -> bool {
use std::io::ErrorKind::*;
matches!(err.kind(), ConnectionReset | ConnectionAborted | BrokenPipe)
}
fn suppress_hyper_error(err: &hyper0::Error) -> bool {
if err.is_incomplete_message() || err.is_closed() || err.is_timeout() {
return true;
}
if let Some(inner) = err.source() {
if let Some(io) = inner.downcast_ref::<std::io::Error>() {
return suppress_io_error(io);
}
}
false
}
let mut connections = FuturesUnordered::new();
loop {
tokio::select! {
stream = self.listener.accept() => {
let (tcp_stream, remote_addr) = match stream {
Ok(stream) => stream,
Err(err) => {
if !suppress_io_error(&err) {
info!("Failed to accept TCP connection: {err:#}");
}
continue;
}
};
let service = self.request_service.build(remote_addr);
let tls_acceptor = self.tls_acceptor.clone();
let cancel = cancel.clone();
connections.push(tokio::spawn(
async move {
match tls_acceptor {
Some(tls_acceptor) => {
// Handle HTTPS connection.
let tls_stream = tokio::select! {
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
_ = cancel.cancelled() => return,
};
let tls_stream = match tls_stream {
Ok(tls_stream) => tls_stream,
Err(err) => {
if !suppress_io_error(&err) {
info!("Failed to accept TLS connection: {err:#}");
}
return;
}
};
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
if !suppress_hyper_error(&err) {
info!("Failed to serve HTTPS connection: {err:#}");
}
}
}
None => {
// Handle HTTP connection.
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
if !suppress_hyper_error(&err) {
info!("Failed to serve HTTP connection: {err:#}");
}
}
}
};
}));
}
Some(conn) = connections.next() => {
if let Err(err) = conn {
error!("Connection panicked: {err:#}");
}
}
_ = cancel.cancelled() => {
// Wait for graceful shutdown of all connections.
while let Some(conn) = connections.next().await {
if let Err(err) = conn {
error!("Connection panicked: {err:#}");
}
}
break;
}
}
}
Ok(())
}
/// Serves HTTP connection with graceful shutdown.
async fn serve_connection<I>(
io: I,
service: RequestService<Body, ApiError>,
cancel: CancellationToken,
) -> Result<(), hyper0::Error>
where
I: AsyncRead + AsyncWrite + Unpin + Send + 'static,
{
let mut conn = Http::new().serve_connection(io, service).with_upgrades();
tokio::select! {
res = &mut conn => res,
_ = cancel.cancelled() => {
Pin::new(&mut conn).graceful_shutdown();
// Note: connection should still be awaited for graceful shutdown to complete.
conn.await
}
}
}
}

View File

@@ -1,21 +0,0 @@
use camino::Utf8Path;
use tokio_rustls::rustls::pki_types::{CertificateDer, PrivateKeyDer};
pub fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
let file = std::fs::File::open(filename)?;
let mut reader = std::io::BufReader::new(file);
Ok(rustls_pemfile::certs(&mut reader).collect::<Result<Vec<_>, _>>()?)
}
pub fn load_private_key(filename: &Utf8Path) -> anyhow::Result<PrivateKeyDer<'static>> {
let file = std::fs::File::open(filename)?;
let mut reader = std::io::BufReader::new(file);
let key = rustls_pemfile::private_key(&mut reader)?;
key.ok_or(anyhow::anyhow!(
"no private key found in {}",
filename.as_str(),
))
}

View File

@@ -35,7 +35,6 @@ pub struct NodeMetadata {
pub postgres_port: u16,
pub http_host: String,
pub http_port: u16,
pub https_port: Option<u16>,
// Deployment tools may write fields to the metadata file beyond what we
// use in this type: this type intentionally only names fields that require.
@@ -58,9 +57,6 @@ pub struct ConfigToml {
// types mapped 1:1 into the runtime PageServerConfig type
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub ssl_key_file: Utf8PathBuf,
pub ssl_cert_file: Utf8PathBuf,
pub availability_zone: Option<String>,
#[serde(with = "humantime_serde")]
pub wait_lsn_timeout: Duration,
@@ -127,10 +123,6 @@ pub struct ConfigToml {
pub enable_read_path_debugging: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub validate_wal_contiguity: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub load_previous_heatmap: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub generate_unarchival_heatmap: Option<bool>,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -272,16 +264,15 @@ pub struct TenantConfigToml {
/// size exceeds `compaction_upper_limit * checkpoint_distance`.
pub compaction_upper_limit: usize,
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
/// If true, compact down L0 across all tenant timelines before doing regular compaction. L0
/// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true.
/// If true, compact down L0 across all tenant timelines before doing regular compaction.
pub compaction_l0_first: bool,
/// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only
/// has an effect if `compaction_l0_first` is true. Defaults to true.
/// has an effect if `compaction_l0_first` is `true`.
pub compaction_l0_semaphore: bool,
/// Level0 delta layer threshold at which to delay layer flushes such that they take 2x as long,
/// and block on layer flushes during ephemeral layer rolls, for compaction backpressure. This
/// helps compaction keep up with WAL ingestion, and avoids read amplification blowing up.
/// Should be >compaction_threshold. 0 to disable. Defaults to 3x compaction_threshold.
/// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure,
/// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer
/// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification
/// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default.
pub l0_flush_delay_threshold: Option<usize>,
/// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold
/// to avoid deadlock. 0 to disable. Disabled by default.
@@ -289,8 +280,6 @@ pub struct TenantConfigToml {
/// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next
/// layer. This is a temporary backpressure mechanism which should be removed once
/// l0_flush_{delay,stall}_threshold is fully enabled.
///
/// TODO: this is no longer enabled, remove it when the config option is no longer set.
pub l0_flush_wait_upload: bool,
// Determines how much history is retained, to allow
// branching and read replicas at an older point in time.
@@ -428,9 +417,6 @@ pub mod defaults {
pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
utils::postgres_client::PostgresClientProtocol::Vanilla;
pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
}
impl Default for ConfigToml {
@@ -440,9 +426,6 @@ impl Default for ConfigToml {
Self {
listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
listen_https_addr: (None),
ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE),
ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE),
availability_zone: (None),
wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
.expect("cannot parse default wait lsn timeout")),
@@ -540,8 +523,6 @@ impl Default for ConfigToml {
None
},
validate_wal_contiguity: None,
load_previous_heatmap: None,
generate_unarchival_heatmap: None,
}
}
}
@@ -570,15 +551,13 @@ pub mod tenant_conf_defaults {
// be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So
// with this config, we can get a maximum peak compaction usage of 9 GB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20;
// Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid
// read amp.
pub const DEFAULT_COMPACTION_L0_FIRST: bool = true;
pub const DEFAULT_COMPACTION_L0_FIRST: bool = false;
pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true;
pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
crate::models::CompactionAlgorithm::Legacy;
pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false;
pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true;
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
@@ -589,8 +568,9 @@ pub mod tenant_conf_defaults {
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
// If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
// layer creation will end immediately. Set to 0 to disable.
pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3;
// layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we
// want to enable this feature.
pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0;
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";

View File

@@ -16,30 +16,6 @@ fn test_node_metadata_v1_backward_compatibilty() {
postgres_port: 23,
http_host: "localhost".to_string(),
http_port: 42,
https_port: None,
other: HashMap::new(),
}
)
}
#[test]
fn test_node_metadata_v2_backward_compatibilty() {
let v2 = serde_json::to_vec(&serde_json::json!({
"host": "localhost",
"port": 23,
"http_host": "localhost",
"http_port": 42,
"https_port": 123,
}));
assert_eq!(
serde_json::from_slice::<NodeMetadata>(&v2.unwrap()).unwrap(),
NodeMetadata {
postgres_host: "localhost".to_string(),
postgres_port: 23,
http_host: "localhost".to_string(),
http_port: 42,
https_port: Some(123),
other: HashMap::new(),
}
)

View File

@@ -182,66 +182,20 @@ pub struct TenantDescribeResponseShard {
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
/// Optionally, callers may specify the node they are migrating _from_, and the server will
/// reject the request if the shard is no longer attached there: this enables writing safer
/// clients that don't risk fighting with some other movement of the shard.
#[serde(default)]
pub origin_node_id: Option<NodeId>,
#[serde(default)]
pub migration_config: MigrationConfig,
pub migration_config: Option<MigrationConfig>,
}
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Serialize, Deserialize, Debug)]
pub struct MigrationConfig {
/// If true, the migration will be executed even if it is to a location with a sub-optimal scheduling
/// score: this is usually not what you want, and if you use this then you'll also need to set the
/// tenant's scheduling policy to Essential or Pause to avoid the optimiser reverting your migration.
///
/// Default: false
#[serde(default)]
pub override_scheduler: bool,
/// If true, the migration will be done gracefully by creating a secondary location first and
/// waiting for it to warm up before cutting over. If false, if there is no existing secondary
/// location at the destination, the tenant will be migrated immediately. If the tenant's data
/// can't be downloaded within [`Self::secondary_warmup_timeout`], then the migration will go
/// ahead but run with a cold cache that can severely reduce performance until it warms up.
///
/// When doing a graceful migration, the migration API returns as soon as it is started.
///
/// Default: true
#[serde(default = "default_prewarm")]
pub prewarm: bool,
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
/// overall for secondary warmup before cutting over
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_warmup_timeout: Option<Duration>,
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
/// within each secondary download poll call to pageserver.
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_download_request_timeout: Option<Duration>,
}
fn default_prewarm() -> bool {
true
}
impl Default for MigrationConfig {
fn default() -> Self {
Self {
override_scheduler: false,
prewarm: default_prewarm(),
secondary_warmup_timeout: None,
secondary_download_request_timeout: None,
}
}
}
#[derive(Serialize, Clone, Debug)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
@@ -489,7 +443,6 @@ pub struct SafekeeperDescribeResponse {
pub host: String,
pub port: i32,
pub http_port: i32,
pub https_port: Option<i32>,
pub availability_zone_id: String,
pub scheduling_policy: SkSchedulingPolicy,
}
@@ -534,43 +487,4 @@ mod test {
err
);
}
/// Check that a minimal migrate request with no config results in the expected default settings
#[test]
fn test_migrate_request_decode_defaults() {
let json = r#"{
"node_id": 123
}"#;
let request: TenantShardMigrateRequest = serde_json::from_str(json).unwrap();
assert_eq!(request.node_id, NodeId(123));
assert_eq!(request.origin_node_id, None);
assert!(!request.migration_config.override_scheduler);
assert!(request.migration_config.prewarm);
assert_eq!(request.migration_config.secondary_warmup_timeout, None);
assert_eq!(
request.migration_config.secondary_download_request_timeout,
None
);
}
/// Check that a partially specified migration config results in the expected default settings
#[test]
fn test_migration_config_decode_defaults() {
// Specify just one field of the config
let json = r#"{
}"#;
let config: MigrationConfig = serde_json::from_str(json).unwrap();
// Check each field's expected default value
assert!(!config.override_scheduler);
assert!(config.prewarm);
assert_eq!(config.secondary_warmup_timeout, None);
assert_eq!(config.secondary_download_request_timeout, None);
assert_eq!(config.secondary_warmup_timeout, None);
// Consistency check that the Default impl agrees with our serde defaults
assert_eq!(MigrationConfig::default(), config);
}
}

View File

@@ -176,39 +176,6 @@ impl LsnLease {
}
}
/// Controls the detach ancestor behavior.
/// - When set to `NoAncestorAndReparent`, we will only detach a branch if its ancestor is a root branch. It will automatically reparent any children of the ancestor before and at the branch point.
/// - When set to `MultiLevelAndNoReparent`, we will detach a branch from multiple levels of ancestors, and no reparenting will happen at all.
#[derive(Debug, Clone, Copy, Default)]
pub enum DetachBehavior {
#[default]
NoAncestorAndReparent,
MultiLevelAndNoReparent,
}
impl std::str::FromStr for DetachBehavior {
type Err = &'static str;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"no_ancestor_and_reparent" => Ok(DetachBehavior::NoAncestorAndReparent),
"multi_level_and_no_reparent" => Ok(DetachBehavior::MultiLevelAndNoReparent),
"v1" => Ok(DetachBehavior::NoAncestorAndReparent),
"v2" => Ok(DetachBehavior::MultiLevelAndNoReparent),
_ => Err("cannot parse detach behavior"),
}
}
}
impl std::fmt::Display for DetachBehavior {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
DetachBehavior::NoAncestorAndReparent => write!(f, "no_ancestor_and_reparent"),
DetachBehavior::MultiLevelAndNoReparent => write!(f, "multi_level_and_no_reparent"),
}
}
}
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
///
/// XXX: We used to have more variants here, but now it's just one, which makes this rather
@@ -307,31 +274,6 @@ pub struct TimelineCreateRequest {
pub mode: TimelineCreateRequestMode,
}
/// Storage controller specific extensions to [`TimelineInfo`].
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateResponseStorcon {
#[serde(flatten)]
pub timeline_info: TimelineInfo,
pub safekeepers: Option<SafekeepersInfo>,
}
/// Safekeepers as returned in timeline creation request to storcon or pushed to
/// cplane in the post migration hook.
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeepersInfo {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub generation: u32,
pub safekeepers: Vec<SafekeeperInfo>,
}
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperInfo {
pub id: NodeId,
pub hostname: String,
}
#[derive(Serialize, Deserialize, Clone)]
#[serde(untagged)]
pub enum TimelineCreateRequestMode {
@@ -1204,15 +1146,6 @@ pub struct TimelineArchivalConfigRequest {
pub state: TimelineArchivalState,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct TimelinePatchIndexPartRequest {
pub rel_size_migration: Option<RelSizeMigration>,
pub gc_compaction_last_completed_lsn: Option<Lsn>,
pub applied_gc_cutoff_lsn: Option<Lsn>,
#[serde(default)]
pub force_index_update: bool,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelinesInfoAndOffloaded {
pub timelines: Vec<TimelineInfo>,
@@ -1232,21 +1165,6 @@ pub struct OffloadedTimelineInfo {
pub archived_at: chrono::DateTime<chrono::Utc>,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum RelSizeMigration {
/// The tenant is using the old rel_size format.
/// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
Legacy,
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
/// persisted in the index part. The read path will read both formats and merge them.
Migrating,
/// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
/// in the index part, and the read path will not read the old format.
Migrated,
}
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
@@ -1258,10 +1176,9 @@ pub struct TimelineInfo {
pub last_record_lsn: Lsn,
pub prev_record_lsn: Option<Lsn>,
/// Legacy field, retained for one version to enable old storage controller to
/// decode (it was a mandatory field).
#[serde(default, rename = "latest_gc_cutoff_lsn")]
pub _unused: Lsn,
/// Legacy field for compat with control plane. Synonym of `min_readable_lsn`.
/// TODO: remove once control plane no longer reads it.
pub latest_gc_cutoff_lsn: Lsn,
/// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
/// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
@@ -1326,11 +1243,7 @@ pub struct TimelineInfo {
// Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
// not deny unknown fields by default so it's safe to set the field to some value, though it won't be
// read.
/// Whether the timeline is archived.
pub is_archived: Option<bool>,
/// The status of the rel_size migration.
pub rel_size_migration: Option<RelSizeMigration>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -1510,14 +1423,8 @@ pub struct TenantScanRemoteStorageResponse {
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "snake_case")]
pub enum TenantSorting {
/// Total size of layers on local disk for all timelines in a shard.
ResidentSize,
/// The logical size of the largest timeline within a _tenant_ (not shard). Only tracked on
/// shard 0, contains the sum across all shards.
MaxLogicalSize,
/// The logical size of the largest timeline within a _tenant_ (not shard), divided by number of
/// shards. Only tracked on shard 0, and estimates the per-shard logical size.
MaxLogicalSizePerShard,
}
impl Default for TenantSorting {
@@ -1547,20 +1454,14 @@ pub struct TopTenantShardsRequest {
pub struct TopTenantShardItem {
pub id: TenantShardId,
/// Total size of layers on local disk for all timelines in this shard.
/// Total size of layers on local disk for all timelines in this tenant
pub resident_size: u64,
/// Total size of layers in remote storage for all timelines in this shard.
/// Total size of layers in remote storage for all timelines in this tenant
pub physical_size: u64,
/// The largest logical size of a timeline within this _tenant_ (not shard). This is only
/// tracked on shard 0, and contains the sum of the logical size across all shards.
/// The largest logical size of a timeline within this tenant
pub max_logical_size: u64,
/// The largest logical size of a timeline within this _tenant_ (not shard) divided by number of
/// shards. This is only tracked on shard 0, and is only an estimate as we divide it evenly by
/// shard count, rounded up.
pub max_logical_size_per_shard: u64,
}
#[derive(Serialize, Deserialize, Debug, Default)]

View File

@@ -112,16 +112,6 @@ impl ShardIdentity {
}
}
/// An unsharded identity with the given stripe size (if non-zero). This is typically used to
/// carry over a stripe size for an unsharded tenant from persistent storage.
pub fn unsharded_with_stripe_size(stripe_size: ShardStripeSize) -> Self {
let mut shard_identity = Self::unsharded();
if stripe_size.0 > 0 {
shard_identity.stripe_size = stripe_size;
}
shard_identity
}
/// A broken instance of this type is only used for `TenantState::Broken` tenants,
/// which are constructed in code paths that don't have access to proper configuration.
///

View File

@@ -396,14 +396,6 @@ pub mod waldecoder {
self.lsn + self.inputbuf.remaining() as u64
}
/// Returns the LSN up to which the WAL decoder has processed.
///
/// If [`Self::poll_decode`] returned a record, then this will return
/// the end LSN of said record.
pub fn lsn(&self) -> Lsn {
self.lsn
}
pub fn feed_bytes(&mut self, buf: &[u8]) {
self.inputbuf.extend_from_slice(buf);
}

View File

@@ -135,8 +135,8 @@ impl Type {
pub enum Kind {
/// A simple type like `VARCHAR` or `INTEGER`.
Simple,
/// An enumerated type.
Enum,
/// An enumerated type along with its variants.
Enum(Vec<String>),
/// A pseudo-type.
Pseudo,
/// An array type along with the type of its elements.
@@ -146,9 +146,9 @@ pub enum Kind {
/// A multirange type along with the type of its elements.
Multirange(Type),
/// A domain type along with its underlying type.
Domain(Oid),
/// A composite type.
Composite(Oid),
Domain(Type),
/// A composite type along with information about its fields.
Composite(Vec<Field>),
}
/// Information about a field of a composite type.

View File

@@ -34,13 +34,8 @@ where
.make_tls_connect(hostname)
.map_err(|e| Error::tls(e.into()))?;
let socket = connect_socket::connect_socket(
config.host_addr,
&config.host,
config.port,
config.connect_timeout,
)
.await?;
let socket =
connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?;
cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await
}

View File

@@ -1,6 +1,5 @@
use std::collections::HashMap;
use std::fmt;
use std::net::IpAddr;
use std::sync::Arc;
use std::task::{Context, Poll};
use std::time::Duration;
@@ -19,10 +18,10 @@ use crate::config::{Host, SslMode};
use crate::connection::{Request, RequestMessages};
use crate::query::RowStream;
use crate::simple_query::SimpleQueryStream;
use crate::types::{Oid, Type};
use crate::types::{Oid, ToSql, Type};
use crate::{
CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Statement, Transaction,
TransactionBuilder, query, simple_query,
CancelToken, Error, ReadyForQueryStatus, Row, SimpleQueryMessage, Statement, Transaction,
TransactionBuilder, query, simple_query, slice_iter,
};
pub struct Responses {
@@ -54,18 +53,26 @@ impl Responses {
/// A cache of type info and prepared statements for fetching type info
/// (corresponding to the queries in the [crate::prepare] module).
#[derive(Default)]
pub(crate) struct CachedTypeInfo {
struct CachedTypeInfo {
/// A statement for basic information for a type from its
/// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its
/// fallback).
pub(crate) typeinfo: Option<Statement>,
typeinfo: Option<Statement>,
/// A statement for getting information for a composite type from its OID.
/// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY).
typeinfo_composite: Option<Statement>,
/// A statement for getting information for a composite type from its OID.
/// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or
/// its fallback).
typeinfo_enum: Option<Statement>,
/// Cache of types already looked up.
pub(crate) types: HashMap<Oid, Type>,
types: HashMap<Oid, Type>,
}
pub struct InnerClient {
sender: mpsc::UnboundedSender<Request>,
cached_typeinfo: Mutex<CachedTypeInfo>,
/// A buffer to use when writing out postgres commands.
buffer: Mutex<BytesMut>,
@@ -83,6 +90,38 @@ impl InnerClient {
})
}
pub fn typeinfo(&self) -> Option<Statement> {
self.cached_typeinfo.lock().typeinfo.clone()
}
pub fn set_typeinfo(&self, statement: &Statement) {
self.cached_typeinfo.lock().typeinfo = Some(statement.clone());
}
pub fn typeinfo_composite(&self) -> Option<Statement> {
self.cached_typeinfo.lock().typeinfo_composite.clone()
}
pub fn set_typeinfo_composite(&self, statement: &Statement) {
self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone());
}
pub fn typeinfo_enum(&self) -> Option<Statement> {
self.cached_typeinfo.lock().typeinfo_enum.clone()
}
pub fn set_typeinfo_enum(&self, statement: &Statement) {
self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone());
}
pub fn type_(&self, oid: Oid) -> Option<Type> {
self.cached_typeinfo.lock().types.get(&oid).cloned()
}
pub fn set_type(&self, oid: Oid, type_: &Type) {
self.cached_typeinfo.lock().types.insert(oid, type_.clone());
}
/// Call the given function with a buffer to be used when writing out
/// postgres commands.
pub fn with_buf<F, R>(&self, f: F) -> R
@@ -98,10 +137,10 @@ impl InnerClient {
#[derive(Clone, Serialize, Deserialize)]
pub struct SocketConfig {
pub host_addr: Option<IpAddr>,
pub host: Host,
pub port: u16,
pub connect_timeout: Option<Duration>,
// pub keepalive: Option<KeepaliveConfig>,
}
/// An asynchronous PostgreSQL client.
@@ -110,7 +149,6 @@ pub struct SocketConfig {
/// through this client object.
pub struct Client {
inner: Arc<InnerClient>,
cached_typeinfo: CachedTypeInfo,
socket_config: SocketConfig,
ssl_mode: SslMode,
@@ -129,9 +167,9 @@ impl Client {
Client {
inner: Arc::new(InnerClient {
sender,
cached_typeinfo: Default::default(),
buffer: Default::default(),
}),
cached_typeinfo: Default::default(),
socket_config,
ssl_mode,
@@ -149,6 +187,55 @@ impl Client {
&self.inner
}
/// Executes a statement, returning a vector of the resulting rows.
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
/// provided, 1-indexed.
///
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
/// with the `prepare` method.
///
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
pub async fn query(
&self,
statement: Statement,
params: &[&(dyn ToSql + Sync)],
) -> Result<Vec<Row>, Error> {
self.query_raw(statement, slice_iter(params))
.await?
.try_collect()
.await
}
/// The maximally flexible version of [`query`].
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
/// provided, 1-indexed.
///
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
/// with the `prepare` method.
///
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
///
/// [`query`]: #method.query
pub async fn query_raw<'a, I>(
&self,
statement: Statement,
params: I,
) -> Result<RowStream, Error>
where
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
I::IntoIter: ExactSizeIterator,
{
query::query(&self.inner, statement, params).await
}
/// Pass text directly to the Postgres backend to allow it to sort out typing itself and
/// to save a roundtrip
pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
@@ -195,14 +282,6 @@ impl Client {
simple_query::batch_execute(self.inner(), query).await
}
pub async fn discard_all(&mut self) -> Result<ReadyForQueryStatus, Error> {
// clear the prepared statements that are about to be nuked from the postgres session
self.cached_typeinfo.typeinfo = None;
self.batch_execute("discard all").await
}
/// Begins a new database transaction.
///
/// The transaction will roll back by default - use the `commit` method to commit it.
@@ -266,8 +345,8 @@ impl Client {
}
/// Query for type information
pub(crate) async fn get_type_inner(&mut self, oid: Oid) -> Result<Type, Error> {
crate::prepare::get_type(&self.inner, &mut self.cached_typeinfo, oid).await
pub async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
crate::prepare::get_type(&self.inner, oid).await
}
/// Determines if the connection to the server has already closed.

View File

@@ -1,6 +1,5 @@
//! Connection configuration.
use std::net::IpAddr;
use std::time::Duration;
use std::{fmt, str};
@@ -66,7 +65,6 @@ pub enum AuthKeys {
/// Connection configuration.
#[derive(Clone, PartialEq, Eq)]
pub struct Config {
pub(crate) host_addr: Option<IpAddr>,
pub(crate) host: Host,
pub(crate) port: u16,
@@ -85,7 +83,6 @@ impl Config {
/// Creates a new configuration.
pub fn new(host: String, port: u16) -> Config {
Config {
host_addr: None,
host: Host::Tcp(host),
port,
password: None,
@@ -166,15 +163,6 @@ impl Config {
self
}
pub fn set_host_addr(&mut self, addr: IpAddr) -> &mut Config {
self.host_addr = Some(addr);
self
}
pub fn get_host_addr(&self) -> Option<IpAddr> {
self.host_addr
}
/// Sets the SSL configuration.
///
/// Defaults to `prefer`.

View File

@@ -1,5 +1,3 @@
use std::net::IpAddr;
use postgres_protocol2::message::backend::Message;
use tokio::net::TcpStream;
use tokio::sync::mpsc;
@@ -27,14 +25,13 @@ where
.make_tls_connect(hostname)
.map_err(|e| Error::tls(e.into()))?;
match connect_once(config.host_addr, &config.host, config.port, tls, config).await {
match connect_once(&config.host, config.port, tls, config).await {
Ok((client, connection)) => Ok((client, connection)),
Err(e) => Err(e),
}
}
async fn connect_once<T>(
host_addr: Option<IpAddr>,
host: &Host,
port: u16,
tls: T,
@@ -43,7 +40,7 @@ async fn connect_once<T>(
where
T: TlsConnect<TcpStream>,
{
let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
let socket = connect_socket(host, port, config.connect_timeout).await?;
let RawConnection {
stream,
parameters,
@@ -53,7 +50,6 @@ where
} = connect_raw(socket, tls, config).await?;
let socket_config = SocketConfig {
host_addr,
host: host.clone(),
port,
connect_timeout: config.connect_timeout,

View File

@@ -1,6 +1,5 @@
use std::future::Future;
use std::io;
use std::net::{IpAddr, SocketAddr};
use std::time::Duration;
use tokio::net::{self, TcpStream};
@@ -10,20 +9,15 @@ use crate::Error;
use crate::config::Host;
pub(crate) async fn connect_socket(
host_addr: Option<IpAddr>,
host: &Host,
port: u16,
connect_timeout: Option<Duration>,
) -> Result<TcpStream, Error> {
match host {
Host::Tcp(host) => {
let addrs = match host_addr {
Some(addr) => vec![SocketAddr::new(addr, port)],
None => net::lookup_host((&**host, port))
.await
.map_err(Error::connect)?
.collect(),
};
let addrs = net::lookup_host((&**host, port))
.await
.map_err(Error::connect)?;
let mut last_err = None;

View File

@@ -22,7 +22,7 @@ pub trait GenericClient: private::Sealed {
I::IntoIter: ExactSizeIterator + Sync + Send;
/// Query for type information
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error>;
async fn get_type(&self, oid: Oid) -> Result<Type, Error>;
}
impl private::Sealed for Client {}
@@ -38,8 +38,8 @@ impl GenericClient for Client {
}
/// Query for type information
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
self.get_type_inner(oid).await
async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
crate::prepare::get_type(self.inner(), oid).await
}
}
@@ -56,7 +56,7 @@ impl GenericClient for Transaction<'_> {
}
/// Query for type information
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
self.client_mut().get_type(oid).await
async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
self.client().get_type(oid).await
}
}

View File

@@ -9,10 +9,10 @@ use log::debug;
use postgres_protocol2::message::backend::Message;
use postgres_protocol2::message::frontend;
use crate::client::{CachedTypeInfo, InnerClient};
use crate::client::InnerClient;
use crate::codec::FrontendMessage;
use crate::connection::RequestMessages;
use crate::types::{Kind, Oid, Type};
use crate::types::{Field, Kind, Oid, Type};
use crate::{Column, Error, Statement, query, slice_iter};
pub(crate) const TYPEINFO_QUERY: &str = "\
@@ -23,7 +23,23 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
WHERE t.oid = $1
";
async fn prepare_typecheck(
const TYPEINFO_ENUM_QUERY: &str = "\
SELECT enumlabel
FROM pg_catalog.pg_enum
WHERE enumtypid = $1
ORDER BY enumsortorder
";
pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\
SELECT attname, atttypid
FROM pg_catalog.pg_attribute
WHERE attrelid = $1
AND NOT attisdropped
AND attnum > 0
ORDER BY attnum
";
pub async fn prepare(
client: &Arc<InnerClient>,
name: &'static str,
query: &str,
@@ -51,7 +67,7 @@ async fn prepare_typecheck(
let mut parameters = vec![];
let mut it = parameter_description.parameters();
while let Some(oid) = it.next().map_err(Error::parse)? {
let type_ = Type::from_oid(oid).ok_or_else(Error::unexpected_message)?;
let type_ = get_type(client, oid).await?;
parameters.push(type_);
}
@@ -59,7 +75,7 @@ async fn prepare_typecheck(
if let Some(row_description) = row_description {
let mut it = row_description.fields();
while let Some(field) = it.next().map_err(Error::parse)? {
let type_ = Type::from_oid(field.type_oid()).ok_or_else(Error::unexpected_message)?;
let type_ = get_type(client, field.type_oid()).await?;
let column = Column::new(field.name().to_string(), type_, field);
columns.push(column);
}
@@ -68,6 +84,15 @@ async fn prepare_typecheck(
Ok(Statement::new(client, name, parameters, columns))
}
fn prepare_rec<'a>(
client: &'a Arc<InnerClient>,
name: &'static str,
query: &'a str,
types: &'a [Type],
) -> Pin<Box<dyn Future<Output = Result<Statement, Error>> + 'a + Send>> {
Box::pin(prepare(client, name, query, types))
}
fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
if types.is_empty() {
debug!("preparing query {}: {}", name, query);
@@ -83,20 +108,16 @@ fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Resu
})
}
pub async fn get_type(
client: &Arc<InnerClient>,
typecache: &mut CachedTypeInfo,
oid: Oid,
) -> Result<Type, Error> {
pub async fn get_type(client: &Arc<InnerClient>, oid: Oid) -> Result<Type, Error> {
if let Some(type_) = Type::from_oid(oid) {
return Ok(type_);
}
if let Some(type_) = typecache.types.get(&oid) {
return Ok(type_.clone());
};
if let Some(type_) = client.type_(oid) {
return Ok(type_);
}
let stmt = typeinfo_statement(client, typecache).await?;
let stmt = typeinfo_statement(client).await?;
let rows = query::query(client, stmt, slice_iter(&[&oid])).await?;
pin_mut!(rows);
@@ -115,48 +136,100 @@ pub async fn get_type(
let relid: Oid = row.try_get(6)?;
let kind = if type_ == b'e' as i8 {
Kind::Enum
let variants = get_enum_variants(client, oid).await?;
Kind::Enum(variants)
} else if type_ == b'p' as i8 {
Kind::Pseudo
} else if basetype != 0 {
Kind::Domain(basetype)
let type_ = get_type_rec(client, basetype).await?;
Kind::Domain(type_)
} else if elem_oid != 0 {
let type_ = get_type_rec(client, typecache, elem_oid).await?;
let type_ = get_type_rec(client, elem_oid).await?;
Kind::Array(type_)
} else if relid != 0 {
Kind::Composite(relid)
let fields = get_composite_fields(client, relid).await?;
Kind::Composite(fields)
} else if let Some(rngsubtype) = rngsubtype {
let type_ = get_type_rec(client, typecache, rngsubtype).await?;
let type_ = get_type_rec(client, rngsubtype).await?;
Kind::Range(type_)
} else {
Kind::Simple
};
let type_ = Type::new(name, oid, kind, schema);
typecache.types.insert(oid, type_.clone());
client.set_type(oid, &type_);
Ok(type_)
}
fn get_type_rec<'a>(
client: &'a Arc<InnerClient>,
typecache: &'a mut CachedTypeInfo,
oid: Oid,
) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + 'a>> {
Box::pin(get_type(client, typecache, oid))
Box::pin(get_type(client, oid))
}
async fn typeinfo_statement(
client: &Arc<InnerClient>,
typecache: &mut CachedTypeInfo,
) -> Result<Statement, Error> {
if let Some(stmt) = &typecache.typeinfo {
return Ok(stmt.clone());
async fn typeinfo_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
if let Some(stmt) = client.typeinfo() {
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo";
let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY, &[]).await?;
let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?;
typecache.typeinfo = Some(stmt.clone());
client.set_typeinfo(&stmt);
Ok(stmt)
}
async fn get_enum_variants(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<String>, Error> {
let stmt = typeinfo_enum_statement(client).await?;
query::query(client, stmt, slice_iter(&[&oid]))
.await?
.and_then(|row| async move { row.try_get(0) })
.try_collect()
.await
}
async fn typeinfo_enum_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
if let Some(stmt) = client.typeinfo_enum() {
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo_enum";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?;
client.set_typeinfo_enum(&stmt);
Ok(stmt)
}
async fn get_composite_fields(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<Field>, Error> {
let stmt = typeinfo_composite_statement(client).await?;
let rows = query::query(client, stmt, slice_iter(&[&oid]))
.await?
.try_collect::<Vec<_>>()
.await?;
let mut fields = vec![];
for row in rows {
let name = row.try_get(0)?;
let oid = row.try_get(1)?;
let type_ = get_type_rec(client, oid).await?;
fields.push(Field::new(name, type_));
}
Ok(fields)
}
async fn typeinfo_composite_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
if let Some(stmt) = client.typeinfo_composite() {
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo_composite";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
client.set_typeinfo_composite(&stmt);
Ok(stmt)
}

View File

@@ -72,9 +72,4 @@ impl<'a> Transaction<'a> {
pub fn client(&self) -> &Client {
self.client
}
/// Returns a reference to the underlying `Client`.
pub fn client_mut(&mut self) -> &mut Client {
self.client
}
}

Some files were not shown because too many files have changed in this diff Show More