mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-15 17:32:56 +00:00
Merge WITH CONFLICTS 2025-03-11 main commit '7c462b3417ecd3ae3907f3480f3b8a8c99fc6d7b' into yuchen/direct-io-delta-image-layer-write
Conflicts: pageserver/src/tenant/blob_io.rs
3
.github/actionlint.yml
vendored
@@ -32,3 +32,6 @@ config-variables:
- NEON_DEV_AWS_ACCOUNT_ID
- NEON_PROD_AWS_ACCOUNT_ID
- AWS_ECR_REGION
- BENCHMARK_LARGE_OLTP_PROJECTID
- SLACK_ON_CALL_DEVPROD_STREAM
- SLACK_RUST_CHANNEL_ID
12
.github/actions/neon-branch-create/action.yml
vendored
@@ -84,7 +84,13 @@ runs:
--header "Authorization: Bearer ${API_KEY}"
)

role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
role_name=$(echo "$roles" | jq --raw-output '
(.roles | map(select(.protected == false))) as $roles |
if any($roles[]; .name == "neondb_owner")
then "neondb_owner"
else $roles[0].name
end
')
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
@@ -107,13 +113,13 @@ runs:
)

if [ -z "${reset_password}" ]; then
sleep 1
sleep $i
continue
fi

password=$(echo $reset_password | jq --raw-output '.role.password')
if [ "${password}" == "null" ]; then
sleep 1
sleep $i # increasing backoff
continue
fi
@@ -44,6 +44,11 @@ inputs:
description: 'Postgres version to use for tests'
required: false
default: 'v16'
sanitizers:
description: 'enabled or disabled'
required: false
default: 'disabled'
type: string
benchmark_durations:
description: 'benchmark durations JSON'
required: false
@@ -59,7 +64,7 @@ runs:
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}

@@ -112,6 +117,7 @@ runs:
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FAILED: ${{ inputs.rerun_failed }}
PG_VERSION: ${{ inputs.pg_version }}
SANITIZERS: ${{ inputs.sanitizers }}
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
25
.github/scripts/previous-releases.jq
vendored
Normal file
@@ -0,0 +1,25 @@
# Expects response from https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#list-releases as input,
# with tag names `release` for storage, `release-compute` for compute and `release-proxy` for proxy releases.
# Extract only the `tag_name` field from each release object
[ .[].tag_name ]

# Transform each tag name into a structured object using regex capture
| reduce map(
capture("^(?<full>release(-(?<component>proxy|compute))?-(?<version>\\d+))$")
| {
component: (.component // "storage"), # Default to "storage" if no component is specified
version: (.version | tonumber), # Convert the version number to an integer
full: .full # Store the full tag name for final output
}
)[] as $entry # Loop over the transformed list

# Accumulate the latest (highest-numbered) version for each component
({};
.[$entry.component] |= (if . == null or $entry.version > .version then $entry else . end))

# Convert the resulting object into an array of formatted strings
| to_entries
| map("\(.key)=\(.value.full)")

# Output each string separately
| .[]
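For context (an editorial aside, not part of the commit), a minimal sketch of how this filter could be exercised locally; the release payload below is made up and only carries the tag_name fields the script reads:

  echo '[{"tag_name": "release-8000"}, {"tag_name": "release-7999"}, {"tag_name": "release-proxy-7500"}, {"tag_name": "release-compute-9000"}]' \
    | jq -r -f .github/scripts/previous-releases.jq
  # expected output: the highest-numbered tag per component
  # storage=release-8000
  # proxy=release-proxy-7500
  # compute=release-compute-9000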
@@ -280,7 +280,7 @@ jobs:
- name: Upload Neon artifact
uses: ./.github/actions/upload
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

@@ -347,6 +347,7 @@ jobs:
real_s3_region: eu-central-1
rerun_failed: true
pg_version: ${{ matrix.pg_version }}
sanitizers: ${{ inputs.sanitizers }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
# `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
# Attempt to stop tests gracefully to generate test reports
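As a hedged illustration of that arithmetic (the 90-minute timeout below is an assumed value, not one taken from this diff):

  # timeout-minutes: 90  ->  --session-timeout=$(( (90 - 10) * 60 ))   # i.e. 4800 seconds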
@@ -359,7 +360,6 @@ jobs:
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
SANITIZERS: ${{ inputs.sanitizers }}

# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
107
.github/workflows/_meta.yml
vendored
Normal file
@@ -0,0 +1,107 @@
name: Generate run metadata
on:
workflow_call:
inputs:
github-event-name:
type: string
required: true
outputs:
build-tag:
description: "Tag for the current workflow run"
value: ${{ jobs.tags.outputs.build-tag }}
previous-storage-release:
description: "Tag of the last storage release"
value: ${{ jobs.tags.outputs.storage }}
previous-proxy-release:
description: "Tag of the last proxy release"
value: ${{ jobs.tags.outputs.proxy }}
previous-compute-release:
description: "Tag of the last compute release"
value: ${{ jobs.tags.outputs.compute }}
run-kind:
description: "The kind of run we're currently in. Will be one of `push-main`, `storage-release`, `compute-release`, `proxy-release`, `storage-rc-pr`, `compute-rc-pr`, `proxy-rc-pr`, `pr`, or `workflow-dispatch`"
value: ${{ jobs.tags.outputs.run-kind }}

permissions: {}

jobs:
tags:
runs-on: ubuntu-22.04
outputs:
build-tag: ${{ steps.build-tag.outputs.tag }}
compute: ${{ steps.previous-releases.outputs.compute }}
proxy: ${{ steps.previous-releases.outputs.proxy }}
storage: ${{ steps.previous-releases.outputs.storage }}
run-kind: ${{ steps.run-kind.outputs.run-kind }}
permissions:
contents: read
steps:
# Need `fetch-depth: 0` to count the number of commits in the branch
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Get run kind
id: run-kind
env:
RUN_KIND: >-
${{
false
|| (inputs.github-event-name == 'push' && github.ref_name == 'main') && 'push-main'
|| (inputs.github-event-name == 'push' && github.ref_name == 'release') && 'storage-release'
|| (inputs.github-event-name == 'push' && github.ref_name == 'release-compute') && 'compute-release'
|| (inputs.github-event-name == 'push' && github.ref_name == 'release-proxy') && 'proxy-release'
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release') && 'storage-rc-pr'
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-compute') && 'compute-rc-pr'
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr'
|| (inputs.github-event-name == 'pull_request') && 'pr'
|| (inputs.github-event-name == 'workflow_dispatch') && 'workflow-dispatch'
|| 'unknown'
}}
run: |
echo "run-kind=$RUN_KIND" | tee -a $GITHUB_OUTPUT

- name: Get build tag
id: build-tag
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }}
CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
RUN_KIND: ${{ steps.run-kind.outputs.run-kind }}
run: |
case $RUN_KIND in
push-main)
echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
;;
storage-release)
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
;;
proxy-release)
echo "tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
;;
compute-release)
echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
;;
pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr)
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
;;
workflow-dispatch)
echo "tag=$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT
;;
*)
echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!"
exit 1
esac

- name: Get the previous release-tags
id: previous-releases
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
gh api --paginate \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/${GITHUB_REPOSITORY}/releases" \
| jq -f .github/scripts/previous-releases.jq -r \
| tee -a "${GITHUB_OUTPUT}"
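As an editorial aside (not part of the diff): the `_meta.yml` workflow above maps the triggering branch to a run kind and a build tag. Below is a hedged local sketch of the same tag scheme, runnable from a checkout; the fallback text in the default arm is illustrative only:

  # Preview the tag this workflow would assign for the current branch (sketch only).
  count=$(git rev-list --count HEAD)
  case "$(git rev-parse --abbrev-ref HEAD)" in
    main)            echo "tag=${count}" ;;
    release)         echo "tag=release-${count}" ;;
    release-proxy)   echo "tag=release-proxy-${count}" ;;
    release-compute) echo "tag=release-compute-${count}" ;;
    *)               echo "tag=<'Build and Test' run id for PRs, or GITHUB_RUN_ID for workflow_dispatch>" ;;
  esac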
@@ -51,7 +51,7 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
sparse-checkout: scripts/push_with_image_map.py
sparse-checkout: .github/scripts/push_with_image_map.py
sparse-checkout-cone-mode: false

- name: Print image-map
@@ -99,6 +99,6 @@ jobs:
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

- name: Copy docker images to target registries
run: python scripts/push_with_image_map.py
run: python3 .github/scripts/push_with_image_map.py
env:
IMAGE_MAP: ${{ inputs.image-map }}
61
.github/workflows/benchmarking.yml
vendored
@@ -140,6 +140,9 @@ jobs:
--ignore test_runner/performance/test_logical_replication.py
--ignore test_runner/performance/test_physical_replication.py
--ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py
--ignore test_runner/performance/test_cumulative_statistics_persistence.py
--ignore test_runner/performance/test_perf_many_relations.py
--ignore test_runner/performance/test_perf_oltp_large_tenant.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -171,6 +174,61 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

cumstats-test:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 17
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-staging"

runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init

steps:
- uses: actions/checkout@v4

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours

- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

- name: Verify that cumulative statistics are preserved
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_cumulative_statistics_persistence.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 3600
pg_version: ${{ env.DEFAULT_PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}

replication-tests:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
permissions:
@@ -398,6 +456,9 @@ jobs:
runs-on: ${{ matrix.runner }}
container:
image: ${{ matrix.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init

# Increase timeout to 8h, default timeout is 6h
346
.github/workflows/build_and_test.yml
vendored
@@ -65,38 +65,11 @@ jobs:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
filters: .github/file-filters.yaml
|
||||
|
||||
tag:
|
||||
meta:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned
|
||||
outputs:
|
||||
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||
|
||||
steps:
|
||||
# Need `fetch-depth: 0` to count the number of commits in the branch
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get build tag
|
||||
run: |
|
||||
echo run:$GITHUB_RUN_ID
|
||||
echo ref:$GITHUB_REF_NAME
|
||||
echo rev:$(git rev-list --count HEAD)
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
|
||||
echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
|
||||
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
shell: bash
|
||||
id: build-tag
|
||||
uses: ./.github/workflows/_meta.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name }}
|
||||
|
||||
build-build-tools-image:
|
||||
needs: [ check-permissions ]
|
||||
@@ -199,7 +172,7 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
build-and-test-locally:
|
||||
needs: [ tag, build-build-tools-image ]
|
||||
needs: [ meta, build-build-tools-image ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -213,7 +186,7 @@ jobs:
|
||||
with:
|
||||
arch: ${{ matrix.arch }}
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
build-tag: ${{ needs.tag.outputs.build-tag }}
|
||||
build-tag: ${{ needs.meta.outputs.build-tag }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds.
|
||||
# Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled.
|
||||
@@ -497,13 +470,24 @@ jobs:
|
||||
})
|
||||
|
||||
trigger-e2e-tests:
|
||||
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }}
|
||||
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ]
|
||||
# Depends on jobs that can get skipped
|
||||
if: >-
|
||||
${{
|
||||
(
|
||||
!github.event.pull_request.draft
|
||||
|| contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft')
|
||||
|| contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind)
|
||||
) && !failure() && !cancelled()
|
||||
}}
|
||||
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ]
|
||||
uses: ./.github/workflows/trigger-e2e-tests.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name }}
|
||||
secrets: inherit
|
||||
|
||||
neon-image-arch:
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
needs: [ check-permissions, build-build-tools-image, meta ]
|
||||
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
@@ -539,7 +523,7 @@ jobs:
|
||||
build-args: |
|
||||
ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }}
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
BUILD_TAG=${{ needs.meta.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm
|
||||
DEBIAN_VERSION=bookworm
|
||||
provenance: false
|
||||
@@ -549,10 +533,11 @@ jobs:
|
||||
cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }}
|
||||
tags: |
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }}
|
||||
neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-${{ matrix.arch }}
|
||||
|
||||
neon-image:
|
||||
needs: [ neon-image-arch, tag ]
|
||||
needs: [ neon-image-arch, meta ]
|
||||
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
@@ -567,13 +552,14 @@ jobs:
|
||||
|
||||
- name: Create multi-arch image
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
|
||||
-t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64
|
||||
docker buildx imagetools create -t neondatabase/neon:${{ needs.meta.outputs.build-tag }} \
|
||||
-t neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm \
|
||||
neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-x64 \
|
||||
neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64
|
||||
|
||||
compute-node-image-arch:
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
needs: [ check-permissions, build-build-tools-image, meta ]
|
||||
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
@@ -631,7 +617,7 @@ jobs:
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
PG_VERSION=${{ matrix.version.pg }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
BUILD_TAG=${{ needs.meta.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
|
||||
DEBIAN_VERSION=${{ matrix.version.debian }}
|
||||
provenance: false
|
||||
@@ -641,7 +627,7 @@ jobs:
|
||||
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
|
||||
tags: |
|
||||
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
|
||||
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
|
||||
|
||||
- name: Build neon extensions test image
|
||||
if: matrix.version.pg >= 'v16'
|
||||
@@ -651,7 +637,7 @@ jobs:
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
PG_VERSION=${{ matrix.version.pg }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
BUILD_TAG=${{ needs.meta.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
|
||||
DEBIAN_VERSION=${{ matrix.version.debian }}
|
||||
provenance: false
|
||||
@@ -661,10 +647,11 @@ jobs:
|
||||
target: extension-tests
|
||||
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
|
||||
tags: |
|
||||
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
|
||||
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.meta.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
|
||||
|
||||
compute-node-image:
|
||||
needs: [ compute-node-image-arch, tag ]
|
||||
needs: [ compute-node-image-arch, meta ]
|
||||
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
@@ -692,27 +679,28 @@ jobs:
|
||||
|
||||
- name: Create multi-arch compute-node image
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
|
||||
-t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
|
||||
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
|
||||
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
|
||||
docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
|
||||
-t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \
|
||||
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
|
||||
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
|
||||
|
||||
- name: Create multi-arch neon-test-extensions image
|
||||
if: matrix.version.pg >= 'v16'
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
|
||||
-t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
|
||||
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
|
||||
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
|
||||
docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
|
||||
-t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \
|
||||
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
|
||||
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
|
||||
|
||||
vm-compute-node-image:
|
||||
needs: [ check-permissions, tag, compute-node-image ]
|
||||
runs-on: [ self-hosted, large ]
|
||||
vm-compute-node-image-arch:
|
||||
needs: [ check-permissions, meta, compute-node-image ]
|
||||
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ amd64, arm64 ]
|
||||
version:
|
||||
# see the comment for `compute-node-image-arch` job
|
||||
- pg: v14
|
||||
debian: bullseye
|
||||
- pg: v15
|
||||
@@ -722,14 +710,14 @@ jobs:
|
||||
- pg: v17
|
||||
debian: bookworm
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.37.1
|
||||
VM_BUILDER_VERSION: v0.42.2
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Downloading vm-builder
|
||||
run: |
|
||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder
|
||||
chmod +x vm-builder
|
||||
|
||||
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
|
||||
@@ -742,22 +730,50 @@ jobs:
|
||||
# it won't have the proper authentication (written at v0.6.0)
|
||||
- name: Pulling compute-node image
|
||||
run: |
|
||||
docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
|
||||
docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}
|
||||
|
||||
- name: Build vm image
|
||||
run: |
|
||||
./vm-builder \
|
||||
-size=2G \
|
||||
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
|
||||
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
|
||||
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
|
||||
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
|
||||
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \
|
||||
-target-arch=linux/${{ matrix.arch }}
|
||||
|
||||
- name: Pushing vm-compute-node image
|
||||
run: |
|
||||
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
|
||||
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
vm-compute-node-image:
|
||||
needs: [ vm-compute-node-image-arch, meta ]
|
||||
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
runs-on: ubuntu-22.04
|
||||
strategy:
|
||||
matrix:
|
||||
version:
|
||||
# see the comment for `compute-node-image-arch` job
|
||||
- pg: v14
|
||||
- pg: v15
|
||||
- pg: v16
|
||||
- pg: v17
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Create multi-arch compute-node image
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
|
||||
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \
|
||||
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64
|
||||
|
||||
|
||||
test-images:
|
||||
needs: [ check-permissions, tag, neon-image, compute-node-image ]
|
||||
needs: [ check-permissions, meta, neon-image, compute-node-image ]
|
||||
# Depends on jobs that can get skipped
|
||||
if: "!failure() && !cancelled()"
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -775,17 +791,6 @@ jobs:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Get the last compute release tag
|
||||
id: get-last-compute-release-tag
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"/repos/${{ github.repository }}/releases")
|
||||
echo tag=${tag} >> ${GITHUB_OUTPUT}
|
||||
|
||||
# `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
|
||||
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
|
||||
# Regular pageserver version string looks like
|
||||
@@ -795,8 +800,9 @@ jobs:
|
||||
# Ensure that we don't have bad versions.
|
||||
- name: Verify image versions
|
||||
shell: bash # ensure no set -e for better error messages
|
||||
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
run: |
|
||||
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
|
||||
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
|
||||
|
||||
echo "Pageserver version string: $pageserver_version"
|
||||
|
||||
@@ -813,7 +819,24 @@ jobs:
|
||||
- name: Verify docker-compose example and test extensions
|
||||
timeout-minutes: 20
|
||||
env:
|
||||
TAG: ${{needs.tag.outputs.build-tag}}
|
||||
TAG: >-
|
||||
${{
|
||||
contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)
|
||||
&& needs.meta.outputs.previous-storage-release
|
||||
|| needs.meta.outputs.build-tag
|
||||
}}
|
||||
COMPUTE_TAG: >-
|
||||
${{
|
||||
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
|
||||
&& needs.meta.outputs.previous-compute-release
|
||||
|| needs.meta.outputs.build-tag
|
||||
}}
|
||||
TEST_EXTENSIONS_TAG: >-
|
||||
${{
|
||||
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
|
||||
&& 'latest'
|
||||
|| needs.meta.outputs.build-tag
|
||||
}}
|
||||
TEST_VERSION_ONLY: ${{ matrix.pg_version }}
|
||||
run: ./docker-compose/docker_compose_test.sh
|
||||
|
||||
@@ -825,10 +848,17 @@ jobs:
|
||||
|
||||
- name: Test extension upgrade
|
||||
timeout-minutes: 20
|
||||
if: ${{ needs.tag.outputs.build-tag == github.run_id }}
|
||||
if: ${{ contains(fromJSON('["pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
env:
|
||||
NEWTAG: ${{ needs.tag.outputs.build-tag }}
|
||||
OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
|
||||
TAG: >-
|
||||
${{
|
||||
false
|
||||
|| needs.meta.outputs.run-kind == 'pr' && needs.meta.outputs.build-tag
|
||||
|| needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release
|
||||
}}
|
||||
TEST_EXTENSIONS_TAG: ${{ needs.meta.outputs.previous-compute-release }}
|
||||
NEW_COMPUTE_TAG: ${{ needs.meta.outputs.build-tag }}
|
||||
OLD_COMPUTE_TAG: ${{ needs.meta.outputs.previous-compute-release }}
|
||||
run: ./docker-compose/test_extensions_upgrade.sh
|
||||
|
||||
- name: Print logs and clean up
|
||||
@@ -838,7 +868,7 @@ jobs:
|
||||
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down
|
||||
|
||||
generate-image-maps:
|
||||
needs: [ tag ]
|
||||
needs: [ meta ]
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
neon-dev: ${{ steps.generate.outputs.neon-dev }}
|
||||
@@ -848,14 +878,14 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
sparse-checkout: scripts/generate_image_maps.py
|
||||
sparse-checkout: .github/scripts/generate_image_maps.py
|
||||
sparse-checkout-cone-mode: false
|
||||
|
||||
- name: Generate Image Maps
|
||||
id: generate
|
||||
run: python scripts/generate_image_maps.py
|
||||
run: python3 .github/scripts/generate_image_maps.py
|
||||
env:
|
||||
BUILD_TAG: "${{ needs.tag.outputs.build-tag }}"
|
||||
BUILD_TAG: "${{ needs.meta.outputs.build-tag }}"
|
||||
BRANCH: "${{ github.ref_name }}"
|
||||
DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}"
|
||||
PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}"
|
||||
@@ -864,7 +894,8 @@ jobs:
|
||||
AWS_REGION: "${{ vars.AWS_ECR_REGION }}"
|
||||
|
||||
push-neon-image-dev:
|
||||
needs: [ generate-image-maps, neon-image ]
|
||||
needs: [ meta, generate-image-maps, neon-image ]
|
||||
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
uses: ./.github/workflows/_push-to-container-registry.yml
|
||||
permissions:
|
||||
id-token: write # Required for aws/azure login
|
||||
@@ -881,7 +912,8 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
push-compute-image-dev:
|
||||
needs: [ generate-image-maps, vm-compute-node-image ]
|
||||
needs: [ meta, generate-image-maps, vm-compute-node-image ]
|
||||
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
uses: ./.github/workflows/_push-to-container-registry.yml
|
||||
permissions:
|
||||
id-token: write # Required for aws/azure login
|
||||
@@ -898,8 +930,9 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
push-neon-image-prod:
|
||||
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
|
||||
needs: [ generate-image-maps, neon-image, test-images ]
|
||||
needs: [ meta, generate-image-maps, neon-image, test-images ]
|
||||
# Depends on jobs that can get skipped
|
||||
if: ${{ !failure() && !cancelled() && contains(fromJSON('["storage-release", "proxy-release"]'), needs.meta.outputs.run-kind) }}
|
||||
uses: ./.github/workflows/_push-to-container-registry.yml
|
||||
permissions:
|
||||
id-token: write # Required for aws/azure login
|
||||
@@ -916,8 +949,9 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
push-compute-image-prod:
|
||||
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
|
||||
needs: [ generate-image-maps, vm-compute-node-image, test-images ]
|
||||
needs: [ meta, generate-image-maps, vm-compute-node-image, test-images ]
|
||||
# Depends on jobs that can get skipped
|
||||
if: ${{ !failure() && !cancelled() && needs.meta.outputs.run-kind == 'compute-release' }}
|
||||
uses: ./.github/workflows/_push-to-container-registry.yml
|
||||
permissions:
|
||||
id-token: write # Required for aws/azure login
|
||||
@@ -936,18 +970,19 @@ jobs:
|
||||
# This is a bit of a special case so we're not using a generated image map.
|
||||
add-latest-tag-to-neon-extensions-test-image:
|
||||
if: github.ref_name == 'main'
|
||||
needs: [ tag, compute-node-image ]
|
||||
needs: [ meta, compute-node-image ]
|
||||
uses: ./.github/workflows/_push-to-container-registry.yml
|
||||
with:
|
||||
image-map: |
|
||||
{
|
||||
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"],
|
||||
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"]
|
||||
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"],
|
||||
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"]
|
||||
}
|
||||
secrets: inherit
|
||||
|
||||
trigger-custom-extensions-build-and-wait:
|
||||
needs: [ check-permissions, tag ]
|
||||
needs: [ check-permissions, meta ]
|
||||
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
@@ -982,7 +1017,7 @@ jobs:
|
||||
\"ci_job_name\": \"build-and-upload-extensions\",
|
||||
\"commit_hash\": \"$COMMIT_SHA\",
|
||||
\"remote_repo\": \"${{ github.repository }}\",
|
||||
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
|
||||
\"compute_image_tag\": \"${{ needs.meta.outputs.build-tag }}\",
|
||||
\"remote_branch_name\": \"${{ github.ref_name }}\"
|
||||
}
|
||||
}"
|
||||
@@ -1026,9 +1061,9 @@ jobs:
|
||||
exit 1
|
||||
|
||||
deploy:
|
||||
needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
|
||||
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
|
||||
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()
|
||||
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
|
||||
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
|
||||
if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
@@ -1039,108 +1074,103 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Create git tag and GitHub release
|
||||
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
|
||||
if: ${{ contains(fromJSON('["storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) }}
|
||||
uses: actions/github-script@v7
|
||||
env:
|
||||
TAG: "${{ needs.meta.outputs.build-tag }}"
|
||||
BRANCH: "${{ github.ref_name }}"
|
||||
PREVIOUS_RELEASE: >-
|
||||
${{
|
||||
false
|
||||
|| needs.meta.outputs.run-kind == 'storage-release' && needs.meta.outputs.previous-storage-release
|
||||
|| needs.meta.outputs.run-kind == 'proxy-release' && needs.meta.outputs.previous-proxy-release
|
||||
|| needs.meta.outputs.run-kind == 'compute-release' && needs.meta.outputs.previous-compute-release
|
||||
|| 'unknown'
|
||||
}}
|
||||
with:
|
||||
retries: 5
|
||||
script: |
|
||||
const tag = "${{ needs.tag.outputs.build-tag }}";
|
||||
const branch = "${{ github.ref_name }}";
|
||||
const { TAG, BRANCH, PREVIOUS_RELEASE } = process.env
|
||||
|
||||
try {
|
||||
const existingRef = await github.rest.git.getRef({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
ref: `tags/${tag}`,
|
||||
ref: `tags/${TAG}`,
|
||||
});
|
||||
|
||||
if (existingRef.data.object.sha !== context.sha) {
|
||||
throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`);
|
||||
throw new Error(`Tag ${TAG} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`);
|
||||
}
|
||||
|
||||
console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`);
|
||||
console.log(`Tag ${TAG} already exists and points to ${context.sha} as expected.`);
|
||||
} catch (error) {
|
||||
if (error.status !== 404) {
|
||||
throw error;
|
||||
}
|
||||
|
||||
console.log(`Tag ${tag} does not exist. Creating it...`);
|
||||
console.log(`Tag ${TAG} does not exist. Creating it...`);
|
||||
await github.rest.git.createRef({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
ref: `refs/tags/${tag}`,
|
||||
ref: `refs/tags/${TAG}`,
|
||||
sha: context.sha,
|
||||
});
|
||||
console.log(`Tag ${tag} created successfully.`);
|
||||
console.log(`Tag ${TAG} created successfully.`);
|
||||
}
|
||||
|
||||
try {
|
||||
const existingRelease = await github.rest.repos.getReleaseByTag({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
tag: tag,
|
||||
tag: TAG,
|
||||
});
|
||||
|
||||
console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`);
|
||||
console.log(`Release for tag ${TAG} already exists (ID: ${existingRelease.data.id}).`);
|
||||
} catch (error) {
|
||||
if (error.status !== 404) {
|
||||
throw error;
|
||||
}
|
||||
|
||||
console.log(`Release for tag ${tag} does not exist. Creating it...`);
|
||||
console.log(`Release for tag ${TAG} does not exist. Creating it...`);
|
||||
|
||||
// Find the PR number using the commit SHA
|
||||
const pullRequests = await github.rest.pulls.list({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
state: 'closed',
|
||||
base: branch,
|
||||
base: BRANCH,
|
||||
});
|
||||
|
||||
const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha);
|
||||
const prNumber = pr ? pr.number : null;
|
||||
|
||||
// Find the previous release on the branch
|
||||
const releases = await github.rest.repos.listReleases({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
per_page: 100,
|
||||
});
|
||||
|
||||
const branchReleases = releases.data
|
||||
.filter((release) => {
|
||||
const regex = new RegExp(`^${branch}-\\d+$`);
|
||||
return regex.test(release.tag_name) && !release.draft && !release.prerelease;
|
||||
})
|
||||
.sort((a, b) => new Date(b.created_at) - new Date(a.created_at));
|
||||
|
||||
const previousTag = branchReleases.length > 0 ? branchReleases[0].tag_name : null;
|
||||
|
||||
const releaseNotes = [
|
||||
prNumber
|
||||
? `Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.`
|
||||
: 'Release PR not found.',
|
||||
previousTag
|
||||
? `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${previousTag}...${tag}.`
|
||||
: `No previous release found on branch ${branch}.`,
|
||||
`Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${PREVIOUS_RELEASE}...${TAG}.`
|
||||
].join('\n\n');
|
||||
|
||||
await github.rest.repos.createRelease({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
tag_name: tag,
|
||||
tag_name: TAG,
|
||||
body: releaseNotes,
|
||||
});
|
||||
console.log(`Release for tag ${tag} created successfully.`);
|
||||
console.log(`Release for tag ${TAG} created successfully.`);
|
||||
}
|
||||
|
||||
- name: Trigger deploy workflow
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
RUN_KIND: ${{ needs.meta.outputs.run-kind }}
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
case ${RUN_KIND} in
|
||||
push-main)
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.meta.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
;;
|
||||
storage-release)
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
@@ -1148,7 +1178,7 @@ jobs:
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f dockerTag=${{needs.meta.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
|
||||
@@ -1156,8 +1186,9 @@ jobs:
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
-f dockerTag=${{needs.meta.outputs.build-tag}}
|
||||
;;
|
||||
proxy-release)
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
@@ -1165,7 +1196,7 @@ jobs:
|
||||
-f deployStorageBroker=false \
|
||||
-f deployStorageController=false \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f dockerTag=${{needs.meta.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
|
||||
@@ -1175,13 +1206,16 @@ jobs:
|
||||
-f deployProxyScram=true \
|
||||
-f deployProxyAuthBroker=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
|
||||
gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'"
|
||||
-f dockerTag=${{needs.meta.outputs.build-tag}}
|
||||
;;
|
||||
compute-release)
|
||||
gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.meta.outputs.build-tag}}
|
||||
;;
|
||||
*)
|
||||
echo "RUN_KIND (value '${RUN_KIND}') is not set to either 'push-main', 'storage-release', 'proxy-release' or 'compute-release'"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
notify-storage-release-deploy-failure:
|
||||
needs: [ deploy ]
|
||||
@@ -1197,7 +1231,7 @@ jobs:
|
||||
payload: |
|
||||
channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }}
|
||||
text: |
|
||||
🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
|
||||
🔴 <!subteam^S06CJ87UMNY|@oncall-storage>: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
|
||||
|
||||
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
|
||||
promote-compatibility-data:
|
||||
@@ -1206,7 +1240,7 @@ jobs:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: read
|
||||
# `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
|
||||
# `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
|
||||
if: github.ref_name == 'release' && !failure() && !cancelled()
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -1296,7 +1330,8 @@ jobs:
|
||||
|
||||
pin-build-tools-image:
|
||||
needs: [ build-build-tools-image, test-images, build-and-test-locally ]
|
||||
if: github.ref_name == 'main'
|
||||
# `!failure() && !cancelled()` is required because the job (transitively) depends on jobs that can be skipped
|
||||
if: github.ref_name == 'main' && !failure() && !cancelled()
|
||||
uses: ./.github/workflows/pin-build-tools-image.yml
|
||||
with:
|
||||
from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
@@ -1315,6 +1350,7 @@ jobs:
|
||||
# Format `needs` differently to make the list more readable.
|
||||
# Usually we do `needs: [...]`
|
||||
needs:
|
||||
- meta
|
||||
- build-and-test-locally
|
||||
- check-codestyle-python
|
||||
- check-codestyle-rust
|
||||
@@ -1338,7 +1374,7 @@ jobs:
|
||||
|| needs.check-codestyle-python.result == 'skipped'
|
||||
|| needs.check-codestyle-rust.result == 'skipped'
|
||||
|| needs.files-changed.result == 'skipped'
|
||||
|| needs.push-compute-image-dev.result == 'skipped'
|
||||
|| needs.push-neon-image-dev.result == 'skipped'
|
||||
|| (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|
||||
|| (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind))
|
||||
|| needs.test-images.result == 'skipped'
|
||||
|| needs.trigger-custom-extensions-build-and-wait.result == 'skipped'
|
||||
|| (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|
||||
|
||||
7
.github/workflows/cargo-deny.yml
vendored
@@ -7,7 +7,7 @@ on:
required: false
type: string
schedule:
- cron: '0 0 * * *'
- cron: '0 10 * * *'

jobs:
cargo-deny:
@@ -50,8 +50,9 @@ jobs:
method: chat.postMessage
token: ${{ secrets.SLACK_BOT_TOKEN }}
payload: |
channel: ${{ vars.SLACK_CICD_CHANNEL_ID }}
channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
text: |
Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
Pinging @oncall-devprod.
Fixing the problem should be fairly straight forward from the logs. If not, <#${{ vars.SLACK_RUST_CHANNEL_ID }}> is there to help.
Pinging <!subteam^S0838JPSH32|@oncall-devprod>.
3
.github/workflows/cloud-regress.yml
vendored
@@ -38,6 +38,9 @@ jobs:
runs-on: us-east-2
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init

steps:

@@ -52,8 +52,9 @@ jobs:
- name: Test extension upgrade
timeout-minutes: 20
env:
NEWTAG: latest
OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
NEW_COMPUTE_TAG: latest
OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
PG_VERSION: ${{ matrix.pg-version }}
FORCE_ALL_UPGRADE_TESTS: true
run: ./docker-compose/test_extensions_upgrade.sh
147
.github/workflows/large_oltp_benchmark.yml
vendored
Normal file
@@ -0,0 +1,147 @@
|
||||
name: large oltp benchmark
|
||||
|
||||
on:
|
||||
# uncomment to run on push for debugging your PR
|
||||
push:
|
||||
branches: [ bodobolero/synthetic_oltp_workload ]
|
||||
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
|
||||
workflow_dispatch: # adds ability to run this manually
|
defaults:
  run:
    shell: bash -euxo pipefail {0}

concurrency:
  # Allow only one workflow globally because we need dedicated resources which only exist once
  group: large-oltp-bench-workflow
  cancel-in-progress: true

jobs:
  oltp:
    strategy:
      fail-fast: false # allow other variants to continue even if one fails
      matrix:
        include:
          - target: new_branch
            custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
          - target: reuse_branch
            custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
      max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
    permissions:
      contents: write
      statuses: write
      id-token: write # aws-actions/configure-aws-credentials
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h
      TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }}
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      PG_VERSION: 16 # pre-determined by the pre-created project
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
      PLATFORM: ${{ matrix.target }}

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
      image: neondatabase/build-tools:pinned-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    # Increase timeout to 8h, default timeout is 6h
    timeout-minutes: 480

    steps:
      - uses: actions/checkout@v4

      - name: Configure AWS credentials # necessary to download artefacts
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: eu-central-1
          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
          role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role

      - name: Download Neon artifact
        uses: ./.github/actions/download
        with:
          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Create Neon Branch for large tenant
        if: ${{ matrix.target == 'new_branch' }}
        id: create-neon-branch-oltp-target
        uses: ./.github/actions/neon-branch-create
        with:
          project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}

      - name: Set up Connection String
        id: set-up-connstr
        run: |
          case "${{ matrix.target }}" in
            new_branch)
              CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
              ;;
            reuse_branch)
              CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
              ;;
            *)
              echo >&2 "Unknown target=${{ matrix.target }}"
              exit 1
              ;;
          esac

          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

      - name: Benchmark pgbench with custom-scripts
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: ${{ env.BUILD_TYPE }}
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
          pg_version: ${{ env.PG_VERSION }}
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        env:
          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

      - name: Delete Neon Branch for large tenant
        if: ${{ always() && matrix.target == 'new_branch' }}
        uses: ./.github/actions/neon-branch-delete
        with:
          project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
          branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}

      - name: Create Allure report
        id: create-allure-report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate
        with:
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
          slack-message: |
            Periodic large oltp perf testing: ${{ job.status }}
            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
            <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
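For reference, the custom_scripts weights above rely on pgbench's `filename@weight` syntax; a minimal sketch of the invocation the benchmark ends up running, assuming the test harness forwards the list to pgbench unchanged (the duration flag here is illustrative, matching the 1h matrix value):

    pgbench "$BENCHMARK_CONNSTR" -T 3600 \
      -f insert_webhooks.sql@2 \
      -f select_any_webhook_with_skew.sql@4 \
      -f select_recent_webhook.sql@4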
2
.github/workflows/neon_extra_builds.yml
vendored
@@ -71,7 +71,7 @@ jobs:
|
||||
uses: ./.github/workflows/build-macos.yml
|
||||
with:
|
||||
pg_versions: ${{ needs.files-changed.outputs.postgres_changes }}
|
||||
rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }}
|
||||
rebuild_rust_code: ${{ fromJson(needs.files-changed.outputs.rebuild_rust_code) }}
|
||||
rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }}
|
||||
|
||||
gather-rust-build-stats:
|
||||
|
||||
16
.github/workflows/periodic_pagebench.yml
vendored
@@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 */3 * * *' # Runs every 3 hours
|
||||
workflow_dispatch: # Allows manual triggering of the workflow
|
||||
inputs:
|
||||
commit_hash:
|
||||
@@ -78,8 +78,10 @@ jobs:
|
||||
run: |
|
||||
if [ -z "$INPUT_COMMIT_HASH" ]; then
|
||||
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
|
||||
echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
|
||||
else
|
||||
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
|
||||
echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
- name: Start Bench with run_id
|
||||
@@ -89,7 +91,7 @@ jobs:
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H "Authorization: Bearer $API_KEY" \
|
||||
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
|
||||
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"
|
||||
|
||||
- name: Poll Test Status
|
||||
id: poll_step
|
||||
|
||||
59
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -5,6 +5,10 @@ on:
|
||||
types:
|
||||
- ready_for_review
|
||||
workflow_call:
|
||||
inputs:
|
||||
github-event-name:
|
||||
type: string
|
||||
required: true
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -19,7 +23,7 @@ jobs:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name }}
|
||||
github-event-name: ${{ inputs.github-event-name || github.event_name }}
|
||||
|
||||
cancel-previous-e2e-tests:
|
||||
needs: [ check-permissions ]
|
||||
@@ -35,46 +39,29 @@ jobs:
|
||||
run cancel-previous-in-concurrency-group.yml \
|
||||
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
|
||||
|
||||
tag:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
build-tag: ${{ steps.build-tag.outputs.tag }}
|
||||
|
||||
steps:
|
||||
# Need `fetch-depth: 0` to count the number of commits in the branch
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get build tag
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }}
|
||||
CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
|
||||
echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
|
||||
echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
|
||||
fi
|
||||
id: build-tag
|
||||
meta:
|
||||
uses: ./.github/workflows/_meta.yml
|
||||
with:
|
||||
github-event-name: ${{ inputs.github-event-name || github.event_name }}
|
||||
|
||||
trigger-e2e-tests:
|
||||
needs: [ tag ]
|
||||
needs: [ meta ]
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
EVENT_ACTION: ${{ github.event.action }}
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
TAG: >-
|
||||
${{
|
||||
contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)
|
||||
&& needs.meta.outputs.previous-storage-release
|
||||
|| needs.meta.outputs.build-tag
|
||||
}}
|
||||
COMPUTE_TAG: >-
|
||||
${{
|
||||
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
|
||||
&& needs.meta.outputs.previous-compute-release
|
||||
|| needs.meta.outputs.build-tag
|
||||
}}
|
||||
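      # In effect: compute release/RC runs pin TAG (the storage image) to the previous storage
      # release, storage and proxy release/RC runs pin COMPUTE_TAG to the previous compute
      # release, and every other run-kind uses the freshly built tag for both images.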
steps:
|
||||
- name: Wait for `push-{neon,compute}-image-dev` job to finish
|
||||
# It's important to have a timeout here; otherwise the script in this step can run indefinitely
|
||||
@@ -157,6 +144,6 @@ jobs:
|
||||
--raw-field "commit_hash=$COMMIT_SHA" \
|
||||
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
|
||||
--raw-field "storage_image_tag=${TAG}" \
|
||||
--raw-field "compute_image_tag=${TAG}" \
|
||||
--raw-field "compute_image_tag=${COMPUTE_TAG}" \
|
||||
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
|
||||
--raw-field "e2e-platforms=${E2E_PLATFORMS}"
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# Autoscaling
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||
|
||||
# DevProd
|
||||
/.github/ @neondatabase/developer-productivity
|
||||
# DevProd & PerfCorr
|
||||
/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness
|
||||
|
||||
# Compute
|
||||
/pgxn/ @neondatabase/compute
|
||||
|
||||
347
Cargo.lock
generated
@@ -783,6 +783,28 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-extra"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
|
||||
dependencies = [
|
||||
"axum",
|
||||
"axum-core",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"headers",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"serde",
|
||||
"tower 0.5.2",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "azure_core"
|
||||
version = "0.21.0"
|
||||
@@ -925,9 +947,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.21.1"
|
||||
version = "0.21.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105"
|
||||
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
@@ -984,9 +1006,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "bindgen"
|
||||
version = "0.70.1"
|
||||
version = "0.71.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
|
||||
checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3"
|
||||
dependencies = [
|
||||
"bitflags 2.8.0",
|
||||
"cexpr",
|
||||
@@ -997,7 +1019,7 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
"rustc-hash 2.1.1",
|
||||
"shlex",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
@@ -1105,9 +1127,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.1.30"
|
||||
version = "1.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945"
|
||||
checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c"
|
||||
dependencies = [
|
||||
"jobserver",
|
||||
"libc",
|
||||
@@ -1305,6 +1327,7 @@ dependencies = [
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-types",
|
||||
"axum",
|
||||
"axum-extra",
|
||||
"base64 0.13.1",
|
||||
"bytes",
|
||||
"camino",
|
||||
@@ -1316,6 +1339,7 @@ dependencies = [
|
||||
"flate2",
|
||||
"futures",
|
||||
"http 1.1.0",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"notify",
|
||||
@@ -1342,7 +1366,9 @@ dependencies = [
|
||||
"tokio-util",
|
||||
"tower 0.5.2",
|
||||
"tower-http",
|
||||
"tower-otel",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"url",
|
||||
@@ -2295,7 +2321,7 @@ name = "framed-websockets"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"bytemuck",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
@@ -2408,9 +2434,9 @@ checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
|
||||
|
||||
[[package]]
|
||||
name = "futures-timer"
|
||||
version = "3.0.2"
|
||||
version = "3.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
|
||||
checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24"
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
@@ -2513,6 +2539,27 @@ version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
|
||||
|
||||
[[package]]
|
||||
name = "governor"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"dashmap 6.1.0",
|
||||
"futures-sink",
|
||||
"futures-timer",
|
||||
"futures-util",
|
||||
"no-std-compat",
|
||||
"nonzero_ext",
|
||||
"parking_lot 0.12.1",
|
||||
"portable-atomic",
|
||||
"quanta",
|
||||
"rand 0.8.5",
|
||||
"smallvec",
|
||||
"spinning_top",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "group"
|
||||
version = "0.12.1"
|
||||
@@ -2630,7 +2677,7 @@ version = "7.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"byteorder",
|
||||
"crossbeam-channel",
|
||||
"flate2",
|
||||
@@ -2638,6 +2685,30 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "headers"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9"
|
||||
dependencies = [
|
||||
"base64 0.21.7",
|
||||
"bytes",
|
||||
"headers-core",
|
||||
"http 1.1.0",
|
||||
"httpdate",
|
||||
"mime",
|
||||
"sha1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "headers-core"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4"
|
||||
dependencies = [
|
||||
"http 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
@@ -2775,12 +2846,10 @@ name = "http-utils"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"backtrace",
|
||||
"bytes",
|
||||
"fail",
|
||||
"flate2",
|
||||
"futures",
|
||||
"hyper 0.14.30",
|
||||
"inferno 0.12.0",
|
||||
"itertools 0.10.5",
|
||||
"jemalloc_pprof",
|
||||
"metrics",
|
||||
@@ -2793,6 +2862,7 @@ dependencies = [
|
||||
"serde_path_to_error",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
@@ -3279,9 +3349,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
|
||||
|
||||
[[package]]
|
||||
name = "jemalloc_pprof"
|
||||
version = "0.6.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb"
|
||||
checksum = "5622af6d21ff86ed7797ef98e11b8f302da25ec69a7db9f6cde8e2e1c8df9992"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"libc",
|
||||
@@ -3365,7 +3435,7 @@ version = "9.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"js-sys",
|
||||
"pem",
|
||||
"ring",
|
||||
@@ -3480,9 +3550,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "mappings"
|
||||
version = "0.6.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e"
|
||||
checksum = "e434981a332777c2b3062652d16a55f8e74fa78e6b1882633f0d77399c84fc2a"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"libc",
|
||||
@@ -3535,7 +3605,7 @@ dependencies = [
|
||||
"measured-derive",
|
||||
"memchr",
|
||||
"parking_lot 0.12.1",
|
||||
"rustc-hash",
|
||||
"rustc-hash 1.1.0",
|
||||
"ryu",
|
||||
]
|
||||
|
||||
@@ -3723,6 +3793,12 @@ dependencies = [
|
||||
"memoffset 0.9.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "no-std-compat"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c"
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "7.1.3"
|
||||
@@ -3733,6 +3809,12 @@ dependencies = [
|
||||
"minimal-lexical",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nonzero_ext"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21"
|
||||
|
||||
[[package]]
|
||||
name = "notify"
|
||||
version = "8.0.0"
|
||||
@@ -3982,7 +4064,7 @@ dependencies = [
|
||||
"opentelemetry-http",
|
||||
"opentelemetry-proto",
|
||||
"opentelemetry_sdk",
|
||||
"prost",
|
||||
"prost 0.13.3",
|
||||
"reqwest",
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
@@ -3995,7 +4077,7 @@ checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
|
||||
dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
"prost",
|
||||
"prost 0.13.3",
|
||||
"tonic",
|
||||
]
|
||||
|
||||
@@ -4109,6 +4191,7 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
@@ -4198,6 +4281,9 @@ dependencies = [
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"rpds",
|
||||
"rustls 0.23.18",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls-pki-types",
|
||||
"scopeguard",
|
||||
"send-future",
|
||||
"serde",
|
||||
@@ -4216,6 +4302,7 @@ dependencies = [
|
||||
"tokio-epoll-uring",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
@@ -4223,6 +4310,7 @@ dependencies = [
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
"wal_decoder",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
@@ -4305,9 +4393,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "papaya"
|
||||
version = "0.1.8"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
|
||||
checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"seize",
|
||||
@@ -4435,7 +4523,7 @@ version = "3.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"serde",
|
||||
]
|
||||
|
||||
@@ -4484,18 +4572,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pin-project"
|
||||
version = "1.1.0"
|
||||
version = "1.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead"
|
||||
checksum = "dfe2e71e1471fe07709406bf725f710b02927c9c54b2b5b2ec0e8087d97c327d"
|
||||
dependencies = [
|
||||
"pin-project-internal",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-internal"
|
||||
version = "1.1.0"
|
||||
version = "1.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
|
||||
checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -4589,6 +4677,12 @@ dependencies = [
|
||||
"never-say-never",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
|
||||
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.7"
|
||||
@@ -4743,8 +4837,10 @@ dependencies = [
|
||||
"nix 0.26.4",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"protobuf",
|
||||
"protobuf-codegen-pure",
|
||||
"prost 0.12.6",
|
||||
"prost-build 0.12.6",
|
||||
"prost-derive 0.12.6",
|
||||
"sha2",
|
||||
"smallvec",
|
||||
"symbolic-demangle",
|
||||
"tempfile",
|
||||
@@ -4753,15 +4849,17 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pprof_util"
|
||||
version = "0.6.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781"
|
||||
checksum = "9fa015c78eed2130951e22c58d2095849391e73817ab2e74f71b0b9f63dd8416"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"backtrace",
|
||||
"flate2",
|
||||
"inferno 0.12.0",
|
||||
"num",
|
||||
"paste",
|
||||
"prost",
|
||||
"prost 0.13.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4854,6 +4952,16 @@ dependencies = [
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost"
|
||||
version = "0.12.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost-derive 0.12.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost"
|
||||
version = "0.13.3"
|
||||
@@ -4861,7 +4969,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost-derive",
|
||||
"prost-derive 0.13.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-build"
|
||||
version = "0.12.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck",
|
||||
"itertools 0.10.5",
|
||||
"log",
|
||||
"multimap",
|
||||
"once_cell",
|
||||
"petgraph",
|
||||
"prettyplease",
|
||||
"prost 0.12.6",
|
||||
"prost-types 0.12.6",
|
||||
"regex",
|
||||
"syn 2.0.90",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4878,13 +5007,26 @@ dependencies = [
|
||||
"once_cell",
|
||||
"petgraph",
|
||||
"prettyplease",
|
||||
"prost",
|
||||
"prost-types",
|
||||
"prost 0.13.3",
|
||||
"prost-types 0.13.3",
|
||||
"regex",
|
||||
"syn 2.0.90",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-derive"
|
||||
version = "0.12.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools 0.10.5",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-derive"
|
||||
version = "0.13.3"
|
||||
@@ -4898,38 +5040,22 @@ dependencies = [
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-types"
|
||||
version = "0.12.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0"
|
||||
dependencies = [
|
||||
"prost 0.12.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-types"
|
||||
version = "0.13.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670"
|
||||
dependencies = [
|
||||
"prost",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf"
|
||||
version = "2.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-codegen"
|
||||
version = "2.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6"
|
||||
dependencies = [
|
||||
"protobuf",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-codegen-pure"
|
||||
version = "2.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95a29399fc94bcd3eeaa951c715f7bea69409b2445356b00519740bcd6ddd865"
|
||||
dependencies = [
|
||||
"protobuf",
|
||||
"protobuf-codegen",
|
||||
"prost 0.13.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5010,7 +5136,7 @@ dependencies = [
|
||||
"reqwest-tracing",
|
||||
"rsa",
|
||||
"rstest",
|
||||
"rustc-hash",
|
||||
"rustc-hash 1.1.0",
|
||||
"rustls 0.23.18",
|
||||
"rustls-native-certs 0.8.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
@@ -5050,6 +5176,21 @@ dependencies = [
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quanta"
|
||||
version = "0.12.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"raw-cpuid",
|
||||
"wasi 0.11.0+wasi-snapshot-preview1",
|
||||
"web-sys",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.26.0"
|
||||
@@ -5180,6 +5321,15 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "raw-cpuid"
|
||||
version = "11.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e"
|
||||
dependencies = [
|
||||
"bitflags 2.8.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.7.0"
|
||||
@@ -5514,16 +5664,16 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.6"
|
||||
version = "0.17.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "684d5e6e18f669ccebf64a92236bb7db9a34f07be010e3627368182027180866"
|
||||
checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"getrandom 0.2.11",
|
||||
"libc",
|
||||
"spin",
|
||||
"untrusted",
|
||||
"windows-sys 0.48.0",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5628,6 +5778,12 @@ version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.4.0"
|
||||
@@ -5744,7 +5900,7 @@ version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5753,15 +5909,15 @@ version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-pki-types"
|
||||
version = "1.10.0"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b"
|
||||
checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c"
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
@@ -5992,9 +6148,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "seize"
|
||||
version = "0.4.9"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
|
||||
checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys 0.52.0",
|
||||
@@ -6387,6 +6543,15 @@ version = "0.9.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
||||
|
||||
[[package]]
|
||||
name = "spinning_top"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spki"
|
||||
version = "0.6.0"
|
||||
@@ -6438,7 +6603,7 @@ dependencies = [
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"prost",
|
||||
"prost 0.13.3",
|
||||
"rustls 0.23.18",
|
||||
"tokio",
|
||||
"tonic",
|
||||
@@ -6456,6 +6621,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap",
|
||||
"clashmap",
|
||||
"control_plane",
|
||||
"cron",
|
||||
"diesel",
|
||||
@@ -6463,6 +6629,7 @@ dependencies = [
|
||||
"diesel_migrations",
|
||||
"fail",
|
||||
"futures",
|
||||
"governor",
|
||||
"hex",
|
||||
"http-utils",
|
||||
"humantime",
|
||||
@@ -7209,7 +7376,7 @@ dependencies = [
|
||||
"hyper-util",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost",
|
||||
"prost 0.13.3",
|
||||
"rustls-native-certs 0.8.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"tokio",
|
||||
@@ -7229,8 +7396,8 @@ checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11"
|
||||
dependencies = [
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"prost-build",
|
||||
"prost-types",
|
||||
"prost-build 0.13.3",
|
||||
"prost-types 0.13.3",
|
||||
"quote",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
@@ -7277,10 +7444,12 @@ version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bitflags 2.8.0",
|
||||
"bytes",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
@@ -7294,6 +7463,20 @@ version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
|
||||
|
||||
[[package]]
|
||||
name = "tower-otel"
|
||||
version = "0.2.0"
|
||||
source = "git+https://github.com/mattiapenati/tower-otel?rev=56a7321053bcb72443888257b622ba0d43a11fcd#56a7321053bcb72443888257b622ba0d43a11fcd"
|
||||
dependencies = [
|
||||
"http 1.1.0",
|
||||
"opentelemetry",
|
||||
"pin-project",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower-service"
|
||||
version = "0.3.3"
|
||||
@@ -7620,7 +7803,6 @@ dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"backtrace",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
@@ -7748,7 +7930,7 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"pprof",
|
||||
"prost",
|
||||
"prost 0.13.3",
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -8174,7 +8356,7 @@ dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"base64 0.13.1",
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"base64ct",
|
||||
"bytes",
|
||||
"camino",
|
||||
@@ -8205,6 +8387,7 @@ dependencies = [
|
||||
"hyper-util",
|
||||
"indexmap 1.9.3",
|
||||
"indexmap 2.0.1",
|
||||
"itertools 0.10.5",
|
||||
"itertools 0.12.1",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
@@ -8223,7 +8406,7 @@ dependencies = [
|
||||
"parquet",
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"prost",
|
||||
"prost 0.13.3",
|
||||
"quote",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
|
||||
19
Cargo.toml
@@ -43,7 +43,7 @@ members = [
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
edition = "2021"
|
||||
edition = "2024"
|
||||
license = "Apache-2.0"
|
||||
|
||||
## All dependency versions, used in the project
|
||||
@@ -53,7 +53,6 @@ anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
arc-swap = "1.6"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||
atomic-take = "1.1.0"
|
||||
backtrace = "0.3.74"
|
||||
flate2 = "1.0.26"
|
||||
assert-json-diff = "2"
|
||||
async-stream = "0.3"
|
||||
@@ -68,9 +67,10 @@ aws-credential-types = "1.2.0"
|
||||
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
|
||||
aws-types = "1.3"
|
||||
axum = { version = "0.8.1", features = ["ws"] }
|
||||
axum-extra = { version = "0.10.0", features = ["typed-header"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.70"
|
||||
bindgen = "0.71"
|
||||
bit_field = "0.10.2"
|
||||
bstr = "1.0"
|
||||
byteorder = "1.4"
|
||||
@@ -95,6 +95,7 @@ futures = "0.3"
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
git-version = "0.3"
|
||||
governor = "0.8"
|
||||
hashbrown = "0.14"
|
||||
hashlink = "0.9.1"
|
||||
hdrhistogram = "7.5.2"
|
||||
@@ -113,11 +114,10 @@ hyper-util = "0.1"
|
||||
tokio-tungstenite = "0.21.0"
|
||||
indexmap = "2"
|
||||
indoc = "2"
|
||||
inferno = "0.12.0"
|
||||
ipnet = "2.10.0"
|
||||
itertools = "0.10"
|
||||
itoa = "1.0.11"
|
||||
jemalloc_pprof = "0.6"
|
||||
jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
|
||||
jsonwebtoken = "9"
|
||||
lasso = "0.7"
|
||||
libc = "0.2"
|
||||
@@ -139,7 +139,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "protobuf", "protobuf-codec"] }
|
||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
|
||||
procfs = "0.16"
|
||||
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.13"
|
||||
@@ -155,6 +155,7 @@ rpds = "0.13"
|
||||
rustc-hash = "1.1.0"
|
||||
rustls = { version = "0.23.16", default-features = false }
|
||||
rustls-pemfile = "2"
|
||||
rustls-pki-types = "1.11"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
sd-notify = "0.4.1"
|
||||
@@ -192,7 +193,11 @@ toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
|
||||
tower = { version = "0.5.2", default-features = false }
|
||||
tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
|
||||
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
|
||||
|
||||
# This revision uses opentelemetry 0.27. There's no tag for it.
|
||||
tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" }
|
||||
|
||||
tower-service = "0.3.3"
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2"
|
||||
|
||||
7
Makefile
@@ -11,15 +11,16 @@ ICU_PREFIX_DIR := /usr/local/icu
|
||||
#
|
||||
BUILD_TYPE ?= debug
|
||||
WITH_SANITIZERS ?= no
|
||||
PG_CFLAGS = -fsigned-char
|
||||
ifeq ($(BUILD_TYPE),release)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
|
||||
PG_CFLAGS = -O2 -g3 $(CFLAGS)
|
||||
PG_CFLAGS += -O2 -g3 $(CFLAGS)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
|
||||
PG_CFLAGS = -O0 -g3 $(CFLAGS)
|
||||
PG_CFLAGS += -O0 -g3 $(CFLAGS)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
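A quick sketch of what the `+=` change buys: both build types now keep the base -fsigned-char flag instead of overwriting it (values shown are the ones this Makefile assembles; $(CFLAGS) comes from the caller's environment):

    make BUILD_TYPE=release   # PG_CFLAGS becomes: -fsigned-char -O2 -g3 $(CFLAGS)
    make BUILD_TYPE=debug     # PG_CFLAGS becomes: -fsigned-char -O0 -g3 $(CFLAGS)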
@@ -159,6 +160,8 @@ postgres-%: postgres-configure-% \
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install
|
||||
+@echo "Compiling pageinspect $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
|
||||
+@echo "Compiling pg_trgm $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install
|
||||
+@echo "Compiling amcheck $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
|
||||
+@echo "Compiling test_decoding $*"
|
||||
|
||||
@@ -162,7 +162,7 @@ FROM build-deps AS pg-build
|
||||
ARG PG_VERSION
|
||||
COPY vendor/postgres-${PG_VERSION:?} postgres
|
||||
RUN cd postgres && \
|
||||
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
|
||||
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \
|
||||
--with-icu --with-libxml --with-libxslt --with-lz4" && \
|
||||
if [ "${PG_VERSION:?}" != "v14" ]; then \
|
||||
# zstd is available only from PG15
|
||||
@@ -1484,7 +1484,7 @@ WORKDIR /ext-src
|
||||
COPY compute/patches/pg_duckdb_v031.patch .
|
||||
COPY compute/patches/duckdb_v120.patch .
|
||||
# pg_duckdb build requires source dir to be a git repo to get submodules
|
||||
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
|
||||
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
|
||||
# - extension management function duckdb.install_extension()
|
||||
# - access to duckdb.extensions table and its sequence
|
||||
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
|
||||
@@ -1499,8 +1499,8 @@ ARG PG_VERSION
|
||||
COPY --from=pg_duckdb-src /ext-src/ /ext-src/
|
||||
WORKDIR /ext-src/pg_duckdb-src
|
||||
RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
|
||||
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_repack"
|
||||
@@ -1758,15 +1758,15 @@ ARG TARGETARCH
|
||||
# test_runner/regress/test_compute_metrics.py
|
||||
# See the comment at the top of the file regarding `echo`, `-e` and `\n`
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then\
|
||||
postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\
|
||||
postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
|
||||
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
|
||||
sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
|
||||
else\
|
||||
postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\
|
||||
postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
|
||||
pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
|
||||
sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\
|
||||
fi\
|
||||
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\
|
||||
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
|
||||
| tar xzf - --strip-components=1 -C.\
|
||||
&& curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
|
||||
| tar xzf - --strip-components=1 -C.\
|
||||
@@ -1933,6 +1933,7 @@ RUN apt update && \
|
||||
locales \
|
||||
procps \
|
||||
ca-certificates \
|
||||
rsyslog \
|
||||
$VERSION_INSTALLS && \
|
||||
apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
@@ -1978,6 +1979,13 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo
|
||||
# Make the libraries we built available
|
||||
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||
|
||||
# rsyslog config permissions
|
||||
# directory for rsyslogd pid file
|
||||
RUN mkdir /var/run/rsyslogd && \
|
||||
chown -R postgres:postgres /var/run/rsyslogd && \
|
||||
chown -R postgres:postgres /etc/rsyslog.d/
|
||||
|
||||
|
||||
ENV LANG=en_US.utf8
|
||||
USER postgres
|
||||
ENTRYPOINT ["/usr/local/bin/compute_ctl"]
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
import 'sql_exporter/lfc_approximate_working_set_size.libsonnet',
|
||||
import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet',
|
||||
import 'sql_exporter/lfc_cache_size_limit.libsonnet',
|
||||
import 'sql_exporter/lfc_chunk_size.libsonnet',
|
||||
import 'sql_exporter/lfc_hits.libsonnet',
|
||||
import 'sql_exporter/lfc_misses.libsonnet',
|
||||
import 'sql_exporter/lfc_used.libsonnet',
|
||||
|
||||
@@ -1 +1,5 @@
|
||||
SELECT sum(pg_database_size(datname)) AS total FROM pg_database;
|
||||
SELECT sum(pg_database_size(datname)) AS total
|
||||
FROM pg_database
|
||||
-- Ignore invalid databases, as we will likely have problems with
|
||||
-- getting their size from the Pageserver.
|
||||
WHERE datconnlimit != -2;
|
||||
|
||||
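As an aside on the `datconnlimit != -2` filter: recent PostgreSQL versions mark a database invalid (for example after an interrupted DROP DATABASE) by setting datconnlimit to -2, so the databases the exporter now skips can be listed with a query along these lines:

    -- List databases currently flagged as invalid (and therefore excluded from the size metric)
    SELECT datname FROM pg_database WHERE datconnlimit = -2;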
10
compute/etc/sql_exporter/lfc_chunk_size.libsonnet
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'lfc_chunk_size',
|
||||
type: 'gauge',
|
||||
help: 'LFC chunk size, measured in 8KiB pages',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'lfc_chunk_size_pages',
|
||||
],
|
||||
query: importstr 'sql_exporter/lfc_chunk_size.sql',
|
||||
}
|
||||
1
compute/etc/sql_exporter/lfc_chunk_size.sql
Normal file
@@ -0,0 +1 @@
|
||||
SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages';
|
||||
@@ -1,10 +1,20 @@
|
||||
-- We export stats for 10 non-system databases. Without this limit it is too
|
||||
-- easy to abuse the system by creating lots of databases.
|
||||
|
||||
SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted,
|
||||
tup_updated AS updated, tup_deleted AS deleted, datname
|
||||
SELECT pg_database_size(datname) AS db_size,
|
||||
deadlocks,
|
||||
tup_inserted AS inserted,
|
||||
tup_updated AS updated,
|
||||
tup_deleted AS deleted,
|
||||
datname
|
||||
FROM pg_stat_database
|
||||
WHERE datname IN (
|
||||
SELECT datname FROM pg_database
|
||||
WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10
|
||||
-- Ignore invalid databases, as we will likely have problems with
|
||||
-- getting their size from the Pageserver.
|
||||
WHERE datconnlimit != -2
|
||||
AND datname <> 'postgres'
|
||||
AND NOT datistemplate
|
||||
ORDER BY oid
|
||||
LIMIT 10
|
||||
);
|
||||
|
||||
@@ -39,17 +39,26 @@ commands:
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
|
||||
- name: rsyslogd
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
- filename: compute_ctl-sudoers
|
||||
content: |
|
||||
# Reverse hostname lookup doesn't currently work, and isn't needed anyway when all
|
||||
# the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to
|
||||
# resolve host" log messages that they generate.
|
||||
Defaults !fqdn
|
||||
|
||||
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
|
||||
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
|
||||
# regardless of hostname (ALL)
|
||||
#
|
||||
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
|
||||
- filename: cgconfig.conf
|
||||
content: |
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
@@ -64,6 +73,12 @@ files:
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
|
||||
# compute_ctl will rewrite this file with the actual configuration, if needed.
|
||||
- filename: compute_rsyslog.conf
|
||||
content: |
|
||||
*.* /dev/null
|
||||
$IncludeConfig /etc/rsyslog.d/*.conf
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
@@ -127,6 +142,12 @@ merge: |
|
||||
RUN set -e \
|
||||
&& chmod 0644 /etc/cgconfig.conf
|
||||
|
||||
|
||||
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /var/log/
|
||||
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
|
||||
@@ -39,17 +39,26 @@ commands:
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
|
||||
- name: rsyslogd
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
- filename: compute_ctl-sudoers
|
||||
content: |
|
||||
# Reverse hostname lookup doesn't currently work, and isn't needed anyway when all
|
||||
# the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to
|
||||
# resolve host" log messages that they generate.
|
||||
Defaults !fqdn
|
||||
|
||||
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
|
||||
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
|
||||
# regardless of hostname (ALL)
|
||||
#
|
||||
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
|
||||
- filename: cgconfig.conf
|
||||
content: |
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
@@ -64,6 +73,12 @@ files:
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
|
||||
# compute_ctl will rewrite this file with the actual configuration, if needed.
|
||||
- filename: compute_rsyslog.conf
|
||||
content: |
|
||||
*.* /dev/null
|
||||
$IncludeConfig /etc/rsyslog.d/*.conf
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
@@ -123,6 +138,11 @@ merge: |
|
||||
RUN set -e \
|
||||
&& chmod 0644 /etc/cgconfig.conf
|
||||
|
||||
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /var/log/
|
||||
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "compute_tools"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
edition = "2024"
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
@@ -17,6 +17,7 @@ aws-sdk-kms.workspace = true
|
||||
aws-smithy-types.workspace = true
|
||||
anyhow.workspace = true
|
||||
axum = { workspace = true, features = [] }
|
||||
axum-extra.workspace = true
|
||||
camino.workspace = true
|
||||
chrono.workspace = true
|
||||
cfg-if.workspace = true
|
||||
@@ -25,6 +26,7 @@ fail.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
http.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
metrics.workspace = true
|
||||
nix.workspace = true
|
||||
notify.workspace = true
|
||||
@@ -46,7 +48,9 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tower-otel.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -33,41 +33,28 @@
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r http://pg-ext-s3-gateway \
|
||||
//! ```
|
||||
use std::collections::HashMap;
|
||||
use std::ffi::OsString;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
|
||||
use std::{thread, time::Duration};
|
||||
use std::sync::mpsc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Parser;
|
||||
use compute_tools::disk_quota::set_disk_quota;
|
||||
use compute_tools::http::server::Server;
|
||||
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info, warn};
|
||||
use url::Url;
|
||||
|
||||
use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
|
||||
use compute_api::responses::ComputeCtlConfig;
|
||||
use compute_api::spec::ComputeSpec;
|
||||
|
||||
use compute_tools::compute::{
|
||||
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
|
||||
};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal};
|
||||
use compute_tools::extension_server::get_pg_version_string;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
use compute_tools::swap::resize_swap;
|
||||
use rlimit::{setrlimit, Resource};
|
||||
use rlimit::{Resource, setrlimit};
|
||||
use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
|
||||
use signal_hook::iterator::Signals;
|
||||
use tracing::{error, info};
|
||||
use url::Url;
|
||||
use utils::failpoint_support;
|
||||
|
||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||
@@ -149,6 +136,8 @@ struct Cli {
|
||||
fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
// For historical reasons, the main thread that processes the spec and launches postgres
|
||||
// is synchronous, but we always have this tokio runtime available and we "enter" it so
|
||||
// that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
|
||||
@@ -160,34 +149,44 @@ fn main() -> Result<()> {
|
||||
|
||||
let build_tag = runtime.block_on(init())?;
|
||||
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
// enable core dumping for all child processes
|
||||
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
|
||||
|
||||
let (pg_handle, start_pg_result) = {
|
||||
// Enter startup tracing context
|
||||
let _startup_context_guard = startup_context_from_env();
|
||||
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
|
||||
|
||||
let cli_spec = try_spec_from_cli(&cli)?;
|
||||
let cli_spec = try_spec_from_cli(&cli)?;
|
||||
|
||||
let compute = wait_spec(build_tag, &cli, cli_spec)?;
|
||||
let compute_node = ComputeNode::new(
|
||||
ComputeNodeParams {
|
||||
compute_id: cli.compute_id,
|
||||
connstr,
|
||||
pgdata: cli.pgdata.clone(),
|
||||
pgbin: cli.pgbin.clone(),
|
||||
pgversion: get_pg_version_string(&cli.pgbin),
|
||||
external_http_port: cli.external_http_port,
|
||||
internal_http_port: cli.internal_http_port,
|
||||
ext_remote_storage: cli.remote_ext_config.clone(),
|
||||
resize_swap_on_bind: cli.resize_swap_on_bind,
|
||||
set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
|
||||
#[cfg(target_os = "linux")]
|
||||
filecache_connstr: cli.filecache_connstr,
|
||||
#[cfg(target_os = "linux")]
|
||||
cgroup: cli.cgroup,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor_addr: cli.vm_monitor_addr,
|
||||
build_tag,
|
||||
|
||||
start_postgres(&cli, compute)?
|
||||
live_config_allowed: cli_spec.live_config_allowed,
|
||||
},
|
||||
cli_spec.spec,
|
||||
cli_spec.compute_ctl_config,
|
||||
)?;
|
||||
|
||||
// Startup is finished, exit the startup tracing span
|
||||
};
|
||||
|
||||
// PostgreSQL is now running, if startup was successful. Wait until it exits.
|
||||
let wait_pg_result = wait_postgres(pg_handle)?;
|
||||
|
||||
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
|
||||
|
||||
maybe_delay_exit(delay_exit);
|
||||
let exit_code = compute_node.run()?;
|
||||
|
||||
scenario.teardown();
|
||||
|
||||
deinit_and_exit(wait_pg_result);
|
||||
deinit_and_exit(exit_code);
|
||||
}
|
||||
|
||||
async fn init() -> Result<String> {
|
||||
@@ -208,56 +207,6 @@ async fn init() -> Result<String> {
|
||||
Ok(build_tag)
|
||||
}
|
||||
|
||||
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||
// Extract OpenTelemetry context for the startup actions from the
|
||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||
// tracing context.
|
||||
//
|
||||
// This is used to propagate the context for the 'start_compute' operation
|
||||
// from the neon control plane. This allows linking together the wider
|
||||
// 'start_compute' operation that creates the compute container, with the
|
||||
// startup actions here within the container.
|
||||
//
|
||||
// There is no standard for passing context in env variables, but a lot of
|
||||
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
|
||||
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
|
||||
//
|
||||
// Switch to the startup context here, and exit it once the startup has
|
||||
// completed and Postgres is up and running.
|
||||
//
|
||||
// If this pod is pre-created without binding it to any particular endpoint
|
||||
// yet, this isn't the right place to enter the startup context. In that
|
||||
// case, the control plane should pass the tracing context as part of the
|
||||
// /configure API call.
|
||||
//
|
||||
// NOTE: This is supposed to only cover the *startup* actions. Once
|
||||
// postgres is configured and up-and-running, we exit this span. Any other
|
||||
// actions that are performed on incoming HTTP requests, for example, are
|
||||
// performed in separate spans.
|
||||
//
|
||||
// XXX: If the pod is restarted, we perform the startup actions in the same
|
||||
// context as the original startup actions, which probably doesn't make
|
||||
// sense.
|
||||
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
|
||||
if let Ok(val) = std::env::var("TRACEPARENT") {
|
||||
startup_tracing_carrier.insert("traceparent".to_string(), val);
|
||||
}
|
||||
if let Ok(val) = std::env::var("TRACESTATE") {
|
||||
startup_tracing_carrier.insert("tracestate".to_string(), val);
|
||||
}
|
||||
if !startup_tracing_carrier.is_empty() {
|
||||
use opentelemetry::propagation::TextMapPropagator;
|
||||
use opentelemetry_sdk::propagation::TraceContextPropagator;
|
||||
let guard = TraceContextPropagator::new()
|
||||
.extract(&startup_tracing_carrier)
|
||||
.attach();
|
||||
info!("startup tracing context attached");
|
||||
Some(guard)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
if let Some(ref spec_json) = cli.spec_json {
|
||||
@@ -308,342 +257,7 @@ struct CliSpecParams {
|
||||
live_config_allowed: bool,
|
||||
}
|
||||
|
||||
fn wait_spec(
|
||||
build_tag: String,
|
||||
cli: &Cli,
|
||||
CliSpecParams {
|
||||
spec,
|
||||
live_config_allowed,
|
||||
compute_ctl_config: _,
|
||||
}: CliSpecParams,
|
||||
) -> Result<Arc<ComputeNode>> {
|
||||
let mut new_state = ComputeState::new();
|
||||
let spec_set;
|
||||
|
||||
if let Some(spec) = spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
info!("new pspec.spec: {:?}", pspec.spec);
|
||||
new_state.pspec = Some(pspec);
|
||||
spec_set = true;
|
||||
} else {
|
||||
spec_set = false;
|
||||
}
|
||||
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
|
||||
let conn_conf = postgres::config::Config::from_str(connstr.as_str())
|
||||
.context("cannot build postgres config from connstr")?;
|
||||
let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
|
||||
.context("cannot build tokio postgres config from connstr")?;
|
||||
let compute_node = ComputeNode {
|
||||
compute_id: cli.compute_id.clone(),
|
||||
connstr,
|
||||
conn_conf,
|
||||
tokio_conn_conf,
|
||||
pgdata: cli.pgdata.clone(),
|
||||
pgbin: cli.pgbin.clone(),
|
||||
pgversion: get_pg_version_string(&cli.pgbin),
|
||||
external_http_port: cli.external_http_port,
|
||||
internal_http_port: cli.internal_http_port,
|
||||
live_config_allowed,
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_remote_storage: cli.remote_ext_config.clone(),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
build_tag,
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
// If this is a pooled VM, prewarm before starting HTTP server and becoming
|
||||
// available for binding. Prewarming helps Postgres start quicker later,
|
||||
// because QEMU will already have its memory allocated from the host, and
|
||||
// the necessary binaries will already be cached.
|
||||
if !spec_set {
|
||||
compute.prewarm_postgres()?;
|
||||
}
|
||||
|
||||
// Launch the external HTTP server first, so that we can serve control plane
|
||||
// requests while configuration is still in progress.
|
||||
Server::External(cli.external_http_port).launch(&compute);
|
||||
|
||||
// The internal HTTP server could be launched later, but there isn't much
|
||||
// sense in waiting.
|
||||
Server::Internal(cli.internal_http_port).launch(&compute);
|
||||
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
|
||||
if state.status == ComputeStatus::ConfigurationPending {
|
||||
info!("got spec, continue configuration");
|
||||
// Spec is already set by the http server handler.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Record for how long we slept waiting for the spec.
|
||||
let now = Utc::now();
|
||||
state.metrics.wait_for_spec_ms = now
|
||||
.signed_duration_since(state.start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
|
||||
// Reset start time, so that the total startup time that is calculated later will
|
||||
// not include the time that we waited for the spec.
|
||||
state.start_time = now;
|
||||
}
|
||||
|
||||
launch_lsn_lease_bg_task_for_static(&compute);
|
||||
|
||||
Ok(compute)
|
||||
}
|
||||
|
||||
fn start_postgres(
|
||||
cli: &Cli,
|
||||
compute: Arc<ComputeNode>,
|
||||
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
|
||||
// We got all we need, update the state.
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.set_status(ComputeStatus::Init, &compute.state_changed);
|
||||
|
||||
info!(
|
||||
"running compute with features: {:?}",
|
||||
state.pspec.as_ref().unwrap().spec.features
|
||||
);
|
||||
// before we release the mutex, fetch some parameters for later.
|
||||
let &ComputeSpec {
|
||||
swap_size_bytes,
|
||||
disk_quota_bytes,
|
||||
#[cfg(target_os = "linux")]
|
||||
disable_lfc_resizing,
|
||||
..
|
||||
} = &state.pspec.as_ref().unwrap().spec;
|
||||
drop(state);
|
||||
|
||||
// Launch remaining service threads
|
||||
let _monitor_handle = launch_monitor(&compute);
|
||||
let _configurator_handle = launch_configurator(&compute);
|
||||
|
||||
let mut prestartup_failed = false;
|
||||
let mut delay_exit = false;
|
||||
|
||||
// Resize swap to the desired size if the compute spec says so
|
||||
if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) {
|
||||
// To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
|
||||
// *before* starting postgres.
|
||||
//
|
||||
// In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
|
||||
// carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
|
||||
// OOM-killed during startup because swap wasn't available yet.
|
||||
match resize_swap(size_bytes) {
|
||||
Ok(()) => {
|
||||
let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
|
||||
info!(%size_bytes, %size_mib, "resized swap");
|
||||
}
|
||||
Err(err) => {
|
||||
let err = err.context("failed to resize swap");
|
||||
error!("{err:#}");
|
||||
|
||||
// Mark compute startup as failed; don't try to start postgres, and report this
|
||||
// error to the control plane when it next asks.
|
||||
prestartup_failed = true;
|
||||
compute.set_failed_status(err);
|
||||
delay_exit = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set disk quota if the compute spec says so
|
||||
if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
|
||||
(disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref())
|
||||
{
|
||||
match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) {
|
||||
Ok(()) => {
|
||||
let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
|
||||
info!(%disk_quota_bytes, %size_mib, "set disk quota");
|
||||
}
|
||||
Err(err) => {
|
||||
let err = err.context("failed to set disk quota");
|
||||
error!("{err:#}");
|
||||
|
||||
// Mark compute startup as failed; don't try to start postgres, and report this
|
||||
// error to the control plane when it next asks.
|
||||
prestartup_failed = true;
|
||||
compute.set_failed_status(err);
|
||||
delay_exit = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Start Postgres
|
||||
let mut pg = None;
|
||||
if !prestartup_failed {
|
||||
pg = match compute.start_compute() {
|
||||
Ok(pg) => {
|
||||
info!(postmaster_pid = %pg.0.id(), "Postgres was started");
|
||||
Some(pg)
|
||||
}
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:#}", err);
|
||||
compute.set_failed_status(err);
|
||||
delay_exit = true;
|
||||
None
|
||||
}
|
||||
};
|
||||
} else {
|
||||
warn!("skipping postgres startup because pre-startup step failed");
|
||||
}
|
||||
|
||||
// Start the vm-monitor if directed to. The vm-monitor only runs on linux
|
||||
// because it requires cgroups.
|
||||
cfg_if::cfg_if! {
|
||||
if #[cfg(target_os = "linux")] {
|
||||
use std::env;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
// This token is used internally by the monitor to clean up all threads
|
||||
let token = CancellationToken::new();
|
||||
|
||||
// don't pass postgres connection string to vm-monitor if we don't want it to resize LFC
|
||||
let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
|
||||
None
|
||||
} else {
|
||||
Some(cli.filecache_connstr.clone())
|
||||
};
|
||||
|
||||
let vm_monitor = if env::var_os("AUTOSCALING").is_some() {
|
||||
let vm_monitor = tokio::spawn(vm_monitor::start(
|
||||
Box::leak(Box::new(vm_monitor::Args {
|
||||
cgroup: Some(cli.cgroup.clone()),
|
||||
pgconnstr,
|
||||
addr: cli.vm_monitor_addr.clone(),
|
||||
})),
|
||||
token.clone(),
|
||||
));
|
||||
Some(vm_monitor)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
pg,
|
||||
StartPostgresResult {
|
||||
delay_exit,
|
||||
compute,
|
||||
#[cfg(target_os = "linux")]
|
||||
token,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
type PostgresHandle = (std::process::Child, tokio::task::JoinHandle<Result<()>>);
|
||||
|
||||
struct StartPostgresResult {
|
||||
delay_exit: bool,
|
||||
// passed through from WaitSpecResult
|
||||
compute: Arc<ComputeNode>,
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
token: tokio_util::sync::CancellationToken,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
|
||||
}
|
||||
|
||||
fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
|
||||
// Wait for the child Postgres process forever. In this state Ctrl+C will
|
||||
// propagate to Postgres and it will be shut down as well.
|
||||
let mut exit_code = None;
|
||||
if let Some((mut pg, logs_handle)) = pg {
|
||||
info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
|
||||
|
||||
let ecode = pg
|
||||
.wait()
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
PG_PID.store(0, Ordering::SeqCst);
|
||||
|
||||
// Process has exited. Wait for the log collecting task to finish.
|
||||
let _ = tokio::runtime::Handle::current()
|
||||
.block_on(logs_handle)
|
||||
.map_err(|e| tracing::error!("log task panicked: {:?}", e));
|
||||
|
||||
info!("Postgres exited with code {}, shutting down", ecode);
|
||||
exit_code = ecode.code()
|
||||
}
|
||||
|
||||
Ok(WaitPostgresResult { exit_code })
|
||||
}
|
||||
|
||||
struct WaitPostgresResult {
|
||||
exit_code: Option<i32>,
|
||||
}
|
||||
|
||||
fn cleanup_after_postgres_exit(
|
||||
StartPostgresResult {
|
||||
mut delay_exit,
|
||||
compute,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor,
|
||||
#[cfg(target_os = "linux")]
|
||||
token,
|
||||
}: StartPostgresResult,
|
||||
) -> Result<bool> {
|
||||
// Terminate the vm_monitor so it releases the file watcher on
|
||||
// /sys/fs/cgroup/neon-postgres.
|
||||
// Note: the vm-monitor only runs on linux because it requires cgroups.
|
||||
cfg_if::cfg_if! {
|
||||
if #[cfg(target_os = "linux")] {
|
||||
if let Some(handle) = vm_monitor {
|
||||
// Kills all threads spawned by the monitor
|
||||
token.cancel();
|
||||
// Kills the actual task running the monitor
|
||||
handle.abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Maybe sync safekeepers again, to speed up next startup
|
||||
let compute_state = compute.state.lock().unwrap().clone();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
|
||||
info!("syncing safekeepers on shutdown");
|
||||
let storage_auth_token = pspec.storage_auth_token.clone();
|
||||
let lsn = compute.sync_safekeepers(storage_auth_token)?;
|
||||
info!("synced safekeepers at lsn {lsn}");
|
||||
}
|
||||
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::TerminationPending {
|
||||
state.status = ComputeStatus::Terminated;
|
||||
compute.state_changed.notify_all();
|
||||
// we were asked to terminate gracefully, don't exit to avoid restart
|
||||
delay_exit = true
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if let Err(err) = compute.check_for_core_dumps() {
|
||||
error!("error while checking for core dumps: {err:?}");
|
||||
}
|
||||
|
||||
Ok(delay_exit)
|
||||
}
|
||||
|
||||
fn maybe_delay_exit(delay_exit: bool) {
|
||||
// If launch failed, keep serving HTTP requests for a while, so the cloud
|
||||
// control plane can get the actual error.
|
||||
if delay_exit {
|
||||
info!("giving control plane 30s to collect the error before shutdown");
|
||||
thread::sleep(Duration::from_secs(30));
|
||||
}
|
||||
}
|
||||
|
||||
fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
|
||||
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
|
||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
||||
// pending traces before we exit. Shutting down OTEL tracing provider may
|
||||
// hang for quite some time, see, for example:
|
||||
|
||||
@@ -25,13 +25,13 @@
|
||||
//! docker push localhost:3030/localregistry/compute-node-v14:latest
|
||||
//! ```
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use anyhow::{Context, bail};
|
||||
use aws_config::BehaviorVersion;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::{Parser, Subcommand};
|
||||
use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
|
||||
use compute_tools::extension_server::{PostgresMajorVersion, get_pg_version};
|
||||
use nix::unistd::Pid;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use tracing::{Instrument, error, info, info_span, warn};
|
||||
use utils::fs_ext::is_directory_empty;
|
||||
|
||||
#[path = "fast_import/aws_s3_sync.rs"]
|
||||
@@ -558,7 +558,9 @@ async fn cmd_dumprestore(
|
||||
decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext)
|
||||
.await?
|
||||
} else {
|
||||
bail!("destination connection string must be provided in spec for dump_restore command");
|
||||
bail!(
|
||||
"destination connection string must be provided in spec for dump_restore command"
|
||||
);
|
||||
};
|
||||
|
||||
(source, dest)
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{info, warn};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use super::s3_uri::S3Uri;
|
||||
|
||||
use tracing::{info, warn};
|
||||
|
||||
const MAX_PARALLEL_UPLOADS: usize = 10;
|
||||
|
||||
/// Upload all files from 'local' to 'remote'
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use anyhow::Result;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
/// Struct to hold parsed S3 components
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct S3Uri {
|
||||
|
||||
@@ -1,18 +1,20 @@
|
||||
use std::path::Path;
|
||||
use std::process::Stdio;
|
||||
use std::result::Result;
|
||||
use std::sync::Arc;
|
||||
|
||||
use compute_api::responses::CatalogObjects;
|
||||
use futures::Stream;
|
||||
use postgres::NoTls;
|
||||
use std::{path::Path, process::Stdio, result::Result, sync::Arc};
|
||||
use tokio::{
|
||||
io::{AsyncBufReadExt, BufReader},
|
||||
process::Command,
|
||||
spawn,
|
||||
};
|
||||
use tokio::io::{AsyncBufReadExt, BufReader};
|
||||
use tokio::process::Command;
|
||||
use tokio::spawn;
|
||||
use tokio_stream::{self as stream, StreamExt};
|
||||
use tokio_util::codec::{BytesCodec, FramedRead};
|
||||
use tracing::warn;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgres_conf_for_db};
|
||||
use compute_api::responses::CatalogObjects;
|
||||
|
||||
pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
|
||||
let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles"));
|
||||
@@ -55,15 +57,15 @@ pub enum SchemaDumpError {
|
||||
pub async fn get_database_schema(
|
||||
compute: &Arc<ComputeNode>,
|
||||
dbname: &str,
|
||||
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
|
||||
let pgbin = &compute.pgbin;
|
||||
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>> + use<>, SchemaDumpError> {
|
||||
let pgbin = &compute.params.pgbin;
|
||||
let basepath = Path::new(pgbin).parent().unwrap();
|
||||
let pgdump = basepath.join("pg_dump");
|
||||
|
||||
// Replace the DB in the connection string and disable it to parts.
|
||||
// This is the only option to handle DBs with special characters.
|
||||
let conf =
|
||||
postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?;
|
||||
let conf = postgres_conf_for_db(&compute.params.connstr, dbname)
|
||||
.map_err(|_| SchemaDumpError::Unexpected)?;
|
||||
let host = conf
|
||||
.get_hosts()
|
||||
.first()
|
||||
|
||||
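As an aside on the `+ use<>` in the new return type of get_database_schema above: this is Rust's precise-capturing syntax, declaring that the returned `impl Stream` captures no lifetimes from the function's arguments, so the stream is not tied to the borrowed `compute` reference. A minimal, self-contained illustration of the same syntax, unrelated to this codebase:

// In the 2024 edition, `impl Trait` return types capture in-scope lifetimes by
// default; `use<>` opts out, so the returned iterator borrows nothing.
fn evens() -> impl Iterator<Item = u32> + use<> {
    (0u32..10).filter(|n| n % 2 == 0)
}

fn main() {
    assert_eq!(evens().count(), 5);
}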
@@ -1,4 +1,4 @@
|
||||
use anyhow::{anyhow, Ok, Result};
|
||||
use anyhow::{Ok, Result, anyhow};
|
||||
use tokio_postgres::NoTls;
|
||||
use tracing::{error, instrument, warn};
|
||||
|
||||
|
||||
File diff suppressed because it is too large
@@ -1,13 +1,16 @@
|
||||
use anyhow::Result;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::io::prelude::*;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Result;
|
||||
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
|
||||
|
||||
use crate::pg_helpers::escape_conf_value;
|
||||
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
|
||||
use crate::pg_helpers::{
|
||||
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
|
||||
};
|
||||
|
||||
/// Check that `line` is inside a text file and put it there if it is not.
|
||||
/// Create file if it doesn't exist.
|
||||
@@ -56,10 +59,20 @@ pub fn write_postgres_conf(
|
||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||
}
|
||||
if !spec.safekeeper_connstrings.is_empty() {
|
||||
let mut neon_safekeepers_value = String::new();
|
||||
tracing::info!(
|
||||
"safekeepers_connstrings is not zero, gen: {:?}",
|
||||
spec.safekeepers_generation
|
||||
);
|
||||
// If generation is given, prepend sk list with g#number:
|
||||
if let Some(generation) = spec.safekeepers_generation {
|
||||
write!(neon_safekeepers_value, "g#{}:", generation)?;
|
||||
}
|
||||
neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(","));
|
||||
writeln!(
|
||||
file,
|
||||
"neon.safekeepers={}",
|
||||
escape_conf_value(&spec.safekeeper_connstrings.join(","))
|
||||
escape_conf_value(&neon_safekeepers_value)
|
||||
)?;
|
||||
}
|
||||
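For illustration, with this change a spec that carries safekeepers_generation Some(3) and three safekeeper connstrings would render a postgresql.conf line like the following (hostnames invented here, and assuming escape_conf_value wraps the value in single quotes):

neon.safekeepers='g#3:sk-1.local:5454,sk-2.local:5454,sk-3.local:5454'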
if let Some(s) = &spec.tenant_id {
|
||||
@@ -127,6 +140,54 @@ pub fn write_postgres_conf(
|
||||
writeln!(file, "# Managed by compute_ctl: end")?;
|
||||
}
|
||||
|
||||
// If audit logging is enabled, configure pgaudit.
|
||||
//
|
||||
// Note, that this is called after the settings from spec are written.
|
||||
// This way we always override the settings from the spec
|
||||
// and don't allow the user or the control plane admin to change them.
|
||||
if let ComputeAudit::Hipaa = spec.audit_log_level {
|
||||
writeln!(file, "# Managed by compute_ctl audit settings: begin")?;
|
||||
// This log level is very verbose
|
||||
// but this is necessary for HIPAA compliance.
|
||||
writeln!(file, "pgaudit.log='all'")?;
|
||||
writeln!(file, "pgaudit.log_parameter=on")?;
|
||||
// Disable logging of catalog queries
|
||||
// The catalog doesn't contain sensitive data, so we don't need to audit it.
|
||||
writeln!(file, "pgaudit.log_catalog=off")?;
|
||||
// Set log rotation to 5 minutes
|
||||
// TODO: tune this after performance testing
|
||||
writeln!(file, "pgaudit.log_rotation_age=5")?;
|
||||
|
||||
// Add audit shared_preload_libraries, if they are not present.
|
||||
//
|
||||
// The caller who sets the flag is responsible for ensuring that the necessary
|
||||
// shared_preload_libraries are present in the compute image,
|
||||
// otherwise the compute start will fail.
|
||||
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
|
||||
let mut extra_shared_preload_libraries = String::new();
|
||||
if !libs.contains("pgaudit") {
|
||||
extra_shared_preload_libraries.push_str(",pgaudit");
|
||||
}
|
||||
if !libs.contains("pgauditlogtofile") {
|
||||
extra_shared_preload_libraries.push_str(",pgauditlogtofile");
|
||||
}
|
||||
writeln!(
|
||||
file,
|
||||
"shared_preload_libraries='{}{}'",
|
||||
libs, extra_shared_preload_libraries
|
||||
)?;
|
||||
} else {
|
||||
// Typically, this should be unreachable,
|
||||
// because we always set at least some shared_preload_libraries in the spec
|
||||
// but let's handle it explicitly anyway.
|
||||
writeln!(
|
||||
file,
|
||||
"shared_preload_libraries='neon,pgaudit,pgauditlogtofile'"
|
||||
)?;
|
||||
}
|
||||
writeln!(file, "# Managed by compute_ctl audit settings: end")?;
|
||||
}
|
||||
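Putting the audit branch together: for a spec with audit_log_level = Hipaa whose settings already contain, say, shared_preload_libraries = 'neon,pg_stat_statements' (an assumed value), the block appended to postgresql.conf would read:

# Managed by compute_ctl audit settings: begin
pgaudit.log='all'
pgaudit.log_parameter=on
pgaudit.log_catalog=off
pgaudit.log_rotation_age=5
shared_preload_libraries='neon,pg_stat_statements,pgaudit,pgauditlogtofile'
# Managed by compute_ctl audit settings: end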
|
||||
writeln!(file, "neon.extension_server_port={}", extension_server_port)?;
|
||||
|
||||
if spec.drop_subscriptions_before_start {
|
||||
|
||||
@@ -0,0 +1,10 @@
# Load imfile module to read log files
module(load="imfile")

# Input configuration for log files in the specified directory
# Replace {log_directory} with the directory containing the log files
input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
global(workDirectory="/var/log")

# Forward logs to remote syslog server
*.* @@{remote_endpoint}
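As a concrete (hypothetical) rendering of this template, with {log_directory} = /var/log/postgres, {tag} = postgres-audit and {remote_endpoint} = syslog.example.com:514, the substituted directives would be:

module(load="imfile")
input(type="imfile" File="/var/log/postgres/*.log" Tag="postgres-audit" Severity="info" Facility="local0")
global(workDirectory="/var/log")
*.* @@syslog.example.com:514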
@@ -1,9 +1,8 @@
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use tracing::{error, info, instrument};
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use tracing::{error, info, instrument};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use anyhow::Context;
|
||||
use tracing::instrument;
|
||||
|
||||
pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota";
|
||||
|
||||
/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
|
||||
/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set.
|
||||
#[instrument]
|
||||
pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> {
|
||||
let size_kb = size_bytes / 1024;
|
||||
// run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`
|
||||
|
||||
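In other words, for a 1 GiB quota (1073741824 bytes) on a hypothetical mountpoint the helper ends up invoking:

/neonvm/bin/set-disk-quota 1048576 /var/db/postgres/compute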
@@ -71,15 +71,15 @@ More specifically, here is an example ext_index.json
|
||||
}
|
||||
}
|
||||
*/
|
||||
use anyhow::Result;
|
||||
use anyhow::{bail, Context};
|
||||
use std::path::Path;
|
||||
use std::str;
|
||||
|
||||
use anyhow::{Context, Result, bail};
|
||||
use bytes::Bytes;
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
use regex::Regex;
|
||||
use remote_storage::*;
|
||||
use reqwest::StatusCode;
|
||||
use std::path::Path;
|
||||
use std::str;
|
||||
use tar::Archive;
|
||||
use tracing::info;
|
||||
use tracing::log::warn;
|
||||
@@ -202,8 +202,24 @@ pub async fn download_extension(
|
||||
// move contents of the libdir / sharedir in unzipped archive to the correct local paths
|
||||
for paths in [sharedir_paths, libdir_paths] {
|
||||
let (zip_dir, real_dir) = paths;
|
||||
|
||||
let dir = match std::fs::read_dir(&zip_dir) {
|
||||
Ok(dir) => dir,
|
||||
Err(e) => match e.kind() {
|
||||
// In the event of a SQL-only extension, there would be nothing
|
||||
// to move from the lib/ directory, so note that in the log and
|
||||
// move on.
|
||||
std::io::ErrorKind::NotFound => {
|
||||
info!("nothing to move from {}", zip_dir);
|
||||
continue;
|
||||
}
|
||||
_ => return Err(anyhow::anyhow!(e)),
|
||||
},
|
||||
};
|
||||
|
||||
info!("mv {zip_dir:?}/* {real_dir:?}");
|
||||
for file in std::fs::read_dir(zip_dir)? {
|
||||
|
||||
for file in dir {
|
||||
let old_file = file?.path();
|
||||
let new_file =
|
||||
Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
|
||||
@@ -244,33 +260,40 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
|
||||
info!("writing file {:?}{:?}", control_path, control_content);
|
||||
std::fs::write(control_path, control_content).unwrap();
|
||||
} else {
|
||||
warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path);
|
||||
warn!(
|
||||
"control file {:?} exists both locally and remotely. ignoring the remote version.",
|
||||
control_path
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Do request to extension storage proxy, i.e.
|
||||
// Do request to extension storage proxy, e.g.,
|
||||
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
|
||||
// using HHTP GET
|
||||
// and return the response body as bytes
|
||||
//
|
||||
// using HTTP GET and return the response body as bytes.
|
||||
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
|
||||
let uri = format!("{}/{}", ext_remote_storage, ext_path);
|
||||
let filename = Path::new(ext_path)
|
||||
.file_name()
|
||||
.unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
|
||||
.to_str()
|
||||
.unwrap_or("unknown")
|
||||
.to_string();
|
||||
|
||||
info!("Download extension {} from uri {}", ext_path, uri);
|
||||
info!("Downloading extension file '{}' from uri {}", filename, uri);
|
||||
|
||||
match do_extension_server_request(&uri).await {
|
||||
Ok(resp) => {
|
||||
info!("Successfully downloaded remote extension data {}", ext_path);
|
||||
REMOTE_EXT_REQUESTS_TOTAL
|
||||
.with_label_values(&[&StatusCode::OK.to_string()])
|
||||
.with_label_values(&[&StatusCode::OK.to_string(), &filename])
|
||||
.inc();
|
||||
Ok(resp)
|
||||
}
|
||||
Err((msg, status)) => {
|
||||
REMOTE_EXT_REQUESTS_TOTAL
|
||||
.with_label_values(&[&status])
|
||||
.with_label_values(&[&status, &filename])
|
||||
.inc();
|
||||
bail!(msg);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use axum::extract::{rejection::JsonRejection, FromRequest, Request};
|
||||
use axum::extract::rejection::JsonRejection;
|
||||
use axum::extract::{FromRequest, Request};
|
||||
use compute_api::responses::GenericAPIError;
|
||||
use http::StatusCode;
|
||||
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
pub(crate) mod json;
|
||||
pub(crate) mod path;
|
||||
pub(crate) mod query;
|
||||
pub(crate) mod request_id;
|
||||
|
||||
pub(crate) use json::Json;
|
||||
pub(crate) use path::Path;
|
||||
pub(crate) use query::Query;
|
||||
pub(crate) use request_id::RequestId;
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use axum::extract::{rejection::PathRejection, FromRequestParts};
|
||||
use axum::extract::FromRequestParts;
|
||||
use axum::extract::rejection::PathRejection;
|
||||
use compute_api::responses::GenericAPIError;
|
||||
use http::{request::Parts, StatusCode};
|
||||
use http::StatusCode;
|
||||
use http::request::Parts;
|
||||
|
||||
/// Custom `Path` extractor, so that we can format errors into
|
||||
/// `JsonResponse<GenericAPIError>`.
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use axum::extract::{rejection::QueryRejection, FromRequestParts};
|
||||
use axum::extract::FromRequestParts;
|
||||
use axum::extract::rejection::QueryRejection;
|
||||
use compute_api::responses::GenericAPIError;
|
||||
use http::{request::Parts, StatusCode};
|
||||
use http::StatusCode;
|
||||
use http::request::Parts;
|
||||
|
||||
/// Custom `Query` extractor, so that we can format errors into
|
||||
/// `JsonResponse<GenericAPIError>`.
|
||||
|
||||
86
compute_tools/src/http/extract/request_id.rs
Normal file
@@ -0,0 +1,86 @@
|
||||
use std::{
|
||||
fmt::Display,
|
||||
ops::{Deref, DerefMut},
|
||||
};
|
||||
|
||||
use axum::{extract::FromRequestParts, response::IntoResponse};
|
||||
use http::{StatusCode, request::Parts};
|
||||
|
||||
use crate::http::{JsonResponse, headers::X_REQUEST_ID};
|
||||
|
||||
/// Extract the request ID from the `X-Request-Id` header.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub(crate) struct RequestId(pub String);
|
||||
|
||||
#[derive(Debug)]
|
||||
/// Rejection used for [`RequestId`].
|
||||
///
|
||||
/// Contains one variant for each way the [`RequestId`] extractor can
|
||||
/// fail.
|
||||
pub(crate) enum RequestIdRejection {
|
||||
/// The request is missing the header.
|
||||
MissingRequestId,
|
||||
|
||||
/// The value of the header is invalid UTF-8.
|
||||
InvalidUtf8,
|
||||
}
|
||||
|
||||
impl RequestIdRejection {
|
||||
pub fn status(&self) -> StatusCode {
|
||||
match self {
|
||||
RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR,
|
||||
RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn message(&self) -> String {
|
||||
match self {
|
||||
RequestIdRejection::MissingRequestId => "request ID is missing",
|
||||
RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoResponse for RequestIdRejection {
|
||||
fn into_response(self) -> axum::response::Response {
|
||||
JsonResponse::error(self.status(), self.message())
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> FromRequestParts<S> for RequestId
|
||||
where
|
||||
S: Send + Sync,
|
||||
{
|
||||
type Rejection = RequestIdRejection;
|
||||
|
||||
async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
|
||||
match parts.headers.get(X_REQUEST_ID) {
|
||||
Some(value) => match value.to_str() {
|
||||
Ok(request_id) => Ok(Self(request_id.to_string())),
|
||||
Err(_) => Err(RequestIdRejection::InvalidUtf8),
|
||||
},
|
||||
None => Err(RequestIdRejection::MissingRequestId),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for RequestId {
|
||||
type Target = String;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for RequestId {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for RequestId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(&self.0)
|
||||
}
|
||||
}
|
||||
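A minimal sketch of how a handler inside this crate could consume the extractor (the handler name and body are invented for illustration):

use axum::response::IntoResponse;

// Because `RequestId` implements `FromRequestParts`, axum can pass it to any
// handler argument; `Display` lets it be logged or echoed directly.
async fn echo_request_id(request_id: RequestId) -> impl IntoResponse {
    format!("handled request {request_id}")
}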
2
compute_tools/src/http/headers.rs
Normal file
@@ -0,0 +1,2 @@
|
||||
/// Constant for `X-Request-Id` header.
|
||||
pub const X_REQUEST_ID: &str = "x-request-id";
|
||||
145
compute_tools/src/http/middleware/authorize.rs
Normal file
@@ -0,0 +1,145 @@
|
||||
use std::{collections::HashSet, net::SocketAddr};
|
||||
|
||||
use anyhow::{Result, anyhow};
|
||||
use axum::{RequestExt, body::Body, extract::ConnectInfo};
|
||||
use axum_extra::{
|
||||
TypedHeader,
|
||||
headers::{Authorization, authorization::Bearer},
|
||||
};
|
||||
use futures::future::BoxFuture;
|
||||
use http::{Request, Response, StatusCode};
|
||||
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
|
||||
use serde::Deserialize;
|
||||
use tower_http::auth::AsyncAuthorizeRequest;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::http::{JsonResponse, extract::RequestId};
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
pub(in crate::http) struct Claims {
|
||||
compute_id: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(in crate::http) struct Authorize {
|
||||
compute_id: String,
|
||||
jwks: JwkSet,
|
||||
validation: Validation,
|
||||
}
|
||||
|
||||
impl Authorize {
|
||||
pub fn new(compute_id: String, jwks: JwkSet) -> Self {
|
||||
let mut validation = Validation::new(Algorithm::EdDSA);
|
||||
// Nothing is currently required
|
||||
validation.required_spec_claims = HashSet::new();
|
||||
validation.validate_exp = true;
|
||||
// Unused by the control plane
|
||||
validation.validate_aud = false;
|
||||
// Unused by the control plane
|
||||
validation.validate_nbf = false;
|
||||
|
||||
Self {
|
||||
compute_id,
|
||||
jwks,
|
||||
validation,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncAuthorizeRequest<Body> for Authorize {
|
||||
type RequestBody = Body;
|
||||
type ResponseBody = Body;
|
||||
type Future = BoxFuture<'static, Result<Request<Body>, Response<Self::ResponseBody>>>;
|
||||
|
||||
fn authorize(&mut self, mut request: Request<Body>) -> Self::Future {
|
||||
let compute_id = self.compute_id.clone();
|
||||
let jwks = self.jwks.clone();
|
||||
let validation = self.validation.clone();
|
||||
|
||||
Box::pin(async move {
|
||||
let request_id = request.extract_parts::<RequestId>().await.unwrap();
|
||||
|
||||
// TODO: Remove this check after a successful rollout
|
||||
if jwks.keys.is_empty() {
|
||||
warn!(%request_id, "Authorization has not been configured");
|
||||
|
||||
return Ok(request);
|
||||
}
|
||||
|
||||
let connect_info = request
|
||||
.extract_parts::<ConnectInfo<SocketAddr>>()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// In the event the request is coming from the loopback interface,
|
||||
// allow all requests
|
||||
if connect_info.ip().is_loopback() {
|
||||
warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
|
||||
|
||||
return Ok(request);
|
||||
}
|
||||
|
||||
let TypedHeader(Authorization(bearer)) = request
|
||||
.extract_parts::<TypedHeader<Authorization<Bearer>>>()
|
||||
.await
|
||||
.map_err(|_| {
|
||||
JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token")
|
||||
})?;
|
||||
|
||||
let data = match Self::verify(&jwks, bearer.token(), &validation) {
|
||||
Ok(claims) => claims,
|
||||
Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)),
|
||||
};
|
||||
|
||||
if data.claims.compute_id != compute_id {
|
||||
return Err(JsonResponse::error(
|
||||
StatusCode::UNAUTHORIZED,
|
||||
"invalid claims in authorization token",
|
||||
));
|
||||
}
|
||||
|
||||
// Make claims available to any subsequent middleware or request
|
||||
// handlers
|
||||
request.extensions_mut().insert(data.claims);
|
||||
|
||||
Ok(request)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Authorize {
|
||||
/// Verify the token using the JSON Web Key set and return the token data.
|
||||
fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
|
||||
debug_assert!(!jwks.keys.is_empty());
|
||||
|
||||
for jwk in jwks.keys.iter() {
|
||||
let decoding_key = match DecodingKey::from_jwk(jwk) {
|
||||
Ok(key) => key,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Failed to construct decoding key from {}: {}",
|
||||
jwk.common.key_id.as_ref().unwrap(),
|
||||
e
|
||||
);
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
|
||||
Ok(data) => return Ok(data),
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Failed to decode authorization token using {}: {}",
|
||||
jwk.common.key_id.as_ref().unwrap(),
|
||||
e
|
||||
);
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow!("Failed to verify authorization token"))
|
||||
}
|
||||
}
|
||||
1
compute_tools/src/http/middleware/mod.rs
Normal file
@@ -0,0 +1 @@
|
||||
pub(in crate::http) mod authorize;
|
||||
@@ -1,10 +1,14 @@
|
||||
use axum::{body::Body, response::Response};
|
||||
use axum::body::Body;
|
||||
use axum::response::Response;
|
||||
use compute_api::responses::{ComputeStatus, GenericAPIError};
|
||||
use http::{header::CONTENT_TYPE, StatusCode};
|
||||
use http::StatusCode;
|
||||
use http::header::CONTENT_TYPE;
|
||||
use serde::Serialize;
|
||||
use tracing::error;
|
||||
|
||||
mod extract;
|
||||
mod headers;
|
||||
mod middleware;
|
||||
mod routes;
|
||||
pub mod server;
|
||||
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse};
|
||||
use crate::checker::check_writability;
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::http::JsonResponse;
|
||||
|
||||
/// Check that the compute is currently running.
|
||||
pub(in crate::http) async fn is_writable(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
|
||||
@@ -1,18 +1,16 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use compute_api::{
|
||||
requests::ConfigurationRequest,
|
||||
responses::{ComputeStatus, ComputeStatusResponse},
|
||||
};
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
|
||||
use http::StatusCode;
|
||||
use tokio::task;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
compute::{ComputeNode, ParsedSpec},
|
||||
http::{extract::Json, JsonResponse},
|
||||
};
|
||||
use crate::compute::{ComputeNode, ParsedSpec};
|
||||
use crate::http::JsonResponse;
|
||||
use crate::http::extract::Json;
|
||||
|
||||
// Accept spec in JSON format and request compute configuration. If anything
|
||||
// goes wrong after we set the compute status to `ConfigurationPending` and
|
||||
@@ -24,7 +22,7 @@ pub(in crate::http) async fn configure(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
request: Json<ConfigurationRequest>,
|
||||
) -> Response {
|
||||
if !compute.live_config_allowed {
|
||||
if !compute.params.live_config_allowed {
|
||||
return JsonResponse::error(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
"live configuration is not allowed for this compute node".to_string(),
|
||||
@@ -47,13 +45,18 @@ pub(in crate::http) async fn configure(
|
||||
return JsonResponse::invalid_status(state.status);
|
||||
}
|
||||
|
||||
// Pass the tracing span to the main thread that performs the startup,
|
||||
// so that the start_compute operation is considered a child of this
|
||||
// configure request for tracing purposes.
|
||||
state.startup_span = Some(tracing::Span::current());
|
||||
|
||||
state.pspec = Some(pspec);
|
||||
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
|
||||
drop(state);
|
||||
}
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Running. This is
|
||||
// needed to do not block the main pool of workers and be able to serve
|
||||
// needed to not block the main pool of workers and to be able to serve
|
||||
// other requests while some particular request is waiting for compute to
|
||||
// finish configuration.
|
||||
let c = compute.clone();
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{body::Body, extract::State, response::Response};
|
||||
use http::{header::CONTENT_TYPE, StatusCode};
|
||||
use axum::body::Body;
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use http::StatusCode;
|
||||
use http::header::CONTENT_TYPE;
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::{
|
||||
catalog::{get_database_schema, SchemaDumpError},
|
||||
compute::ComputeNode,
|
||||
http::{extract::Query, JsonResponse},
|
||||
};
|
||||
use crate::catalog::{SchemaDumpError, get_database_schema};
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::http::JsonResponse;
|
||||
use crate::http::extract::Query;
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub(in crate::http) struct DatabaseSchemaParams {
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse};
|
||||
use crate::catalog::get_dbs_and_roles;
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::http::JsonResponse;
|
||||
|
||||
/// Get the databases and roles from the compute.
|
||||
pub(in crate::http) async fn get_catalog_objects(
|
||||
|
||||
@@ -1,19 +1,13 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{
|
||||
extract::State,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use axum::extract::State;
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use http::StatusCode;
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::{
|
||||
compute::ComputeNode,
|
||||
http::{
|
||||
extract::{Path, Query},
|
||||
JsonResponse,
|
||||
},
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::http::JsonResponse;
|
||||
use crate::http::extract::{Path, Query};
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub(in crate::http) struct ExtensionServerParams {
|
||||
@@ -24,11 +18,11 @@ pub(in crate::http) struct ExtensionServerParams {
|
||||
/// Download a remote extension.
|
||||
pub(in crate::http) async fn download_extension(
|
||||
Path(filename): Path<String>,
|
||||
params: Query<ExtensionServerParams>,
|
||||
ext_server_params: Query<ExtensionServerParams>,
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
) -> Response {
|
||||
// Don't even try to download extensions if no remote storage is configured
|
||||
if compute.ext_remote_storage.is_none() {
|
||||
if compute.params.ext_remote_storage.is_none() {
|
||||
return JsonResponse::error(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
"remote storage is not configured",
|
||||
@@ -52,9 +46,9 @@ pub(in crate::http) async fn download_extension(
|
||||
|
||||
remote_extensions.get_ext(
|
||||
&filename,
|
||||
params.is_library,
|
||||
&compute.build_tag,
|
||||
&compute.pgversion,
|
||||
ext_server_params.is_library,
|
||||
&compute.params.build_tag,
|
||||
&compute.params.pgversion,
|
||||
)
|
||||
};
|
||||
|
||||
|
||||
@@ -1,16 +1,14 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use compute_api::{
|
||||
requests::ExtensionInstallRequest,
|
||||
responses::{ComputeStatus, ExtensionInstallResponse},
|
||||
};
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use compute_api::requests::ExtensionInstallRequest;
|
||||
use compute_api::responses::{ComputeStatus, ExtensionInstallResponse};
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{
|
||||
compute::ComputeNode,
|
||||
http::{extract::Json, JsonResponse},
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::http::JsonResponse;
|
||||
use crate::http::extract::Json;
|
||||
|
||||
/// Install an extension.
|
||||
pub(in crate::http) async fn install_extension(
|
||||
|
||||
@@ -17,7 +17,8 @@ pub struct FailpointConfig {
|
||||
pub actions: String,
|
||||
}
|
||||
|
||||
use crate::http::{extract::Json, JsonResponse};
|
||||
use crate::http::JsonResponse;
|
||||
use crate::http::extract::Json;
|
||||
|
||||
/// Configure failpoints for testing purposes.
|
||||
pub(in crate::http) async fn configure_failpoints(
|
||||
|
||||
@@ -1,16 +1,14 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use compute_api::{
|
||||
requests::SetRoleGrantsRequest,
|
||||
responses::{ComputeStatus, SetRoleGrantsResponse},
|
||||
};
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use compute_api::requests::SetRoleGrantsRequest;
|
||||
use compute_api::responses::{ComputeStatus, SetRoleGrantsResponse};
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{
|
||||
compute::ComputeNode,
|
||||
http::{extract::Json, JsonResponse},
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::http::JsonResponse;
|
||||
use crate::http::extract::Json;
|
||||
|
||||
/// Add grants for a role.
|
||||
pub(in crate::http) async fn add_grant(
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{compute::ComputeNode, http::JsonResponse};
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::http::JsonResponse;
|
||||
|
||||
/// Collect current Postgres usage insights.
|
||||
pub(in crate::http) async fn get_insights(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
use axum::{body::Body, response::Response};
|
||||
use http::header::CONTENT_TYPE;
|
||||
use axum::body::Body;
|
||||
use axum::response::Response;
|
||||
use http::StatusCode;
|
||||
use http::header::CONTENT_TYPE;
|
||||
use metrics::proto::MetricFamily;
|
||||
use metrics::{Encoder, TextEncoder};
|
||||
|
||||
use crate::{http::JsonResponse, metrics::collect};
|
||||
use crate::http::JsonResponse;
|
||||
use crate::metrics::collect;
|
||||
|
||||
/// Expose Prometheus metrics.
|
||||
pub(in crate::http) async fn get_metrics() -> Response {
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{compute::ComputeNode, http::JsonResponse};
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::http::JsonResponse;
|
||||
|
||||
/// Get startup metrics.
|
||||
pub(in crate::http) async fn get_metrics(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
use std::{ops::Deref, sync::Arc};
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, http::StatusCode, response::Response};
|
||||
use axum::extract::State;
|
||||
use axum::http::StatusCode;
|
||||
use axum::response::Response;
|
||||
use compute_api::responses::ComputeStatusResponse;
|
||||
|
||||
use crate::{compute::ComputeNode, http::JsonResponse};
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::http::JsonResponse;
|
||||
|
||||
/// Retrieve the state of the compute.
|
||||
pub(in crate::http) async fn get_status(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
|
||||
@@ -1,18 +1,14 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{
|
||||
extract::State,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use axum::extract::State;
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use http::StatusCode;
|
||||
use tokio::task;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
compute::{forward_termination_signal, ComputeNode},
|
||||
http::JsonResponse,
|
||||
};
|
||||
use crate::compute::{ComputeNode, forward_termination_signal};
|
||||
use crate::http::JsonResponse;
|
||||
|
||||
/// Terminate the compute.
|
||||
pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
|
||||
@@ -1,60 +1,67 @@
|
||||
use std::{
|
||||
fmt::Display,
|
||||
net::{IpAddr, Ipv6Addr, SocketAddr},
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
use std::fmt::Display;
|
||||
use std::net::{IpAddr, Ipv6Addr, SocketAddr};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Result;
|
||||
use axum::{
|
||||
extract::Request,
|
||||
middleware::{self, Next},
|
||||
response::{IntoResponse, Response},
|
||||
routing::{get, post},
|
||||
Router,
|
||||
};
|
||||
use axum::Router;
|
||||
use axum::extract::Request;
|
||||
use axum::middleware::{self, Next};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::routing::{get, post};
|
||||
use http::StatusCode;
|
||||
use jsonwebtoken::jwk::JwkSet;
|
||||
use tokio::net::TcpListener;
|
||||
use tower::ServiceBuilder;
|
||||
use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer};
|
||||
use tracing::{debug, error, info, Span};
|
||||
use tower_http::{
|
||||
auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer,
|
||||
};
|
||||
use tracing::{Span, error, info};
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
grants, insights, metrics, metrics_json, status, terminate,
|
||||
use super::{
|
||||
headers::X_REQUEST_ID,
|
||||
middleware::authorize::Authorize,
|
||||
routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
grants, insights, metrics, metrics_json, status, terminate,
|
||||
},
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
const X_REQUEST_ID: &str = "x-request-id";
|
||||
|
||||
/// `compute_ctl` has two servers: internal and external. The internal server
|
||||
/// binds to the loopback interface and handles communication from clients on
|
||||
/// the compute. The external server is what receives communication from the
|
||||
/// control plane, the metrics scraper, etc. We make the distinction because
|
||||
/// certain routes in `compute_ctl` only need to be exposed to local processes
|
||||
/// like Postgres via the neon extension and local_proxy.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum Server {
|
||||
Internal(u16),
|
||||
External(u16),
|
||||
Internal {
|
||||
port: u16,
|
||||
},
|
||||
External {
|
||||
port: u16,
|
||||
jwks: JwkSet,
|
||||
compute_id: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl Display for Server {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Server::Internal(_) => f.write_str("internal"),
|
||||
Server::External(_) => f.write_str("external"),
|
||||
Server::Internal { .. } => f.write_str("internal"),
|
||||
Server::External { .. } => f.write_str("external"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Server> for Router<Arc<ComputeNode>> {
|
||||
fn from(server: Server) -> Self {
|
||||
impl From<&Server> for Router<Arc<ComputeNode>> {
|
||||
fn from(server: &Server) -> Self {
|
||||
let mut router = Router::<Arc<ComputeNode>>::new();
|
||||
|
||||
router = match server {
|
||||
Server::Internal(_) => {
|
||||
Server::Internal { .. } => {
|
||||
router = router
|
||||
.route(
|
||||
"/extension_server/{*filename}",
|
||||
@@ -72,58 +79,71 @@ impl From<Server> for Router<Arc<ComputeNode>> {
|
||||
|
||||
router
|
||||
}
|
||||
Server::External(_) => router
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
|
||||
.route("/insights", get(insights::get_insights))
|
||||
.route("/metrics", get(metrics::get_metrics))
|
||||
.route("/metrics.json", get(metrics_json::get_metrics))
|
||||
.route("/status", get(status::get_status))
|
||||
.route("/terminate", post(terminate::terminate)),
|
||||
Server::External {
|
||||
jwks, compute_id, ..
|
||||
} => {
|
||||
let unauthenticated_router =
|
||||
Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
|
||||
|
||||
let authenticated_router = Router::<Arc<ComputeNode>>::new()
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
|
||||
.route("/insights", get(insights::get_insights))
|
||||
.route("/metrics.json", get(metrics_json::get_metrics))
|
||||
.route("/status", get(status::get_status))
|
||||
.route("/terminate", post(terminate::terminate))
|
||||
.layer(AsyncRequireAuthorizationLayer::new(Authorize::new(
|
||||
compute_id.clone(),
|
||||
jwks.clone(),
|
||||
)));
|
||||
|
||||
router
|
||||
.merge(unauthenticated_router)
|
||||
.merge(authenticated_router)
|
||||
}
|
||||
};
|
||||
|
||||
router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
|
||||
ServiceBuilder::new()
|
||||
// Add this middleware since we assume the request ID exists
|
||||
.layer(middleware::from_fn(maybe_add_request_id_header))
|
||||
.layer(
|
||||
TraceLayer::new_for_http()
|
||||
.on_request(|request: &http::Request<_>, _span: &Span| {
|
||||
let request_id = request
|
||||
.headers()
|
||||
.get(X_REQUEST_ID)
|
||||
.unwrap()
|
||||
.to_str()
|
||||
.unwrap();
|
||||
|
||||
match request.uri().path() {
|
||||
"/metrics" => {
|
||||
debug!(%request_id, "{} {}", request.method(), request.uri())
|
||||
}
|
||||
_ => info!(%request_id, "{} {}", request.method(), request.uri()),
|
||||
};
|
||||
})
|
||||
.on_response(
|
||||
|response: &http::Response<_>, latency: Duration, _span: &Span| {
|
||||
let request_id = response
|
||||
router
|
||||
.fallback(Server::handle_404)
|
||||
.method_not_allowed_fallback(Server::handle_405)
|
||||
.layer(
|
||||
ServiceBuilder::new()
|
||||
.layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
|
||||
// Add this middleware since we assume the request ID exists
|
||||
.layer(middleware::from_fn(maybe_add_request_id_header))
|
||||
.layer(
|
||||
TraceLayer::new_for_http()
|
||||
.on_request(|request: &http::Request<_>, _span: &Span| {
|
||||
let request_id = request
|
||||
.headers()
|
||||
.get(X_REQUEST_ID)
|
||||
.unwrap()
|
||||
.to_str()
|
||||
.unwrap();
|
||||
|
||||
info!(
|
||||
%request_id,
|
||||
code = response.status().as_u16(),
|
||||
latency = latency.as_millis()
|
||||
)
|
||||
},
|
||||
),
|
||||
)
|
||||
.layer(PropagateRequestIdLayer::x_request_id()),
|
||||
)
|
||||
info!(%request_id, "{} {}", request.method(), request.uri());
|
||||
})
|
||||
.on_response(
|
||||
|response: &http::Response<_>, latency: Duration, _span: &Span| {
|
||||
let request_id = response
|
||||
.headers()
|
||||
.get(X_REQUEST_ID)
|
||||
.unwrap()
|
||||
.to_str()
|
||||
.unwrap();
|
||||
|
||||
info!(
|
||||
%request_id,
|
||||
code = response.status().as_u16(),
|
||||
latency = latency.as_millis()
|
||||
);
|
||||
},
|
||||
),
|
||||
)
|
||||
.layer(PropagateRequestIdLayer::x_request_id()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,15 +167,15 @@ impl Server {
|
||||
match self {
|
||||
// TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
|
||||
// allow binding to localhost
|
||||
Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
|
||||
Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
|
||||
Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
|
||||
Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
|
||||
}
|
||||
}
|
||||
|
||||
fn port(self) -> u16 {
|
||||
fn port(&self) -> u16 {
|
||||
match self {
|
||||
Server::Internal(port) => port,
|
||||
Server::External(port) => port,
|
||||
Server::Internal { port, .. } => *port,
|
||||
Server::External { port, .. } => *port,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,7 +202,9 @@ impl Server {
|
||||
);
|
||||
}
|
||||
|
||||
let router = Router::from(self).with_state(compute);
|
||||
let router = Router::from(&self)
|
||||
.with_state(compute)
|
||||
.into_make_service_with_connect_info::<SocketAddr>();
|
||||
|
||||
if let Err(e) = axum::serve(listener, router).await {
|
||||
error!("compute_ctl {} HTTP server error: {}", self, e);
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use compute_api::responses::{InstalledExtension, InstalledExtensions};
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::Result;
|
||||
use compute_api::responses::{InstalledExtension, InstalledExtensions};
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::metrics::INSTALLED_EXTENSIONS;
|
||||
|
||||
@@ -21,6 +21,7 @@ mod migration;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
pub mod rsyslog;
|
||||
pub mod spec;
|
||||
mod spec_apply;
|
||||
pub mod swap;
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::collections::HashMap;
|
||||
use tracing::info;
|
||||
use tracing_subscriber::layer::SubscriberExt;
|
||||
use tracing_subscriber::prelude::*;
|
||||
|
||||
@@ -42,3 +44,50 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
|
||||
pub fn inlinify(s: &str) -> String {
|
||||
s.replace('\n', "\u{200B}")
|
||||
}
|
||||
|
||||
pub fn startup_context_from_env() -> Option<opentelemetry::Context> {
|
||||
// Extract OpenTelemetry context for the startup actions from the
|
||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||
// tracing context.
|
||||
//
|
||||
// This is used to propagate the context for the 'start_compute' operation
|
||||
// from the neon control plane. This allows linking together the wider
|
||||
// 'start_compute' operation that creates the compute container, with the
|
||||
// startup actions here within the container.
|
||||
//
|
||||
// There is no standard for passing context in env variables, but a lot of
|
||||
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
|
||||
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
|
||||
//
|
||||
// Switch to the startup context here, and exit it once the startup has
|
||||
// completed and Postgres is up and running.
|
||||
//
|
||||
// If this pod is pre-created without binding it to any particular endpoint
|
||||
// yet, this isn't the right place to enter the startup context. In that
|
||||
// case, the control plane should pass the tracing context as part of the
|
||||
// /configure API call.
|
||||
//
|
||||
// NOTE: This is supposed to only cover the *startup* actions. Once
|
||||
// postgres is configured and up-and-running, we exit this span. Any other
|
||||
// actions that are performed on incoming HTTP requests, for example, are
|
||||
// performed in separate spans.
|
||||
//
|
||||
// XXX: If the pod is restarted, we perform the startup actions in the same
|
||||
// context as the original startup actions, which probably doesn't make
|
||||
// sense.
|
||||
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
|
||||
if let Ok(val) = std::env::var("TRACEPARENT") {
|
||||
startup_tracing_carrier.insert("traceparent".to_string(), val);
|
||||
}
|
||||
if let Ok(val) = std::env::var("TRACESTATE") {
|
||||
startup_tracing_carrier.insert("tracestate".to_string(), val);
|
||||
}
|
||||
if !startup_tracing_carrier.is_empty() {
|
||||
use opentelemetry::propagation::TextMapPropagator;
|
||||
use opentelemetry_sdk::propagation::TraceContextPropagator;
|
||||
info!("got startup tracing context from env variables");
|
||||
Some(TraceContextPropagator::new().extract(&startup_tracing_carrier))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
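// Illustrative usage sketch (not part of this diff): how a caller might attach the
// context returned by `startup_context_from_env()` to a tracing span. It assumes the
// `tracing-opentelemetry` crate is in use; the span name and helper function here are
// hypothetical placeholders, not code taken from this change.
fn enter_startup_span_example() {
    use tracing_opentelemetry::OpenTelemetrySpanExt;

    let startup_span = tracing::info_span!("compute_startup");
    if let Some(ctx) = startup_context_from_env() {
        // Link this span to the control plane's wider 'start_compute' trace.
        startup_span.set_parent(ctx);
    }
    let _entered = startup_span.enter();
    // ... run the startup actions here; drop `_entered` once Postgres is up ...
}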
|
||||
|
||||
@@ -1,17 +1,15 @@
|
||||
use anyhow::bail;
|
||||
use anyhow::Result;
|
||||
use postgres::{NoTls, SimpleQueryMessage};
|
||||
use std::time::SystemTime;
|
||||
use std::{str::FromStr, sync::Arc, thread, time::Duration};
|
||||
use utils::id::TenantId;
|
||||
use utils::id::TimelineId;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use anyhow::{Result, bail};
|
||||
use compute_api::spec::ComputeMode;
|
||||
use postgres::{NoTls, SimpleQueryMessage};
|
||||
use tracing::{info, warn};
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardNumber, TenantShardId},
|
||||
};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use metrics::core::Collector;
|
||||
use metrics::proto::MetricFamily;
|
||||
use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec};
|
||||
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
@@ -54,9 +54,7 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(||
|
||||
register_int_counter_vec!(
|
||||
"compute_ctl_remote_ext_requests_total",
|
||||
"Total number of requests made by compute_ctl to download extensions from S3 proxy by status",
|
||||
// Do not use any labels like extension name yet.
|
||||
// We can add them later if needed.
|
||||
&["http_status"]
|
||||
&["http_status", "filename"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
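// Illustrative sketch (not part of this diff): with the added "filename" label, a
// caller would record a download attempt roughly like this. The function and variable
// names are hypothetical; the real call site is in the extension download path.
fn record_remote_ext_request_example(http_status: &str, filename: &str) {
    REMOTE_EXT_REQUESTS_TOTAL
        .with_label_values(&[http_status, filename])
        .inc();
}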
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
use std::sync::Arc;
|
||||
use std::{thread, time::Duration};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use compute_api::spec::ComputeFeature;
|
||||
use postgres::{Client, NoTls};
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use compute_api::spec::ComputeFeature;
|
||||
|
||||
const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
|
||||
|
||||
@@ -17,7 +18,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
|
||||
// should be handled gracefully.
|
||||
fn watch_compute_activity(compute: &ComputeNode) {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let connstr = compute.connstr.clone();
|
||||
let connstr = compute.params.connstr.clone();
|
||||
let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));
|
||||
|
||||
// During startup and configuration we connect to every Postgres database,
|
||||
|
||||
@@ -9,7 +9,8 @@ use std::process::Child;
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use anyhow::{Result, bail};
|
||||
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
use futures::StreamExt;
|
||||
use ini::Ini;
|
||||
use notify::{RecursiveMode, Watcher};
|
||||
@@ -21,8 +22,6 @@ use tokio_postgres;
|
||||
use tokio_postgres::NoTls;
|
||||
use tracing::{debug, error, info, instrument};
|
||||
|
||||
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
|
||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||
|
||||
/// Escape a string for including it in a SQL literal.
|
||||
@@ -187,15 +186,40 @@ impl DatabaseExt for Database {
|
||||
/// Postgres SQL queries and DATABASE_URL.
|
||||
pub trait Escaping {
|
||||
fn pg_quote(&self) -> String;
|
||||
fn pg_quote_dollar(&self) -> (String, String);
|
||||
}
|
||||
|
||||
impl Escaping for PgIdent {
|
||||
/// This is intended to mimic Postgres quote_ident(), but for simplicity it
|
||||
/// always quotes the provided string with `""` and escapes every `"`.
/// **Not idempotent**, i.e. if the string is already escaped it will be escaped again.
/// N.B. it's not useful for escaping identifiers that are used inside a WHERE
/// clause; use `escape_literal()` instead.
|
||||
fn pg_quote(&self) -> String {
|
||||
let result = format!("\"{}\"", self.replace('"', "\"\""));
|
||||
result
|
||||
format!("\"{}\"", self.replace('"', "\"\""))
|
||||
}
|
||||
|
||||
/// This helper is intended to be used for dollar-escaping strings for usage
|
||||
/// inside PL/pgSQL procedures. In addition to dollar-escaping the string,
|
||||
/// it also returns a tag that is intended to be used inside the outer
|
||||
/// PL/pgSQL procedure. If you do not need an outer tag, just discard it.
|
||||
/// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`,
|
||||
/// <https://github.com/postgres/postgres/blob/8b49392b270b4ac0b9f5c210e2a503546841e832/src/backend/utils/adt/ruleutils.c#L2924>
|
||||
fn pg_quote_dollar(&self) -> (String, String) {
|
||||
let mut tag: String = "".to_string();
|
||||
let mut outer_tag = "x".to_string();
|
||||
|
||||
// Find the first suitable tag that is not present in the string.
|
||||
// Postgres' max role/DB name length is 63 bytes, so even in the
|
||||
// worst case it won't take long.
|
||||
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
|
||||
tag += "x";
|
||||
outer_tag = tag.clone() + "x";
|
||||
}
|
||||
|
||||
let escaped = format!("${tag}${self}${tag}$");
|
||||
|
||||
(escaped, outer_tag)
|
||||
}
|
||||
}
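// Illustrative usage sketch (not part of this diff): embedding a dollar-quoted value in
// a generated DO block, the way the SQL templates in this change do. The identifier and
// the generated statement are hypothetical examples, not values from the real call sites.
fn dollar_quote_example() {
    let role: PgIdent = PgIdent::from("weird$role$name");
    let (escaped_role, outer_tag) = role.pg_quote_dollar();
    // `escaped_role` is already dollar-quoted, and `outer_tag` is a tag guaranteed not
    // to collide with it, so it is safe to use for the surrounding DO block.
    let sql = format!(
        "DO ${tag}$ BEGIN RAISE NOTICE 'role is %', {role}; END ${tag}$;",
        tag = outer_tag,
        role = escaped_role,
    );
    println!("{sql}");
}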
|
||||
|
||||
@@ -227,10 +251,13 @@ pub async fn get_existing_dbs_async(
|
||||
// invalid state. See:
|
||||
// https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
|
||||
let rowstream = client
|
||||
// We use a subquery instead of a fancy `datdba::regrole::text AS owner`,
|
||||
// because the latter automatically wraps the result in double quotes,
|
||||
// if the role name contains special characters.
|
||||
.query_raw::<str, &String, &[String; 0]>(
|
||||
"SELECT
|
||||
datname AS name,
|
||||
datdba::regrole::text AS owner,
|
||||
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
|
||||
NOT datallowconn AS restrict_conn,
|
||||
datconnlimit = - 2 AS invalid
|
||||
FROM
|
||||
|
||||
77
compute_tools/src/rsyslog.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
use std::process::Command;
|
||||
use std::{fs::OpenOptions, io::Write};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use tracing::info;
|
||||
|
||||
fn get_rsyslog_pid() -> Option<String> {
|
||||
let output = Command::new("pgrep")
|
||||
.arg("rsyslogd")
|
||||
.output()
|
||||
.expect("Failed to execute pgrep");
|
||||
|
||||
if !output.stdout.is_empty() {
|
||||
let pid = std::str::from_utf8(&output.stdout)
|
||||
.expect("Invalid UTF-8 in process output")
|
||||
.trim()
|
||||
.to_string();
|
||||
Some(pid)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// Restart rsyslogd to apply the new configuration.
// This is necessary because there is no other way to reload the rsyslog configuration.
//
// Rsyslogd shouldn't lose any messages because of the restart,
// since it tracks the last read position in the log files
// and will continue reading from that position.
// TODO: test it properly
//
|
||||
fn restart_rsyslog() -> Result<()> {
|
||||
let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
|
||||
info!("rsyslogd is running with pid: {}, restart it", old_pid);
|
||||
|
||||
// kill it to restart
|
||||
let _ = Command::new("pkill")
|
||||
.arg("rsyslogd")
|
||||
.output()
|
||||
.context("Failed to stop rsyslogd")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn configure_audit_rsyslog(
|
||||
log_directory: &str,
|
||||
tag: &str,
|
||||
remote_endpoint: &str,
|
||||
) -> Result<()> {
|
||||
let config_content: String = format!(
|
||||
include_str!("config_template/compute_audit_rsyslog_template.conf"),
|
||||
log_directory = log_directory,
|
||||
tag = tag,
|
||||
remote_endpoint = remote_endpoint
|
||||
);
|
||||
|
||||
info!("rsyslog config_content: {}", config_content);
|
||||
|
||||
let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf";
|
||||
let mut file = OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.truncate(true)
|
||||
.open(rsyslog_conf_path)?;
|
||||
|
||||
file.write_all(config_content.as_bytes())?;
|
||||
|
||||
info!(
|
||||
"rsyslog configuration file {} added successfully. Starting rsyslogd",
|
||||
rsyslog_conf_path
|
||||
);
|
||||
|
||||
// start the service, using the configuration
|
||||
restart_rsyslog()?;
|
||||
|
||||
Ok(())
|
||||
}
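// Illustrative usage sketch (not part of this diff): what a caller of the new rsyslog
// helper might look like. The log directory, tag, and endpoint below are hypothetical
// placeholders, not values taken from this change.
fn configure_audit_logging_example() -> Result<()> {
    configure_audit_rsyslog(
        "/var/log/compute_audit",            // directory the template will watch
        "compute-audit",                     // tag attached to forwarded messages
        "collector.example.internal:5514",   // remote endpoint receiving the logs
    )
}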
|
||||
@@ -1,20 +1,20 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use reqwest::StatusCode;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use tokio_postgres::Client;
|
||||
use tracing::{error, info, instrument, warn};
|
||||
|
||||
use crate::config;
|
||||
use crate::metrics::{CPlaneRequestRPC, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
|
||||
use crate::migration::MigrationRunner;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
use anyhow::{Result, anyhow, bail};
|
||||
use compute_api::responses::{
|
||||
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
|
||||
};
|
||||
use compute_api::spec::ComputeSpec;
|
||||
use reqwest::StatusCode;
|
||||
use tokio_postgres::Client;
|
||||
use tracing::{error, info, instrument, warn};
|
||||
|
||||
use crate::config;
|
||||
use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS};
|
||||
use crate::migration::MigrationRunner;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
// Make a control plane request and return the response, if any. In case of error it
|
||||
// returns a bool flag indicating whether it makes sense to retry the request
|
||||
@@ -141,7 +141,6 @@ pub fn get_spec_from_control_plane(
|
||||
/// Check `pg_hba.conf` and update if needed to allow external connections.
|
||||
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
// XXX: consider making it a part of spec.json
|
||||
info!("checking pg_hba.conf");
|
||||
let pghba_path = pgdata_path.join("pg_hba.conf");
|
||||
|
||||
if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
|
||||
@@ -156,12 +155,11 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
/// Create a standby.signal file
|
||||
pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
|
||||
// XXX: consider making it a part of spec.json
|
||||
info!("adding standby.signal");
|
||||
let signalfile = pgdata_path.join("standby.signal");
|
||||
|
||||
if !signalfile.exists() {
|
||||
info!("created standby.signal");
|
||||
File::create(signalfile)?;
|
||||
info!("created standby.signal");
|
||||
} else {
|
||||
info!("reused pre-existing standby.signal");
|
||||
}
|
||||
@@ -170,7 +168,6 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
|
||||
info!("handle neon extension upgrade");
|
||||
let query = "ALTER EXTENSION neon UPDATE";
|
||||
info!("update neon extension version with query: {}", query);
|
||||
client.simple_query(query).await?;
|
||||
|
||||
@@ -1,18 +1,430 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::future::Future;
|
||||
use std::iter::empty;
|
||||
use std::iter::once;
|
||||
use std::iter::{empty, once};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::compute::construct_superuser_query;
|
||||
use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt};
|
||||
use anyhow::Result;
|
||||
use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role};
|
||||
use anyhow::{Context, Result};
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role};
|
||||
use futures::future::join_all;
|
||||
use tokio::sync::RwLock;
|
||||
use tokio_postgres::Client;
|
||||
use tracing::{debug, info_span, warn, Instrument};
|
||||
use tokio_postgres::error::SqlState;
|
||||
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
|
||||
|
||||
use crate::compute::{ComputeNode, ComputeState};
|
||||
use crate::pg_helpers::{
|
||||
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
|
||||
get_existing_roles_async,
|
||||
};
|
||||
use crate::spec_apply::ApplySpecPhase::{
|
||||
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser,
|
||||
CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon,
|
||||
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
|
||||
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
|
||||
RunInEachDatabase,
|
||||
};
|
||||
use crate::spec_apply::PerDatabasePhase::{
|
||||
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension,
|
||||
};
|
||||
|
||||
impl ComputeNode {
|
||||
/// Apply the spec to the running PostgreSQL instance.
|
||||
/// The caller can decide to run with multiple clients in parallel, or
|
||||
/// single mode. Either way, the commands executed will be the same, and
|
||||
/// only commands run in different databases are parallelized.
|
||||
#[instrument(skip_all)]
|
||||
pub fn apply_spec_sql(
|
||||
&self,
|
||||
spec: Arc<ComputeSpec>,
|
||||
conf: Arc<tokio_postgres::Config>,
|
||||
concurrency: usize,
|
||||
) -> Result<()> {
|
||||
info!("Applying config with max {} concurrency", concurrency);
|
||||
debug!("Config: {:?}", spec);
|
||||
|
||||
let rt = tokio::runtime::Handle::current();
|
||||
rt.block_on(async {
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
let client = Self::get_maintenance_client(&conf).await?;
|
||||
let spec = spec.clone();
|
||||
|
||||
let databases = get_existing_dbs_async(&client).await?;
|
||||
let roles = get_existing_roles_async(&client)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|role| (role.name.clone(), role))
|
||||
.collect::<HashMap<String, Role>>();
|
||||
|
||||
// Check if we need to drop subscriptions before starting the endpoint.
|
||||
//
|
||||
// It is important to do this operation exactly once when the endpoint starts on a new branch.
// Otherwise, we may drop newly created subscriptions rather than just the inherited ones.
|
||||
//
|
||||
// We cannot rely only on spec.drop_subscriptions_before_start flag,
|
||||
// because if for some reason compute restarts inside VM,
|
||||
// it will start again with the same spec and flag value.
|
||||
//
|
||||
// To handle this, we save the fact of the operation in the database
|
||||
// in the neon.drop_subscriptions_done table.
|
||||
// If the table does not exist, we assume that the operation was never performed, so we must do it.
|
||||
// If the table exists, we check if the operation was performed on the current timeline.
|
||||
//
|
||||
let mut drop_subscriptions_done = false;
|
||||
|
||||
if spec.drop_subscriptions_before_start {
|
||||
let timeline_id = self.get_timeline_id().context("timeline_id must be set")?;
|
||||
let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id);
|
||||
|
||||
info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);
|
||||
|
||||
drop_subscriptions_done = match
|
||||
client.simple_query(&query).await {
|
||||
Ok(result) => {
|
||||
matches!(&result[0], postgres::SimpleQueryMessage::Row(_))
|
||||
},
|
||||
Err(e) =>
|
||||
{
|
||||
match e.code() {
|
||||
Some(&SqlState::UNDEFINED_TABLE) => false,
|
||||
_ => {
|
||||
// We don't expect any other error here, except for the schema/table not existing
|
||||
error!("Error checking if drop subscription operation was already performed: {}", e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
let jwks_roles = Arc::new(
|
||||
spec.as_ref()
|
||||
.local_proxy_config
|
||||
.iter()
|
||||
.flat_map(|it| &it.jwks)
|
||||
.flatten()
|
||||
.flat_map(|setting| &setting.role_names)
|
||||
.cloned()
|
||||
.collect::<HashSet<_>>(),
|
||||
);
|
||||
|
||||
let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext {
|
||||
roles,
|
||||
dbs: databases,
|
||||
}));
|
||||
|
||||
// Apply special pre drop database phase.
|
||||
// NOTE: we use the code of RunInEachDatabase phase for parallelism
|
||||
// and connection management, but we don't really run it in *each* database,
|
||||
// only in the databases we're about to drop.
|
||||
info!("Applying PerDatabase (pre-dropdb) phase");
|
||||
let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
|
||||
|
||||
// Run the phase for each database that we're about to drop.
|
||||
let db_processes = spec
|
||||
.delta_operations
|
||||
.iter()
|
||||
.flatten()
|
||||
.filter_map(move |op| {
|
||||
if op.action.as_str() == "delete_db" {
|
||||
Some(op.name.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.map(|dbname| {
|
||||
let spec = spec.clone();
|
||||
let ctx = ctx.clone();
|
||||
let jwks_roles = jwks_roles.clone();
|
||||
let mut conf = conf.as_ref().clone();
|
||||
let concurrency_token = concurrency_token.clone();
|
||||
// We only need dbname field for this phase, so set other fields to dummy values
|
||||
let db = DB::UserDB(Database {
|
||||
name: dbname.clone(),
|
||||
owner: "cloud_admin".to_string(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
});
|
||||
|
||||
debug!("Applying per-database phases for Database {:?}", &db);
|
||||
|
||||
match &db {
|
||||
DB::SystemDB => {}
|
||||
DB::UserDB(db) => {
|
||||
conf.dbname(db.name.as_str());
|
||||
}
|
||||
}
|
||||
|
||||
let conf = Arc::new(conf);
|
||||
let fut = Self::apply_spec_sql_db(
|
||||
spec.clone(),
|
||||
conf,
|
||||
ctx.clone(),
|
||||
jwks_roles.clone(),
|
||||
concurrency_token.clone(),
|
||||
db,
|
||||
[DropLogicalSubscriptions].to_vec(),
|
||||
);
|
||||
|
||||
Ok(tokio::spawn(fut))
|
||||
})
|
||||
.collect::<Vec<Result<_, anyhow::Error>>>();
|
||||
|
||||
for process in db_processes.into_iter() {
|
||||
let handle = process?;
|
||||
if let Err(e) = handle.await? {
|
||||
// Handle the error case where the database does not exist
|
||||
// We do not check whether the DB exists or not in the deletion phase,
|
||||
// so we shouldn't be strict about it in pre-deletion cleanup as well.
|
||||
if e.to_string().contains("does not exist") {
|
||||
warn!("Error dropping subscription: {}", e);
|
||||
} else {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
for phase in [
|
||||
CreateNeonSuperuser,
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
RenameAndDeleteDatabases,
|
||||
CreateAndAlterDatabases,
|
||||
CreateSchemaNeon,
|
||||
] {
|
||||
info!("Applying phase {:?}", &phase);
|
||||
apply_operations(
|
||||
spec.clone(),
|
||||
ctx.clone(),
|
||||
jwks_roles.clone(),
|
||||
phase,
|
||||
|| async { Ok(&client) },
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
info!("Applying RunInEachDatabase2 phase");
|
||||
let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
|
||||
|
||||
let db_processes = spec
|
||||
.cluster
|
||||
.databases
|
||||
.iter()
|
||||
.map(|db| DB::new(db.clone()))
|
||||
// include
|
||||
.chain(once(DB::SystemDB))
|
||||
.map(|db| {
|
||||
let spec = spec.clone();
|
||||
let ctx = ctx.clone();
|
||||
let jwks_roles = jwks_roles.clone();
|
||||
let mut conf = conf.as_ref().clone();
|
||||
let concurrency_token = concurrency_token.clone();
|
||||
let db = db.clone();
|
||||
|
||||
debug!("Applying per-database phases for Database {:?}", &db);
|
||||
|
||||
match &db {
|
||||
DB::SystemDB => {}
|
||||
DB::UserDB(db) => {
|
||||
conf.dbname(db.name.as_str());
|
||||
}
|
||||
}
|
||||
|
||||
let conf = Arc::new(conf);
|
||||
let mut phases = vec![
|
||||
DeleteDBRoleReferences,
|
||||
ChangeSchemaPerms,
|
||||
HandleAnonExtension,
|
||||
];
|
||||
|
||||
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
|
||||
info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
|
||||
phases.push(DropLogicalSubscriptions);
|
||||
}
|
||||
|
||||
let fut = Self::apply_spec_sql_db(
|
||||
spec.clone(),
|
||||
conf,
|
||||
ctx.clone(),
|
||||
jwks_roles.clone(),
|
||||
concurrency_token.clone(),
|
||||
db,
|
||||
phases,
|
||||
);
|
||||
|
||||
Ok(tokio::spawn(fut))
|
||||
})
|
||||
.collect::<Vec<Result<_, anyhow::Error>>>();
|
||||
|
||||
for process in db_processes.into_iter() {
|
||||
let handle = process?;
|
||||
handle.await??;
|
||||
}
|
||||
|
||||
let mut phases = vec![
|
||||
HandleOtherExtensions,
|
||||
HandleNeonExtension, // This step depends on CreateSchemaNeon
|
||||
CreateAvailabilityCheck,
|
||||
DropRoles,
|
||||
];
|
||||
|
||||
// This step depends on CreateSchemaNeon
|
||||
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
|
||||
info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
|
||||
phases.push(FinalizeDropLogicalSubscriptions);
|
||||
}
|
||||
|
||||
// Keep DisablePostgresDBPgAudit phase at the end,
|
||||
// so that all config operations are audit logged.
|
||||
match spec.audit_log_level
|
||||
{
|
||||
ComputeAudit::Hipaa => {
|
||||
phases.push(CreatePgauditExtension);
|
||||
phases.push(CreatePgauditlogtofileExtension);
|
||||
phases.push(DisablePostgresDBPgAudit);
|
||||
}
|
||||
ComputeAudit::Log => { /* not implemented yet */ }
|
||||
ComputeAudit::Disabled => {}
|
||||
}
|
||||
|
||||
for phase in phases {
|
||||
debug!("Applying phase {:?}", &phase);
|
||||
apply_operations(
|
||||
spec.clone(),
|
||||
ctx.clone(),
|
||||
jwks_roles.clone(),
|
||||
phase,
|
||||
|| async { Ok(&client) },
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok::<(), anyhow::Error>(())
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Apply SQL migrations of the RunInEachDatabase phase.
|
||||
///
|
||||
/// May opt to not connect to databases that don't have any scheduled
|
||||
/// operations. The function is concurrency-controlled with the provided
|
||||
/// semaphore. The caller has to make sure the semaphore isn't exhausted.
|
||||
async fn apply_spec_sql_db(
|
||||
spec: Arc<ComputeSpec>,
|
||||
conf: Arc<tokio_postgres::Config>,
|
||||
ctx: Arc<tokio::sync::RwLock<MutableApplyContext>>,
|
||||
jwks_roles: Arc<HashSet<String>>,
|
||||
concurrency_token: Arc<tokio::sync::Semaphore>,
|
||||
db: DB,
|
||||
subphases: Vec<PerDatabasePhase>,
|
||||
) -> Result<()> {
|
||||
let _permit = concurrency_token.acquire().await?;
|
||||
|
||||
let mut client_conn = None;
|
||||
|
||||
for subphase in subphases {
|
||||
apply_operations(
|
||||
spec.clone(),
|
||||
ctx.clone(),
|
||||
jwks_roles.clone(),
|
||||
RunInEachDatabase {
|
||||
db: db.clone(),
|
||||
subphase,
|
||||
},
|
||||
// Only connect if apply_operation actually wants a connection.
|
||||
// It's quite possible this database doesn't need any queries,
|
||||
// so by not connecting we save time and effort connecting to
|
||||
// that database.
|
||||
|| async {
|
||||
if client_conn.is_none() {
|
||||
let db_client = Self::get_maintenance_client(&conf).await?;
|
||||
client_conn.replace(db_client);
|
||||
}
|
||||
let client = client_conn.as_ref().unwrap();
|
||||
Ok(client)
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
drop(client_conn);
|
||||
|
||||
Ok::<(), anyhow::Error>(())
|
||||
}
|
||||
|
||||
/// Choose how many concurrent connections to use for applying the spec changes.
|
||||
pub fn max_service_connections(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
spec: &ComputeSpec,
|
||||
) -> usize {
|
||||
// If the cluster is in the Init state, we don't have to deal with user connections,
// and can thus use all `max_connections` connection slots. However, that's generally not
// very efficient, so we still limit it to a smaller number.
|
||||
if compute_state.status == ComputeStatus::Init {
|
||||
// If the settings contain 'max_connections', use that as template
|
||||
if let Some(config) = spec.cluster.settings.find("max_connections") {
|
||||
config.parse::<usize>().ok()
|
||||
} else {
|
||||
// Otherwise, try to find the setting in the postgresql_conf string
|
||||
spec.cluster
|
||||
.postgresql_conf
|
||||
.iter()
|
||||
.flat_map(|conf| conf.split("\n"))
|
||||
.filter_map(|line| {
|
||||
if !line.contains("max_connections") {
|
||||
return None;
|
||||
}
|
||||
|
||||
let (key, value) = line.split_once("=")?;
|
||||
let key = key
|
||||
.trim_start_matches(char::is_whitespace)
|
||||
.trim_end_matches(char::is_whitespace);
|
||||
|
||||
let value = value
|
||||
.trim_start_matches(char::is_whitespace)
|
||||
.trim_end_matches(char::is_whitespace);
|
||||
|
||||
if key != "max_connections" {
|
||||
return None;
|
||||
}
|
||||
|
||||
value.parse::<usize>().ok()
|
||||
})
|
||||
.next()
|
||||
}
|
||||
// If max_connections is present, use at most 1/3rd of that.
|
||||
// When max_connections is lower than 30, try to use at least 10 connections, but
|
||||
// never more than max_connections.
|
||||
.map(|limit| match limit {
|
||||
0..10 => limit,
|
||||
10..30 => 10,
|
||||
30.. => limit / 3,
|
||||
})
|
||||
// If we didn't find max_connections, default to 10 concurrent connections.
|
||||
.unwrap_or(10)
|
||||
} else {
|
||||
// state == Running
|
||||
// Because the cluster is already in the Running state, we should assume users are
|
||||
// already connected to the cluster, and high concurrency could negatively
|
||||
// impact user connectivity. Therefore, we can limit concurrency to the number of
|
||||
// reserved superuser connections, which users wouldn't be able to use anyway.
|
||||
spec.cluster
|
||||
.settings
|
||||
.find("superuser_reserved_connections")
|
||||
.iter()
|
||||
.filter_map(|val| val.parse::<usize>().ok())
|
||||
.map(|val| if val > 1 { val - 1 } else { 1 })
|
||||
.last()
|
||||
.unwrap_or(3)
|
||||
}
|
||||
}
|
||||
}
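// Worked example (not part of this diff) of the Init-state connection-limit rule described
// above: use everything when max_connections is tiny, at least 10 when it allows, and at
// most one third of it otherwise, falling back to 10 when the setting is unknown. The
// helper name is hypothetical and exists only to make the clamping explicit.
fn init_concurrency_for(max_connections: Option<usize>) -> usize {
    max_connections
        .map(|limit| match limit {
            0..=9 => limit,
            10..=29 => 10,
            _ => limit / 3,
        })
        .unwrap_or(10)
}

#[cfg(test)]
mod init_concurrency_example {
    use super::*;

    #[test]
    fn matches_the_documented_heuristic() {
        assert_eq!(init_concurrency_for(Some(5)), 5); // fewer than 10 slots: use them all
        assert_eq!(init_concurrency_for(Some(20)), 10); // between 10 and 30: cap at 10
        assert_eq!(init_concurrency_for(Some(300)), 100); // large: one third
        assert_eq!(init_concurrency_for(None), 10); // unknown: default to 10
    }
}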
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum DB {
|
||||
@@ -57,7 +469,7 @@ pub enum PerDatabasePhase {
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum ApplySpecPhase {
|
||||
CreateSuperUser,
|
||||
CreateNeonSuperuser,
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
@@ -65,6 +477,9 @@ pub enum ApplySpecPhase {
|
||||
CreateAndAlterDatabases,
|
||||
CreateSchemaNeon,
|
||||
RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
|
||||
CreatePgauditExtension,
|
||||
CreatePgauditlogtofileExtension,
|
||||
DisablePostgresDBPgAudit,
|
||||
HandleOtherExtensions,
|
||||
HandleNeonExtension,
|
||||
CreateAvailabilityCheck,
|
||||
@@ -181,14 +596,10 @@ async fn get_operations<'a>(
|
||||
apply_spec_phase: &'a ApplySpecPhase,
|
||||
) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
|
||||
match apply_spec_phase {
|
||||
ApplySpecPhase::CreateSuperUser => {
|
||||
let query = construct_superuser_query(spec);
|
||||
|
||||
Ok(Box::new(once(Operation {
|
||||
query,
|
||||
comment: None,
|
||||
})))
|
||||
}
|
||||
ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation {
|
||||
query: include_str!("sql/create_neon_superuser.sql").to_string(),
|
||||
comment: None,
|
||||
}))),
|
||||
ApplySpecPhase::DropInvalidDatabases => {
|
||||
let mut ctx = ctx.write().await;
|
||||
let databases = &mut ctx.dbs;
|
||||
@@ -322,14 +733,15 @@ async fn get_operations<'a>(
|
||||
// We do not check whether the DB exists or not,
|
||||
// Postgres will take care of it for us
|
||||
"delete_db" => {
|
||||
let (db_name, outer_tag) = op.name.pg_quote_dollar();
|
||||
// In Postgres we can't drop a database if it is a template.
|
||||
// So we need to unset the template flag first, but it could
|
||||
// be a retry, so we could've already dropped the database.
|
||||
// Check that database exists first to make it idempotent.
|
||||
let unset_template_query: String = format!(
|
||||
include_str!("sql/unset_template_for_drop_dbs.sql"),
|
||||
datname_str = escape_literal(&op.name),
|
||||
datname = &op.name.pg_quote()
|
||||
datname = db_name,
|
||||
outer_tag = outer_tag,
|
||||
);
|
||||
|
||||
// Use FORCE to drop database even if there are active connections.
|
||||
@@ -436,6 +848,8 @@ async fn get_operations<'a>(
|
||||
comment: None,
|
||||
},
|
||||
Operation {
|
||||
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
|
||||
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
|
||||
query: format!(
|
||||
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
|
||||
db.name.pg_quote()
|
||||
@@ -474,7 +888,10 @@ async fn get_operations<'a>(
|
||||
let edb = match databases.get(&db.name) {
|
||||
Some(edb) => edb,
|
||||
None => {
|
||||
warn!("skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", subphase, db.name);
|
||||
warn!(
|
||||
"skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL",
|
||||
subphase, db.name
|
||||
);
|
||||
return Ok(Box::new(empty()));
|
||||
}
|
||||
};
|
||||
@@ -492,9 +909,11 @@ async fn get_operations<'a>(
|
||||
PerDatabasePhase::DropLogicalSubscriptions => {
|
||||
match &db {
|
||||
DB::UserDB(db) => {
|
||||
let (db_name, outer_tag) = db.name.pg_quote_dollar();
|
||||
let drop_subscription_query: String = format!(
|
||||
include_str!("sql/drop_subscriptions.sql"),
|
||||
datname_str = escape_literal(&db.name),
|
||||
datname_str = db_name,
|
||||
outer_tag = outer_tag,
|
||||
);
|
||||
|
||||
let operations = vec![Operation {
|
||||
@@ -533,6 +952,7 @@ async fn get_operations<'a>(
|
||||
DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
|
||||
DB::UserDB(db) => db.owner.pg_quote(),
|
||||
};
|
||||
let (escaped_role, outer_tag) = op.name.pg_quote_dollar();
|
||||
|
||||
Some(vec![
|
||||
// This will reassign all dependent objects to the db owner
|
||||
@@ -547,7 +967,9 @@ async fn get_operations<'a>(
|
||||
Operation {
|
||||
query: format!(
|
||||
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
|
||||
role_name = quoted,
|
||||
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
role_name = escaped_role,
|
||||
outer_tag = outer_tag,
|
||||
),
|
||||
comment: None,
|
||||
},
|
||||
@@ -572,12 +994,14 @@ async fn get_operations<'a>(
|
||||
DB::SystemDB => return Ok(Box::new(empty())),
|
||||
DB::UserDB(db) => db,
|
||||
};
|
||||
let (db_owner, outer_tag) = db.owner.pg_quote_dollar();
|
||||
|
||||
let operations = vec![
|
||||
Operation {
|
||||
query: format!(
|
||||
include_str!("sql/set_public_schema_owner.sql"),
|
||||
db_owner = db.owner.pg_quote()
|
||||
db_owner = db_owner,
|
||||
outer_tag = outer_tag,
|
||||
),
|
||||
comment: None,
|
||||
},
|
||||
@@ -697,6 +1121,25 @@ async fn get_operations<'a>(
|
||||
}
|
||||
Ok(Box::new(empty()))
|
||||
}
|
||||
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
|
||||
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
|
||||
comment: Some(String::from("create pgaudit extensions")),
|
||||
}))),
|
||||
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
|
||||
query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
|
||||
comment: Some(String::from("create pgauditlogtofile extensions")),
|
||||
}))),
|
||||
// Disable pgaudit logging for the postgres database.
// Postgres is a neon system database used by monitors
// and compute_ctl tuning functions, and thus generates a lot of noise.
|
||||
// We do not consider data stored in this database as sensitive.
|
||||
ApplySpecPhase::DisablePostgresDBPgAudit => {
|
||||
let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'";
|
||||
Ok(Box::new(once(Operation {
|
||||
query: query.to_string(),
|
||||
comment: Some(query.to_string()),
|
||||
})))
|
||||
}
|
||||
ApplySpecPhase::HandleNeonExtension => {
|
||||
let operations = vec![
|
||||
Operation {
|
||||
|
||||
8
compute_tools/src/sql/create_neon_superuser.sql
Normal file
@@ -0,0 +1,8 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
|
||||
THEN
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
@@ -1,4 +1,4 @@
|
||||
DO $$
|
||||
DO ${outer_tag}$
|
||||
DECLARE
|
||||
subname TEXT;
|
||||
BEGIN
|
||||
@@ -9,4 +9,4 @@ BEGIN
|
||||
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
|
||||
END LOOP;
|
||||
END;
|
||||
$$;
|
||||
${outer_tag}$;
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
SET SESSION ROLE neon_superuser;
|
||||
|
||||
DO $$
|
||||
DO ${outer_tag}$
|
||||
DECLARE
|
||||
schema TEXT;
|
||||
grantor TEXT;
|
||||
revoke_query TEXT;
|
||||
BEGIN
|
||||
FOR schema IN
|
||||
@@ -15,14 +14,25 @@ BEGIN
|
||||
-- ii) it's easy to add more schemas to the list if needed.
|
||||
WHERE schema_name IN ('public')
|
||||
LOOP
|
||||
revoke_query := format(
|
||||
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
|
||||
schema
|
||||
);
|
||||
FOR grantor IN EXECUTE
|
||||
format(
|
||||
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
|
||||
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
quote_literal({role_name})
|
||||
)
|
||||
LOOP
|
||||
EXECUTE format('SET LOCAL ROLE %I', grantor);
|
||||
|
||||
EXECUTE revoke_query;
|
||||
revoke_query := format(
|
||||
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
|
||||
schema,
|
||||
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
{role_name},
|
||||
grantor
|
||||
);
|
||||
|
||||
EXECUTE revoke_query;
|
||||
END LOOP;
|
||||
END LOOP;
|
||||
END;
|
||||
$$;
|
||||
|
||||
RESET ROLE;
|
||||
${outer_tag}$;
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
DO
|
||||
$$
|
||||
DO ${outer_tag}$
|
||||
DECLARE
|
||||
schema_owner TEXT;
|
||||
BEGIN
|
||||
@@ -16,8 +15,8 @@ $$
|
||||
|
||||
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
|
||||
THEN
|
||||
ALTER SCHEMA public OWNER TO {db_owner};
|
||||
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
|
||||
END IF;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
${outer_tag}$;
|
||||
@@ -1,12 +1,12 @@
|
||||
DO $$
|
||||
DO ${outer_tag}$
|
||||
BEGIN
|
||||
IF EXISTS(
|
||||
SELECT 1
|
||||
FROM pg_catalog.pg_database
|
||||
WHERE datname = {datname_str}
|
||||
WHERE datname = {datname}
|
||||
)
|
||||
THEN
|
||||
ALTER DATABASE {datname} is_template false;
|
||||
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
${outer_tag}$;
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use tracing::warn;
|
||||
use anyhow::{Context, anyhow};
|
||||
use tracing::{instrument, warn};
|
||||
|
||||
pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap";
|
||||
|
||||
#[instrument]
|
||||
pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
|
||||
// run `/neonvm/bin/resize-swap --once {size_bytes}`
|
||||
//
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#[cfg(test)]
|
||||
mod config_tests {
|
||||
|
||||
use std::fs::{remove_file, File};
|
||||
use std::fs::{File, remove_file};
|
||||
use std::io::{Read, Write};
|
||||
use std::path::Path;
|
||||
|
||||
|
||||
@@ -61,6 +61,23 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
|
||||
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ident_pg_quote_dollar() {
|
||||
let test_cases = vec![
|
||||
("name", ("$$name$$", "x")),
|
||||
("name$$", ("$x$name$$$x$", "xx")),
|
||||
("name$$$", ("$x$name$$$$x$", "xx")),
|
||||
("name$$$$", ("$x$name$$$$$x$", "xx")),
|
||||
("name$x$", ("$xx$name$x$$xx$", "xxx")),
|
||||
];
|
||||
|
||||
for (input, expected) in test_cases {
|
||||
let (escaped, tag) = PgIdent::from(input).pg_quote_dollar();
|
||||
assert_eq!(escaped, expected.0);
|
||||
assert_eq!(tag, expected.1);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn generic_options_search() {
|
||||
let generic_options: GenericOptions = Some(vec![
|
||||
|
||||
@@ -25,7 +25,7 @@ use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use nix::errno::Errno;
|
||||
use nix::fcntl::{FcntlArg, FdFlag};
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::sys::signal::{Signal, kill};
|
||||
use nix::unistd::Pid;
|
||||
use utils::pid_file::{self, PidFileRead};
|
||||
|
||||
|
||||
@@ -5,7 +5,16 @@
|
||||
//! easier to work with locally. The python tests in `test_runner`
|
||||
//! rely on `neon_local` to set up the environment for each test.
|
||||
//!
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::fs::File;
|
||||
use std::os::fd::AsRawFd;
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use clap::Parser;
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
@@ -19,7 +28,7 @@ use control_plane::storage_controller::{
|
||||
NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
|
||||
};
|
||||
use control_plane::{broker, local_env};
|
||||
use nix::fcntl::{flock, FlockArg};
|
||||
use nix::fcntl::{FlockArg, flock};
|
||||
use pageserver_api::config::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
|
||||
@@ -31,27 +40,18 @@ use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInf
|
||||
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use safekeeper_api::membership::SafekeeperGeneration;
|
||||
use safekeeper_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
|
||||
};
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::fs::File;
|
||||
use std::os::fd::AsRawFd;
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
|
||||
use tokio::task::JoinSet;
|
||||
use url::Host;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
project_git_version,
|
||||
};
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::project_git_version;
|
||||
|
||||
// Default id of a safekeeper node, if not specified on the command line.
|
||||
const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1);
|
||||
@@ -597,7 +597,15 @@ struct EndpointStartCmdArgs {
|
||||
#[clap(long = "pageserver-id")]
|
||||
endpoint_pageserver_id: Option<NodeId>,
|
||||
|
||||
#[clap(long)]
|
||||
#[clap(
|
||||
long,
|
||||
help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
|
||||
)]
|
||||
safekeepers_generation: Option<u32>,
|
||||
#[clap(
|
||||
long,
|
||||
help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
|
||||
)]
|
||||
safekeepers: Option<String>,
|
||||
|
||||
#[clap(
|
||||
@@ -618,9 +626,9 @@ struct EndpointStartCmdArgs {
|
||||
)]
|
||||
allow_multiple: bool,
|
||||
|
||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
||||
#[arg(default_value = "10s")]
|
||||
start_timeout: humantime::Duration,
|
||||
#[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
|
||||
#[arg(default_value = "90s")]
|
||||
start_timeout: Duration,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
@@ -921,7 +929,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
let init_conf: NeonLocalInitConf = if let Some(config_path) = &args.config {
|
||||
// User (likely the Python test suite) provided a description of the environment.
|
||||
if args.num_pageservers.is_some() {
|
||||
bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead");
|
||||
bail!(
|
||||
"Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"
|
||||
);
|
||||
}
|
||||
// load and parse the file
|
||||
let contents = std::fs::read_to_string(config_path).with_context(|| {
|
||||
@@ -953,6 +963,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
id: pageserver_id,
|
||||
listen_pg_addr: format!("127.0.0.1:{pg_port}"),
|
||||
listen_http_addr: format!("127.0.0.1:{http_port}"),
|
||||
listen_https_addr: None,
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
other: Default::default(),
|
||||
@@ -967,6 +978,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
|
||||
storage_controller: None,
|
||||
control_plane_compute_hook_api: None,
|
||||
generate_local_ssl_certs: false,
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1315,10 +1327,14 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
|
||||
match (mode, args.hot_standby) {
|
||||
(ComputeMode::Static(_), true) => {
|
||||
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
|
||||
bail!(
|
||||
"Cannot start a node in hot standby mode when it is already configured as a static replica"
|
||||
)
|
||||
}
|
||||
(ComputeMode::Primary, true) => {
|
||||
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
|
||||
bail!(
|
||||
"Cannot start a node as a hot standby replica, it is already configured as primary node"
|
||||
)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
@@ -1345,6 +1361,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
let pageserver_id = args.endpoint_pageserver_id;
|
||||
let remote_ext_config = &args.remote_ext_config;
|
||||
|
||||
let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new);
|
||||
// If --safekeepers argument is given, use only the listed
|
||||
// safekeeper nodes; otherwise all from the env.
|
||||
let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? {
|
||||
@@ -1420,11 +1437,13 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
endpoint
|
||||
.start(
|
||||
&auth_token,
|
||||
safekeepers_generation,
|
||||
safekeepers,
|
||||
pageservers,
|
||||
remote_ext_config.as_ref(),
|
||||
stripe_size.0 as usize,
|
||||
args.create_test_user,
|
||||
args.start_timeout,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
use crate::{background_process, local_env};
|
||||
|
||||
@@ -37,29 +37,24 @@
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::BTreeMap;
|
||||
use std::net::IpAddr;
|
||||
use std::net::Ipv4Addr;
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::SystemTime;
|
||||
use std::time::UNIX_EPOCH;
|
||||
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::ComputeCtlConfig;
|
||||
use compute_api::spec::Database;
|
||||
use compute_api::spec::PgIdent;
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
use compute_api::spec::Role;
|
||||
use nix::sys::signal::kill;
|
||||
use nix::sys::signal::Signal;
|
||||
use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
|
||||
use compute_api::spec::{
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
|
||||
RemoteExtSpec, Role,
|
||||
};
|
||||
use nix::sys::signal::{Signal, kill};
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
use safekeeper_api::membership::SafekeeperGeneration;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::debug;
|
||||
use url::Host;
|
||||
@@ -69,9 +64,6 @@ use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage_controller::StorageController;
|
||||
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
|
||||
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
|
||||
|
||||
// contents of a endpoint.json file
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub struct EndpointConf {
|
||||
@@ -237,7 +229,9 @@ impl ComputeControlPlane {
|
||||
});
|
||||
|
||||
if let Some((key, _)) = duplicates.next() {
|
||||
bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
|
||||
bail!(
|
||||
"attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported."
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -584,14 +578,17 @@ impl Endpoint {
|
||||
Ok(safekeeper_connstrings)
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
pageservers: Vec<(Host, u16)>,
|
||||
remote_ext_config: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
start_timeout: Duration,
|
||||
) -> Result<()> {
|
||||
if self.status() == EndpointStatus::Running {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
@@ -663,6 +660,7 @@ impl Endpoint {
|
||||
timeline_id: Some(self.timeline_id),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
@@ -671,6 +669,7 @@ impl Endpoint {
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
@@ -778,17 +777,18 @@ impl Endpoint {
|
||||
std::fs::write(pidfile_path, pid.to_string())?;
|
||||
|
||||
// Wait for it to start
|
||||
let mut attempt = 0;
|
||||
const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
|
||||
const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min
|
||||
let start_at = Instant::now();
|
||||
loop {
|
||||
attempt += 1;
|
||||
match self.get_status().await {
|
||||
Ok(state) => {
|
||||
match state.status {
|
||||
ComputeStatus::Init => {
|
||||
if attempt == MAX_ATTEMPTS {
|
||||
bail!("compute startup timed out; still in Init state");
|
||||
if Instant::now().duration_since(start_at) > start_timeout {
|
||||
bail!(
|
||||
"compute startup timed out {:?}; still in Init state",
|
||||
start_timeout
|
||||
);
|
||||
}
|
||||
// keep retrying
|
||||
}
|
||||
@@ -815,8 +815,11 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if attempt == MAX_ATTEMPTS {
|
||||
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
|
||||
if Instant::now().duration_since(start_at) > start_timeout {
|
||||
return Err(e).context(format!(
|
||||
"timed out {:?} waiting to connect to compute_ctl HTTP",
|
||||
start_timeout,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,28 +3,22 @@
|
||||
//! Now it also provides init method which acts like a stub for proper installation
|
||||
//! script which will use local paths.
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use std::collections::HashMap;
|
||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use std::time::Duration;
|
||||
use std::{env, fs};
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
use clap::ValueEnum;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::net::IpAddr;
|
||||
use std::net::Ipv4Addr;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use std::time::Duration;
|
||||
use utils::{
|
||||
auth::{encode_from_key_file, Claims},
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
};
|
||||
use utils::auth::{Claims, encode_from_key_file};
|
||||
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use crate::pageserver::PageServerNode;
|
||||
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
|
||||
use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
@@ -87,6 +81,10 @@ pub struct LocalEnv {
|
||||
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
|
||||
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
|
||||
pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||
|
||||
/// Flag to generate SSL certificates for components that need it.
|
||||
/// Also generates root CA certificate that is used to sign all other certificates.
|
||||
pub generate_local_ssl_certs: bool,
|
||||
}
|
||||
|
||||
/// On-disk state stored in `.neon/config`.
|
||||
@@ -108,6 +106,10 @@ pub struct OnDiskConfig {
|
||||
pub control_plane_api: Option<Url>,
|
||||
pub control_plane_compute_hook_api: Option<Url>,
|
||||
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||
// Note: skip serializing because in compat tests old storage controller fails
|
||||
// to load new config file. May be removed after this field is in release branch.
|
||||
#[serde(skip_serializing_if = "std::ops::Not::not")]
|
||||
pub generate_local_ssl_certs: bool,
|
||||
}
|
||||
|
||||
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
|
||||
@@ -135,6 +137,7 @@ pub struct NeonLocalInitConf {
|
||||
pub safekeepers: Vec<SafekeeperConf>,
|
||||
pub control_plane_api: Option<Url>,
|
||||
pub control_plane_compute_hook_api: Option<Option<Url>>,
|
||||
pub generate_local_ssl_certs: bool,
|
||||
}
|
||||
|
||||
/// Broker config for cluster internal communication.
|
||||
@@ -171,6 +174,11 @@ pub struct NeonStorageControllerConf {
|
||||
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub long_reconcile_threshold: Option<Duration>,
|
||||
|
||||
#[serde(default)]
|
||||
pub use_https_pageserver_api: bool,
|
||||
|
||||
pub timelines_onto_safekeepers: bool,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
@@ -194,6 +202,8 @@ impl Default for NeonStorageControllerConf {
|
||||
max_secondary_lag_bytes: None,
|
||||
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
|
||||
long_reconcile_threshold: None,
|
||||
use_https_pageserver_api: false,
|
||||
timelines_onto_safekeepers: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -223,6 +233,7 @@ pub struct PageServerConf {
|
||||
pub id: NodeId,
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub listen_https_addr: Option<String>,
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
pub no_sync: bool,
|
||||
@@ -234,6 +245,7 @@ impl Default for PageServerConf {
|
||||
id: NodeId(0),
|
||||
listen_pg_addr: String::new(),
|
||||
listen_http_addr: String::new(),
|
||||
listen_https_addr: None,
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
no_sync: false,
|
||||
@@ -249,6 +261,7 @@ pub struct NeonLocalInitPageserverConf {
|
||||
pub id: NodeId,
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub listen_https_addr: Option<String>,
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
|
||||
@@ -263,6 +276,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
|
||||
id,
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
@@ -272,6 +286,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
|
||||
id: *id,
|
||||
listen_pg_addr: listen_pg_addr.clone(),
|
||||
listen_http_addr: listen_http_addr.clone(),
|
||||
listen_https_addr: listen_https_addr.clone(),
|
||||
pg_auth_type: *pg_auth_type,
|
||||
http_auth_type: *http_auth_type,
|
||||
no_sync: *no_sync,
|
||||
@@ -416,6 +431,41 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
    pub fn ssl_ca_cert_path(&self) -> Option<PathBuf> {
        if self.generate_local_ssl_certs {
            Some(self.base_data_dir.join("rootCA.crt"))
        } else {
            None
        }
    }

    pub fn ssl_ca_key_path(&self) -> Option<PathBuf> {
        if self.generate_local_ssl_certs {
            Some(self.base_data_dir.join("rootCA.key"))
        } else {
            None
        }
    }

    pub fn generate_ssl_ca_cert(&self) -> anyhow::Result<()> {
        let cert_path = self.ssl_ca_cert_path().unwrap();
        let key_path = self.ssl_ca_key_path().unwrap();
        if !fs::exists(cert_path.as_path())? {
            generate_ssl_ca_cert(cert_path.as_path(), key_path.as_path())?;
        }
        Ok(())
    }

    pub fn generate_ssl_cert(&self, cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
        self.generate_ssl_ca_cert()?;
        generate_ssl_cert(
            cert_path,
            key_path,
            self.ssl_ca_cert_path().unwrap().as_path(),
            self.ssl_ca_key_path().unwrap().as_path(),
        )
    }
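
For context, a component that wants local TLS derives its server certificate from this root CA. The sketch below is illustrative only: the `setup_local_tls` helper and the `datadir` parameter are assumptions, although the `server.crt`/`server.key` file names mirror what the pageserver initialization later in this diff writes.

```rust
// Sketch: generate a per-component server certificate signed by the local
// root CA. `env` is a LocalEnv and `datadir` is the component's data
// directory; both are assumed to exist in the caller's context.
fn setup_local_tls(env: &LocalEnv, datadir: &std::path::Path) -> anyhow::Result<()> {
    if env.generate_local_ssl_certs {
        // Creates rootCA.crt/rootCA.key on first use, then signs a
        // localhost certificate for this component.
        env.generate_ssl_cert(
            datadir.join("server.crt").as_path(),
            datadir.join("server.key").as_path(),
        )?;
    }
    Ok(())
}
```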
|
||||
/// Inspect the base data directory and extract the instance id and instance directory path
|
||||
/// for all storage controller instances
|
||||
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
|
||||
@@ -465,7 +515,9 @@ impl LocalEnv {
|
||||
if old_timeline_id == &timeline_id {
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
|
||||
bail!(
|
||||
"branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}"
|
||||
);
|
||||
}
|
||||
} else {
|
||||
existing_values.push((tenant_id, timeline_id));
|
||||
@@ -523,6 +575,7 @@ impl LocalEnv {
|
||||
control_plane_api,
|
||||
control_plane_compute_hook_api,
|
||||
branch_name_mappings,
|
||||
generate_local_ssl_certs,
|
||||
} = on_disk_config;
|
||||
LocalEnv {
|
||||
base_data_dir: repopath.to_owned(),
|
||||
@@ -537,6 +590,7 @@ impl LocalEnv {
|
||||
control_plane_api: control_plane_api.unwrap(),
|
||||
control_plane_compute_hook_api,
|
||||
branch_name_mappings,
|
||||
generate_local_ssl_certs,
|
||||
}
|
||||
};
|
||||
|
||||
@@ -572,6 +626,7 @@ impl LocalEnv {
|
||||
struct PageserverConfigTomlSubset {
|
||||
listen_pg_addr: String,
|
||||
listen_http_addr: String,
|
||||
listen_https_addr: Option<String>,
|
||||
pg_auth_type: AuthType,
|
||||
http_auth_type: AuthType,
|
||||
#[serde(default)]
|
||||
@@ -596,6 +651,7 @@ impl LocalEnv {
|
||||
let PageserverConfigTomlSubset {
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
@@ -613,6 +669,7 @@ impl LocalEnv {
|
||||
},
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
@@ -640,6 +697,7 @@ impl LocalEnv {
|
||||
control_plane_api: Some(self.control_plane_api.clone()),
|
||||
control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
|
||||
branch_name_mappings: self.branch_name_mappings.clone(),
|
||||
generate_local_ssl_certs: self.generate_local_ssl_certs,
|
||||
},
|
||||
)
|
||||
}
|
||||
@@ -722,6 +780,7 @@ impl LocalEnv {
|
||||
safekeepers,
|
||||
control_plane_api,
|
||||
control_plane_compute_hook_api,
|
||||
generate_local_ssl_certs,
|
||||
} = conf;
|
||||
|
||||
// Find postgres binaries.
|
||||
@@ -770,8 +829,13 @@ impl LocalEnv {
|
||||
control_plane_api: control_plane_api.unwrap(),
|
||||
control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
|
||||
branch_name_mappings: Default::default(),
|
||||
generate_local_ssl_certs,
|
||||
};
|
||||
|
||||
if generate_local_ssl_certs {
|
||||
env.generate_ssl_ca_cert()?;
|
||||
}
|
||||
|
||||
// create endpoints dir
|
||||
fs::create_dir_all(env.endpoints_path())?;
|
||||
|
||||
@@ -855,3 +919,80 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
|
||||
// openssl req -x509 -newkey rsa:2048 -nodes -subj "/CN=Neon Local CA" -days 36500 \
|
||||
// -out rootCA.crt -keyout rootCA.key
|
||||
let keygen_output = Command::new("openssl")
|
||||
.args([
|
||||
"req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
|
||||
])
|
||||
.args(["-subj", "/CN=Neon Local CA"])
|
||||
.args(["-out", cert_path.to_str().unwrap()])
|
||||
.args(["-keyout", key_path.to_str().unwrap()])
|
||||
.output()
|
||||
.context("failed to generate CA certificate")?;
|
||||
if !keygen_output.status.success() {
|
||||
bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn generate_ssl_cert(
|
||||
cert_path: &Path,
|
||||
key_path: &Path,
|
||||
ca_cert_path: &Path,
|
||||
ca_key_path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
// Generate Certificate Signing Request (CSR).
|
||||
let mut csr_path = cert_path.to_path_buf();
|
||||
csr_path.set_extension(".csr");
|
||||
|
||||
// openssl req -new -nodes -newkey rsa:2048 -keyout server.key -out server.csr \
|
||||
// -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
|
||||
let keygen_output = Command::new("openssl")
|
||||
.args(["req", "-new", "-nodes"])
|
||||
.args(["-newkey", "rsa:2048"])
|
||||
.args(["-subj", "/CN=localhost"])
|
||||
.args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
|
||||
.args(["-keyout", key_path.to_str().unwrap()])
|
||||
.args(["-out", csr_path.to_str().unwrap()])
|
||||
.output()
|
||||
.context("failed to generate CSR")?;
|
||||
if !keygen_output.status.success() {
|
||||
bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
// Sign CSR with CA key.
|
||||
//
|
||||
// openssl x509 -req -in server.csr -CA rootCA.crt -CAkey rootCA.key -CAcreateserial \
|
||||
// -out server.crt -days 36500 -copy_extensions copyall
|
||||
let keygen_output = Command::new("openssl")
|
||||
.args(["x509", "-req"])
|
||||
.args(["-in", csr_path.to_str().unwrap()])
|
||||
.args(["-CA", ca_cert_path.to_str().unwrap()])
|
||||
.args(["-CAkey", ca_key_path.to_str().unwrap()])
|
||||
.arg("-CAcreateserial")
|
||||
.args(["-out", cert_path.to_str().unwrap()])
|
||||
.args(["-days", "36500"])
|
||||
.args(["-copy_extensions", "copyall"])
|
||||
.output()
|
||||
.context("failed to sign CSR")?;
|
||||
if !keygen_output.status.success() {
|
||||
bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
// Remove CSR file as it's not needed anymore.
|
||||
fs::remove_file(csr_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::num::NonZeroU64;
|
||||
@@ -15,22 +14,20 @@ use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use anyhow::{Context, bail};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
use postgres_connection::{PgConnectionConfig, parse_host_port};
|
||||
use reqwest::Certificate;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::id::NodeId;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf};
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
use crate::background_process;
|
||||
use crate::local_env::{LocalEnv, NeonLocalInitPageserverConf, PageServerConf};
|
||||
|
||||
/// Directory within .neon which will be used by default for LocalFs remote storage.
|
||||
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
|
||||
@@ -53,12 +50,29 @@ impl PageServerNode {
|
||||
let (host, port) =
|
||||
parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let port = port.unwrap_or(5432);
|
||||
|
||||
let ssl_ca_cert = env.ssl_ca_cert_path().map(|ssl_ca_file| {
|
||||
let buf = std::fs::read(ssl_ca_file).expect("SSL root CA file should exist");
|
||||
Certificate::from_pem(&buf).expect("CA certificate should be valid")
|
||||
});
|
||||
|
||||
let endpoint = if env.storage_controller.use_https_pageserver_api {
|
||||
format!(
|
||||
"https://{}",
|
||||
conf.listen_https_addr.as_ref().expect(
|
||||
"listen https address should be specified if use_https_pageserver_api is on"
|
||||
)
|
||||
)
|
||||
} else {
|
||||
format!("http://{}", conf.listen_http_addr)
|
||||
};
|
||||
|
||||
Self {
|
||||
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
|
||||
conf: conf.clone(),
|
||||
env: env.clone(),
|
||||
http_client: mgmt_api::Client::new(
|
||||
format!("http://{}", conf.listen_http_addr),
|
||||
endpoint,
|
||||
{
|
||||
match conf.http_auth_type {
|
||||
AuthType::Trust => None,
|
||||
@@ -69,7 +83,9 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
.as_deref(),
|
||||
),
|
||||
ssl_ca_cert,
|
||||
)
|
||||
.expect("Client constructs with no errors"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,7 +97,11 @@ impl PageServerNode {
|
||||
&self,
|
||||
conf: NeonLocalInitPageserverConf,
|
||||
) -> anyhow::Result<toml_edit::DocumentMut> {
|
||||
assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
|
||||
assert_eq!(
|
||||
&PageServerConf::from(&conf),
|
||||
&self.conf,
|
||||
"during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"
|
||||
);
|
||||
|
||||
// TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
|
||||
|
||||
@@ -220,6 +240,13 @@ impl PageServerNode {
|
||||
.context("write identity toml")?;
|
||||
drop(identity_toml);
|
||||
|
||||
if self.env.generate_local_ssl_certs {
|
||||
self.env.generate_ssl_cert(
|
||||
datadir.join("server.crt").as_path(),
|
||||
datadir.join("server.key").as_path(),
|
||||
)?;
|
||||
}
|
||||
|
||||
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
|
||||
|
||||
// Write metadata file, used by pageserver on startup to register itself with
|
||||
@@ -230,6 +257,15 @@ impl PageServerNode {
|
||||
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
|
||||
let http_port = http_port.unwrap_or(9898);
|
||||
|
||||
let https_port = match self.conf.listen_https_addr.as_ref() {
|
||||
Some(https_addr) => {
|
||||
let (_https_host, https_port) =
|
||||
parse_host_port(https_addr).expect("Unable to parse listen_https_addr");
|
||||
Some(https_port.unwrap_or(9899))
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||
// in case the pageserver-side structure is edited, and reflects the real life
|
||||
// situation: the metadata is written by some other script.
|
||||
@@ -240,6 +276,7 @@ impl PageServerNode {
|
||||
postgres_port: self.pg_connection_config.port(),
|
||||
http_host: "localhost".to_string(),
|
||||
http_port,
|
||||
https_port,
|
||||
other: HashMap::from([(
|
||||
"availability_zone_id".to_string(),
|
||||
serde_json::json!(az_id),
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
|
||||
///
|
||||
/// Module for parsing postgresql.conf file.
|
||||
///
|
||||
@@ -6,8 +9,6 @@
|
||||
/// funny stuff like include-directives or funny escaping.
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
|
||||
/// In-memory representation of a postgresql.conf file
|
||||
#[derive(Default, Debug)]
|
||||
|
||||
@@ -14,18 +14,15 @@ use std::{io, result};
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use http_utils::error::HttpErrorBody;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
|
||||
use http_utils::error::HttpErrorBody;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{
|
||||
background_process,
|
||||
local_env::{LocalEnv, SafekeeperConf},
|
||||
};
|
||||
use crate::background_process;
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
|
||||
@@ -1,44 +1,36 @@
|
||||
use crate::{
|
||||
background_process,
|
||||
local_env::{LocalEnv, NeonStorageControllerConf},
|
||||
};
|
||||
use std::ffi::OsStr;
|
||||
use std::fs;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
use std::process::ExitStatus;
|
||||
use std::str::FromStr;
|
||||
use std::sync::OnceLock;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hyper0::Uri;
|
||||
use nix::unistd::Pid;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||
TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest,
|
||||
TenantShardMigrateResponse,
|
||||
},
|
||||
models::{
|
||||
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||
TenantCreateResponse, TenantLocateResponse,
|
||||
};
|
||||
use pageserver_api::models::{TimelineCreateRequest, TimelineInfo};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Method;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{
|
||||
ffi::OsStr,
|
||||
fs,
|
||||
net::SocketAddr,
|
||||
path::PathBuf,
|
||||
process::ExitStatus,
|
||||
str::FromStr,
|
||||
sync::OnceLock,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
use serde::de::DeserializeOwned;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
use utils::{
|
||||
auth::{encode_from_key_file, Claims, Scope},
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
use utils::auth::{Claims, Scope, encode_from_key_file};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
use whoami::username;
|
||||
|
||||
use crate::background_process;
|
||||
use crate::local_env::{LocalEnv, NeonStorageControllerConf};
|
||||
|
||||
pub struct StorageController {
|
||||
env: LocalEnv,
|
||||
private_key: Option<Vec<u8>>,
|
||||
@@ -96,7 +88,8 @@ pub struct AttachHookRequest {
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookResponse {
|
||||
pub gen: Option<u32>,
|
||||
#[serde(rename = "gen")]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -541,6 +534,14 @@ impl StorageController {
|
||||
args.push("--start-as-candidate".to_string());
|
||||
}
|
||||
|
||||
if self.config.use_https_pageserver_api {
|
||||
args.push("--use-https-pageserver-api".to_string());
|
||||
}
|
||||
|
||||
if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() {
|
||||
args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
|
||||
}
|
||||
|
||||
if let Some(private_key) = &self.private_key {
|
||||
let claims = Claims::new(None, Scope::PageServerApi);
|
||||
let jwt_token =
|
||||
@@ -583,6 +584,10 @@ impl StorageController {
|
||||
self.env.base_data_dir.display()
|
||||
));
|
||||
|
||||
if self.config.timelines_onto_safekeepers {
|
||||
args.push("--timelines-onto-safekeepers".to_string());
|
||||
}
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&instance_dir,
|
||||
@@ -779,7 +784,7 @@ impl StorageController {
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(response.gen)
|
||||
Ok(response.generation)
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
@@ -829,41 +834,6 @@ impl StorageController {
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn tenant_migrate(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<TenantShardMigrateResponse> {
|
||||
self.dispatch(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
||||
Some(TenantShardMigrateRequest {
|
||||
node_id,
|
||||
migration_config: None,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
|
||||
pub async fn tenant_split(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
new_shard_count: u8,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
) -> anyhow::Result<TenantShardSplitResponse> {
|
||||
self.dispatch(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_id}/shard_split"),
|
||||
Some(TenantShardSplitRequest {
|
||||
new_shard_count,
|
||||
new_stripe_size,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(node_id=%req.node_id))]
|
||||
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
|
||||
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
|
||||
|
||||
@@ -1,34 +1,28 @@
|
||||
use futures::StreamExt;
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
str::FromStr,
|
||||
time::Duration,
|
||||
};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
|
||||
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
|
||||
ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy,
|
||||
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use pageserver_client::mgmt_api::{self};
|
||||
use reqwest::{Method, StatusCode, Url};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
AvailabilityZone, MigrationConfig, NodeAvailabilityWrapper, NodeConfigureRequest,
|
||||
NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse,
|
||||
PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
|
||||
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
|
||||
SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, ShardParameters,
|
||||
TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest,
|
||||
TenantShardSplitResponse,
|
||||
};
|
||||
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
|
||||
use pageserver_client::mgmt_api::{self};
|
||||
use reqwest::{Method, StatusCode, Url};
|
||||
use storage_controller_client::control_api::Client;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum Command {
|
||||
@@ -119,6 +113,15 @@ enum Command {
|
||||
tenant_shard_id: TenantShardId,
|
||||
#[arg(long)]
|
||||
node: NodeId,
|
||||
#[arg(long, default_value_t = true, action = clap::ArgAction::Set)]
|
||||
prewarm: bool,
|
||||
#[arg(long, default_value_t = false, action = clap::ArgAction::Set)]
|
||||
override_scheduler: bool,
|
||||
},
|
||||
/// Watch the location of a tenant shard evolve, e.g. while expecting it to migrate
|
||||
TenantShardWatch {
|
||||
#[arg(long)]
|
||||
tenant_shard_id: TenantShardId,
|
||||
},
|
||||
/// Migrate the secondary location for a tenant shard to a specific pageserver.
|
||||
TenantShardMigrateSecondary {
|
||||
@@ -276,6 +279,10 @@ struct Cli {
|
||||
/// a token with both scopes to use with this tool.
|
||||
jwt: Option<String>,
|
||||
|
||||
#[arg(long)]
|
||||
/// Trusted root CA certificate to use in https APIs.
|
||||
ssl_ca_file: Option<PathBuf>,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Command,
|
||||
}
|
||||
@@ -386,9 +393,17 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
|
||||
|
||||
let ssl_ca_cert = match &cli.ssl_ca_file {
|
||||
Some(ssl_ca_file) => {
|
||||
let buf = tokio::fs::read(ssl_ca_file).await?;
|
||||
Some(reqwest::Certificate::from_pem(&buf)?)
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
let mut trimmed = cli.api.to_string();
|
||||
trimmed.pop();
|
||||
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
|
||||
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref(), ssl_ca_cert)?;
|
||||
|
||||
match cli.command {
|
||||
Command::NodeRegister {
|
||||
@@ -626,19 +641,43 @@ async fn main() -> anyhow::Result<()> {
|
||||
Command::TenantShardMigrate {
|
||||
tenant_shard_id,
|
||||
node,
|
||||
prewarm,
|
||||
override_scheduler,
|
||||
} => {
|
||||
let req = TenantShardMigrateRequest {
|
||||
node_id: node,
|
||||
migration_config: None,
|
||||
let migration_config = MigrationConfig {
|
||||
prewarm,
|
||||
override_scheduler,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
storcon_client
|
||||
let req = TenantShardMigrateRequest {
|
||||
node_id: node,
|
||||
origin_node_id: None,
|
||||
migration_config,
|
||||
};
|
||||
|
||||
match storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
{
|
||||
Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) => {
|
||||
anyhow::bail!(
|
||||
"Migration to {node} rejected, may require `--force` ({}) ",
|
||||
msg
|
||||
);
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
Ok(_) => {}
|
||||
}
|
||||
|
||||
watch_tenant_shard(storcon_client, tenant_shard_id, Some(node)).await?;
|
||||
}
|
||||
Command::TenantShardWatch { tenant_shard_id } => {
|
||||
watch_tenant_shard(storcon_client, tenant_shard_id, None).await?;
|
||||
}
|
||||
Command::TenantShardMigrateSecondary {
|
||||
tenant_shard_id,
|
||||
@@ -646,7 +685,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
} => {
|
||||
let req = TenantShardMigrateRequest {
|
||||
node_id: node,
|
||||
migration_config: None,
|
||||
origin_node_id: None,
|
||||
migration_config: MigrationConfig::default(),
|
||||
};
|
||||
|
||||
storcon_client
|
||||
@@ -921,7 +961,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
Command::TenantDrop { tenant_id, unclean } => {
|
||||
if !unclean {
|
||||
anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed.")
|
||||
anyhow::bail!(
|
||||
"This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed."
|
||||
)
|
||||
}
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
@@ -933,7 +975,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
Command::NodeDrop { node_id, unclean } => {
|
||||
if !unclean {
|
||||
anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed.")
|
||||
anyhow::bail!(
|
||||
"This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed."
|
||||
)
|
||||
}
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
|
||||
@@ -1108,7 +1152,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
|
||||
Some(TenantShardMigrateRequest {
|
||||
node_id: mv.to,
|
||||
migration_config: None,
|
||||
origin_node_id: Some(mv.from),
|
||||
migration_config: MigrationConfig::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -1287,3 +1332,68 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
static WATCH_INTERVAL: Duration = Duration::from_secs(5);
|
||||
|
||||
async fn watch_tenant_shard(
|
||||
storcon_client: Client,
|
||||
tenant_shard_id: TenantShardId,
|
||||
until_migrated_to: Option<NodeId>,
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(until_migrated_to) = until_migrated_to {
|
||||
println!(
|
||||
"Waiting for tenant shard {} to be migrated to node {}",
|
||||
tenant_shard_id, until_migrated_to
|
||||
);
|
||||
}
|
||||
|
||||
loop {
|
||||
let desc = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{}", tenant_shard_id.tenant_id),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Output the current state of the tenant shard
|
||||
let shard = desc
|
||||
.shards
|
||||
.iter()
|
||||
.find(|s| s.tenant_shard_id == tenant_shard_id)
|
||||
.ok_or(anyhow::anyhow!("Tenant shard not found"))?;
|
||||
let summary = format!(
|
||||
"attached: {} secondary: {} {}",
|
||||
shard
|
||||
.node_attached
|
||||
.map(|n| format!("{}", n))
|
||||
.unwrap_or("none".to_string()),
|
||||
shard
|
||||
.node_secondary
|
||||
.iter()
|
||||
.map(|n| n.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(","),
|
||||
if shard.is_reconciling {
|
||||
"(reconciler active)"
|
||||
} else {
|
||||
"(reconciler idle)"
|
||||
}
|
||||
);
|
||||
println!("{}", summary);
|
||||
|
||||
// Maybe drop out if we finished migration
|
||||
if let Some(until_migrated_to) = until_migrated_to {
|
||||
if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling {
|
||||
println!(
|
||||
"Tenant shard {} is now on node {}",
|
||||
tenant_shard_id, until_migrated_to
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
tokio::time::sleep(WATCH_INTERVAL).await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -27,6 +27,10 @@ yanked = "warn"
|
||||
id = "RUSTSEC-2023-0071"
|
||||
reason = "the marvin attack only affects private key decryption, not public key signature verification"
|
||||
|
||||
[[advisories.ignore]]
|
||||
id = "RUSTSEC-2024-0436"
|
||||
reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact."
|
||||
|
||||
# This section is considered when running `cargo deny check licenses`
|
||||
# More documentation for the licenses section can be found here:
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
|
||||
|
||||
@@ -186,7 +186,7 @@ services:
|
||||
|
||||
neon-test-extensions:
|
||||
profiles: ["test-extensions"]
|
||||
image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
|
||||
image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
|
||||
environment:
|
||||
- PGPASSWORD=cloud_admin
|
||||
entrypoint:
|
||||
|
||||
@@ -7,7 +7,7 @@ index f255fe6..0a0fa65 100644
|
||||
GENERATED_SCHEDULE_DEPS = $(TB_DIR)/all_tests $(TB_DIR)/exclude_tests
|
||||
REGRESS = --schedule $(TB_DIR)/run.sch # Set this again just to be safe
|
||||
-REGRESS_OPTS = --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF)
|
||||
+REGRESS_OPTS = --use-existing --dbname=pgtap_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF)
|
||||
+REGRESS_OPTS = --use-existing --dbname=contrib_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF)
|
||||
SETUP_SCH = test/schedule/main.sch # schedule to use for test setup; this can be forcibly changed by some targets!
|
||||
IGNORE_TESTS = $(notdir $(EXCLUDE_TEST_FILES:.sql=))
|
||||
PARALLEL_TESTS = $(filter-out $(IGNORE_TESTS),$(filter-out $(SERIAL_TESTS),$(ALL_TESTS)))
|
||||
|
||||
@@ -6,12 +6,16 @@ generate_id() {
|
||||
local -n resvar=$1
|
||||
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
|
||||
}
|
||||
if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEWTAG}" ]; then
|
||||
echo OLDTAG and NEWTAG must be defined
|
||||
echo "${OLD_COMPUTE_TAG}"
|
||||
echo "${NEW_COMPUTE_TAG}"
|
||||
echo "${TEST_EXTENSIONS_TAG}"
|
||||
if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then
|
||||
echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set
|
||||
exit 1
|
||||
fi
|
||||
export PG_VERSION=${PG_VERSION:-16}
|
||||
export PG_TEST_VERSION=${PG_VERSION}
|
||||
# Waits for compute node is ready
|
||||
function wait_for_ready {
|
||||
TIME=0
|
||||
while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do
|
||||
@@ -23,11 +27,45 @@ function wait_for_ready {
|
||||
exit 2
|
||||
fi
|
||||
}
|
||||
# Creates extensions. Gets a string with space-separated extensions as a parameter
|
||||
function create_extensions() {
|
||||
for ext in ${1}; do
|
||||
docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE"
|
||||
done
|
||||
}
|
||||
# Creates a new timeline. Gets the parent ID and an extension name as parameters.
|
||||
# Saves the timeline ID in the variable EXT_TIMELINE
|
||||
function create_timeline() {
|
||||
generate_id new_timeline_id
|
||||
|
||||
PARAMS=(
|
||||
-sbf
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${1}\"}"
|
||||
"http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
EXT_TIMELINE[${2}]=${new_timeline_id}
|
||||
}
|
||||
# Checks if the timeline ID of the compute node is expected. Gets the timeline ID as a parameter
|
||||
function check_timeline() {
|
||||
TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
|
||||
if [ "${TID}" != "${1}" ]; then
|
||||
echo Timeline mismatch
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
# Restarts the compute node with the required compute tag and timeline.
|
||||
# Accepts the tag for the compute node and the timeline as parameters.
|
||||
function restart_compute() {
|
||||
docker compose down compute compute_is_ready
|
||||
COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready
|
||||
wait_for_ready
|
||||
check_timeline ${2}
|
||||
}
|
||||
declare -A EXT_TIMELINE
|
||||
EXTENSIONS='[
|
||||
{"extname": "plv8", "extdir": "plv8-src"},
|
||||
{"extname": "vector", "extdir": "pgvector-src"},
|
||||
@@ -47,7 +85,7 @@ EXTENSIONS='[
|
||||
{"extname": "pg_repack", "extdir": "pg_repack-src"}
|
||||
]'
|
||||
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
|
||||
TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d
|
||||
COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
|
||||
wait_for_ready
|
||||
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
|
||||
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
|
||||
@@ -55,12 +93,14 @@ create_extensions "${EXTNAMES}"
|
||||
query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')"
|
||||
new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
|
||||
docker compose --profile test-extensions down
|
||||
TAG=${OLDTAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
|
||||
COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
|
||||
wait_for_ready
|
||||
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
|
||||
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
|
||||
docker compose exec neon-test-extensions psql -c "CREATE DATABASE pgtap_regression"
|
||||
docker compose exec neon-test-extensions psql -d pgtap_regression -c "CREATE EXTENSION pgtap"
|
||||
tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id")
|
||||
EXT_TIMELINE["main"]=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
|
||||
create_timeline "${EXT_TIMELINE["main"]}" init
|
||||
restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE["init"]}"
|
||||
create_extensions "${EXTNAMES}"
|
||||
if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then
|
||||
exts="${EXTNAMES}"
|
||||
@@ -71,29 +111,13 @@ fi
|
||||
if [ -z "${exts}" ]; then
|
||||
echo "No extensions were upgraded"
|
||||
else
|
||||
tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id")
|
||||
timeline_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
|
||||
for ext in ${exts}; do
|
||||
echo Testing ${ext}...
|
||||
create_timeline "${EXT_TIMELINE["main"]}" ${ext}
|
||||
EXTDIR=$(echo ${EXTENSIONS} | jq -r '.[] | select(.extname=="'${ext}'") | .extdir')
|
||||
generate_id new_timeline_id
|
||||
PARAMS=(
|
||||
-sbf
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${timeline_id}\"}"
|
||||
"http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} TAG=${OLDTAG} docker compose down compute compute_is_ready
|
||||
COMPUTE_TAG=${NEWTAG} TAG=${OLDTAG} TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} docker compose up --quiet-pull -d --build compute compute_is_ready
|
||||
wait_for_ready
|
||||
TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
|
||||
if [ ${TID} != ${new_timeline_id} ]; then
|
||||
echo Timeline mismatch
|
||||
exit 1
|
||||
fi
|
||||
restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}"
|
||||
docker compose exec neon-test-extensions psql -d contrib_regression -c "CREATE EXTENSION ${ext} CASCADE"
|
||||
restart_compute "${NEW_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}"
|
||||
docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
|
||||
if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then
|
||||
docker compose exec neon-test-extensions cat /ext-src/${EXTDIR}/regression.diffs
|
||||
|
||||
201
docs/rfcs/041-rel-sparse-keyspace.md
Normal file
@@ -0,0 +1,201 @@

# Sparse Keyspace for Relation Directories

## Summary

This is an RFC describing a new storage strategy for relation directories.

## Motivation

Postgres maintains a directory structure for databases and relations. In Neon, we store this information
by serializing the directory data into a single key (see `pgdatadir_mapping.rs`).

```rust
// DbDir:
// 00 00000000 00000000 00000000 00 00000000

// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
```

We have a dedicated structure on the ingestion path to serialize the relation directory into this single key.

```rust
#[derive(Debug, Serialize, Deserialize, Default)]
pub(crate) struct RelDirectory {
    // Set of relations that exist. (relfilenode, forknum)
    //
    // TODO: Store it as a btree or radix tree or something else that spans multiple
    // key-value pairs, if you have a lot of relations
    pub(crate) rels: HashSet<(Oid, u8)>,
}
```

The current codebase has the following three access patterns for the relation directory:

1. Check if a relation exists.
2. List all relations.
3. Create/drop a relation.

For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the
hash set. For (2), we get the reldir key and return the hash set. For (3), we first need to get
and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back.

If we have 100k relations in a database, we have a 100k-entry hash set, and every relation created or dropped
deserializes and re-serializes that entire hash set, which makes relation create/drop quadratic overall. When we
check whether a relation exists on the ingestion path, we have to deserialize this huge 100k-entry key just to
look up a single relation.

In this RFC, we propose a new way to store the reldir data in the sparse keyspace, and describe how
to migrate users to the new keyspace seamlessly.

The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316).

## Key Mapping

We will use the recently introduced sparse keyspace to store the actual data. The sparse keyspace was proposed in
[038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>`
for each database (identified by `spcnode, dbnode`). We encode the `Oid` (`relnode`, `forknum`)
into the key:

```plain
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists
```

Assuming all reldir data is stored in this new keyspace, the 3 reldir operations mentioned before can be
implemented as follows.

1. Check if a relation exists: check whether the key maps to "exists".
2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and extract relnode and forknum from each key.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will
   be removed during image layer generation upon compaction.

Note that "exists" and "deleted" will be encoded as a single byte, as two variants of an enum.
The mapping is implemented as `rel_tag_sparse_key` in the PoC patch.
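
To make the mapping concrete, here is a minimal sketch of such an encoding. It is illustrative only: the
prefix constant, the field widths, and the type names are assumptions for this sketch, not the actual
`rel_tag_sparse_key` implementation from the PoC patch.

```rust
// Assumed placeholder prefix byte; the real constant lives in the key layout.
const REL_DIR_KEY_PREFIX: u8 = 0x01;

#[derive(Clone, Copy)]
pub struct RelTag {
    pub spcnode: u32,
    pub dbnode: u32,
    pub relnode: u32,
    pub forknum: u8,
}

/// Encode one relation into its own sparse key:
/// (prefix, spcnode, dbnode, relnode, forknum, 1).
pub fn rel_tag_sparse_key(tag: RelTag) -> Vec<u8> {
    let mut key = Vec::with_capacity(1 + 4 + 4 + 4 + 1 + 1);
    key.push(REL_DIR_KEY_PREFIX);
    key.extend_from_slice(&tag.spcnode.to_be_bytes());
    key.extend_from_slice(&tag.dbnode.to_be_bytes());
    key.extend_from_slice(&tag.relnode.to_be_bytes());
    key.push(tag.forknum);
    key.push(1); // constant suffix, mirroring the layout above
    key
}

/// The single-byte value stored at that key.
#[repr(u8)]
pub enum RelDirExists {
    Exists = 1,
    Removed = 0,
}
```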

## Changes to Sparse Keyspace

Previously, we only used sparse keyspaces for the aux files, which did not carry over when branching. The reldir
information needs to be preserved from the parent branch to the child branch. Therefore, the read path needs
to be updated accordingly to accommodate such "inherited sparse keys". This is done in
[PR#10313](https://github.com/neondatabase/neon/pull/10313).

## Coexistence of the Old and New Keyspaces

Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the
ingestion path starts writing to the new keyspace while the old reldir data is kept in the old one. The read
path needs to combine the data from both keyspaces.

Theoretically, we could do a rewrite at startup time that scans all relation directories and copies the data into the
new keyspace. However, this could take a long time, especially if thousands of tenants go through the migration
simultaneously after a pageserver restart. Therefore, we propose the coexistence strategy, so that the migration
happens seamlessly and imposes no downtime on the user.

With the coexistence assumption, the 3 reldir operations will be implemented as follows (see the read-path sketch after this list):

1. Check if a relation exists.
   - Check whether the key maps to any value in the new keyspace. If it maps to "exists" or "deleted", directly
     return the result to the user.
   - Otherwise, deserialize the old reldir key and get the result from there.
2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key.
   Combine them to obtain the final result.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation in the new keyspace.
   - We assume relations are never overwritten (i.e., the user won't create a relation at the same Oid). This will be enforced with a runtime check.
   - For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace.
   - For relation drop, we first check whether the relation is recorded in the old keyspace. If yes, we deserialize the old reldir key,
     remove the relation, and write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` into the keyspace.
   - The delete tombstone will be removed during image layer generation upon compaction.
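
As a concrete illustration of the fallback in operation (1), here is a minimal sketch of the coexistence read
path. The trait and method names (`ReldirReader`, `sparse_get`, `old_reldir`) are assumptions made for the
sketch; the real code goes through the timeline get/scan machinery in `pgdatadir_mapping.rs`.

```rust
use std::collections::HashSet;

type Oid = u32;

/// Old representation: one serialized hash set per database.
pub struct RelDirectory {
    pub rels: HashSet<(Oid, u8)>, // (relnode, forknum)
}

/// New representation: one single-byte value per relation.
pub enum RelDirExists {
    Exists,
    Removed,
}

/// Assumed interface over the timeline read path.
pub trait ReldirReader {
    fn sparse_get(&self, spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Option<RelDirExists>;
    fn old_reldir(&self, spcnode: Oid, dbnode: Oid) -> RelDirectory;
}

/// Existence check under coexistence: consult v2 first, fall back to v1.
pub fn rel_exists(r: &impl ReldirReader, spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> bool {
    match r.sparse_get(spcnode, dbnode, relnode, forknum) {
        Some(RelDirExists::Exists) => true,
        Some(RelDirExists::Removed) => false,
        // Not present in the new keyspace: if the relation exists at all,
        // it is still recorded only in the old serialized hash set.
        None => r.old_reldir(spcnode, dbnode).rels.contains(&(relnode, forknum)),
    }
}
```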

This process ensures that the transition introduces no downtime and that all new updates are written to the new keyspace. The total
amount of data in storage is `O(relation_modifications)`, and we can guarantee `O(current_relations)` after compaction.
Some relations may stay in the old reldir key for a long time; refer to the "Full Migration" section on how to deal
with them. Also, relation modifications keep `O(old_relations)` complexity until we do the full migration, which gives
us `O(1)` complexity once the sparse keyspace is fully opted into.

The process also implies that a relation only exists either in the old reldir key or in the new sparse keyspace. It is not
possible for a table to be recorded in the old reldir key and later have a delete tombstone for it in the sparse keyspace at any LSN.

We will introduce a config item and an index_part record to track the current status of the migration process.

- Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace.
- `index_part.json` field `reldir_v2_status`: records whether the timeline has written any key into the new reldir keyspace.

If `enable_reldir_v2` is set to `true` and the timeline ingests its first key into the new reldir keyspace, it will update
`index_part.json` to set `reldir_v2_status` to `Status::Migrating`. Even if `enable_reldir_v2` gets flipped back to
`false` (e.g., when the pageserver restarts and the config isn't persisted), the read/write path will still
use the new keyspace to avoid data inconsistency. This also means the migration is one-way:
once v2 is enabled, the user cannot go back to v1.
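
A minimal sketch of how the persisted status and the one-way write decision could look follows. The enum name,
the `Legacy` variant, and the serde details are assumptions for the sketch, not the actual `index_part.json`
schema.

```rust
use serde::{Deserialize, Serialize};

/// Persisted per-timeline migration status (sketch).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum RelDirV2Status {
    /// No key has ever been written to the new keyspace.
    #[default]
    Legacy,
    /// At least one key was written to the new keyspace: reads must consult
    /// both keyspaces, and writes keep going to v2 even if the config flag
    /// is turned off again.
    Migrating,
    /// Full migration finished; the old reldir key can be ignored.
    Migrated,
}

/// Writes go to v2 if the config enables it *or* the timeline has already
/// started migrating (the switch is one-way).
pub fn write_to_v2(enable_reldir_v2: bool, status: RelDirV2Status) -> bool {
    enable_reldir_v2 || status != RelDirV2Status::Legacy
}
```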

## Next Steps

### Full Migration

This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and
v2 in the system forces us to keep the code that deserializes the old reldir key forever. To entirely deprecate that
code path, we must ensure the timeline has no old reldir data.

We can trigger a special image layer generation process at the gc-horizon. The generated image layers will cover several keyspaces:
the old reldir key in each of the databases, and the new reldir sparse keyspace. It removes the old reldir keys while
copying their contents into the corresponding keys of the sparse keyspace in the resulting image. This special process happens in
the background during compaction. For example, assume this special process is triggered at LSN 0/180. The `create_image_layers`
process discovers the following keys at this LSN:

```plain
db1/reldir_key -> (table 1, table 2, table 3)
...db1 rel keys
db2/reldir_key -> (table 4, table 5, table 6)
...db2 rel keys
sparse_reldir_db2_table7 -> exists
sparse_reldir_db1_table8 -> deleted
```

It will generate the following keys:

```plain
db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`.
...db1 rel keys
db2/reldir_key -> ()
...db2 rel keys

-- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180
sparse_reldir_db1_table1 -> exists
sparse_reldir_db1_table2 -> exists
sparse_reldir_db1_table3 -> exists
sparse_reldir_db2_table4 -> exists
sparse_reldir_db2_table5 -> exists
sparse_reldir_db2_table6 -> exists
sparse_reldir_db2_table7 -> exists
-- end image layer for the sparse keyspace at sparse_reldir_prefix+1

# The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace.
# Note that the read path stops reading once a key is not found in the image layer covering the key range, so there
# is no correctness issue.
```

We must verify that no pending modifications to the old reldir exist in the delta/image layers above the gc-horizon before
we start this process (we can do a vectored read to get the full key history of the old reldir key and ensure there are no more
images above the gc-horizon). Otherwise, it would violate the property that "a relation only exists either in the old reldir key or
in the new sparse keyspace". After we run this migration process, we can set `reldir_v2_status` in `index_part.json` to
`Status::Migrated`, and the read path no longer needs to read from the old reldir. Once the status is set to `Migrated`, we
don't need to add the old reldir keys to `collect_keyspace`, and therefore all of them will be removed from all future image layers.

The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code.

### Consolidate Relation Size Keys

We store the relation size at the end of each relation's keyspace:

```plain
// RelSize:
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
```

This means that computing the logical size requires several single-key gets across the keyspace,
potentially requiring many layer file downloads. We could consolidate them into a single
keyspace, improving logical size calculation performance.

### Migrate DBDir Keys

We assume the number of databases created by users will be small, and therefore the current way
of storing the database directory is acceptable. In the future, we could also migrate the DBDir keys into
the sparse keyspace to support a large number of databases.

@@ -1,7 +1,7 @@
[package]
name = "compute_api"
version = "0.1.0"
edition.workspace = true
edition = "2024"
license.workspace = true

[dependencies]