Compare commits

..

1 Commits

Author SHA1 Message Date
Heikki Linnakangas
a9edb99749 Use neonvm-daemon's control socket for swap resizing and setting disk quota
ATTENTION: This requires a new-enough neonvm-runner image, which
contains the new interface in neonvm-daemon. The deployment plan is to
update neonvm-runner everywhere first, and only then merge this PR.
2025-02-22 23:58:14 +02:00
571 changed files with 9283 additions and 14915 deletions

View File

@@ -14,7 +14,6 @@
!compute/
!compute_tools/
!control_plane/
!docker-compose/ext-src
!libs/
!pageserver/
!pgxn/

View File

@@ -32,4 +32,3 @@ config-variables:
- NEON_DEV_AWS_ACCOUNT_ID
- NEON_PROD_AWS_ACCOUNT_ID
- AWS_ECR_REGION
- BENCHMARK_LARGE_OLTP_PROJECTID

View File

@@ -38,11 +38,9 @@ runs:
#
- name: Set variables
shell: bash -euxo pipefail {0}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
BUCKET: neon-github-public-dev
run: |
if [ -n "${PR_NUMBER}" ]; then
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${PR_NUMBER}" != "null" ]; then
BRANCH_OR_PR=pr-${PR_NUMBER}
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \
[ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then
@@ -61,6 +59,8 @@ runs:
echo "LOCK_FILE=${LOCK_FILE}" >> $GITHUB_ENV
echo "WORKDIR=${WORKDIR}" >> $GITHUB_ENV
echo "BUCKET=${BUCKET}" >> $GITHUB_ENV
env:
BUCKET: neon-github-public-dev
# TODO: We can replace with a special docker image with Java and Allure pre-installed
- uses: actions/setup-java@v4
@@ -80,8 +80,8 @@ runs:
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.32.2
ALLURE_ZIP_SHA256: 3f28885e2118f6317c92f667eaddcc6491400af1fb9773c1f3797a5fa5174953
ALLURE_VERSION: 2.27.0
ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777
- uses: aws-actions/configure-aws-credentials@v4
if: ${{ !cancelled() }}

View File

@@ -18,11 +18,9 @@ runs:
steps:
- name: Set variables
shell: bash -euxo pipefail {0}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
REPORT_DIR: ${{ inputs.report-dir }}
run: |
if [ -n "${PR_NUMBER}" ]; then
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${PR_NUMBER}" != "null" ]; then
BRANCH_OR_PR=pr-${PR_NUMBER}
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \
[ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then
@@ -34,6 +32,8 @@ runs:
echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV
echo "REPORT_DIR=${REPORT_DIR}" >> $GITHUB_ENV
env:
REPORT_DIR: ${{ inputs.report-dir }}
- uses: aws-actions/configure-aws-credentials@v4
if: ${{ !cancelled() }}

View File

@@ -84,13 +84,7 @@ runs:
--header "Authorization: Bearer ${API_KEY}"
)
role_name=$(echo "$roles" | jq --raw-output '
(.roles | map(select(.protected == false))) as $roles |
if any($roles[]; .name == "neondb_owner")
then "neondb_owner"
else $roles[0].name
end
')
role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
@@ -113,13 +107,13 @@ runs:
)
if [ -z "${reset_password}" ]; then
sleep $i
sleep 1
continue
fi
password=$(echo $reset_password | jq --raw-output '.role.password')
if [ "${password}" == "null" ]; then
sleep $i # increasing backoff
sleep 1
continue
fi

View File

@@ -44,11 +44,6 @@ inputs:
description: 'Postgres version to use for tests'
required: false
default: 'v16'
sanitizers:
description: 'enabled or disabled'
required: false
default: 'disabled'
type: string
benchmark_durations:
description: 'benchmark durations JSON'
required: false
@@ -64,7 +59,7 @@ runs:
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}
@@ -117,7 +112,6 @@ runs:
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FAILED: ${{ inputs.rerun_failed }}
PG_VERSION: ${{ inputs.pg_version }}
SANITIZERS: ${{ inputs.sanitizers }}
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
@@ -242,5 +236,5 @@ runs:
uses: ./.github/actions/allure-report-store
with:
report-dir: /tmp/test_output/allure/results
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}-${{ runner.arch }}
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}

View File

@@ -1,25 +0,0 @@
# Expects response from https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#list-releases as input,
# with tag names `release` for storage, `release-compute` for compute and `release-proxy` for proxy releases.
# Extract only the `tag_name` field from each release object
[ .[].tag_name ]
# Transform each tag name into a structured object using regex capture
| reduce map(
capture("^(?<full>release(-(?<component>proxy|compute))?-(?<version>\\d+))$")
| {
component: (.component // "storage"), # Default to "storage" if no component is specified
version: (.version | tonumber), # Convert the version number to an integer
full: .full # Store the full tag name for final output
}
)[] as $entry # Loop over the transformed list
# Accumulate the latest (highest-numbered) version for each component
({};
.[$entry.component] |= (if . == null or $entry.version > .version then $entry else . end))
# Convert the resulting object into an array of formatted strings
| to_entries
| map("\(.key)=\(.value.full)")
# Output each string separately
| .[]

View File

@@ -280,7 +280,7 @@ jobs:
- name: Upload Neon artifact
uses: ./.github/actions/upload
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
@@ -337,7 +337,7 @@ jobs:
- name: Pytest regression tests
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
uses: ./.github/actions/run-python-test-set
timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }}
timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 60 || 180 }}
with:
build_type: ${{ inputs.build-type }}
test_selection: regress
@@ -347,7 +347,6 @@ jobs:
real_s3_region: eu-central-1
rerun_failed: true
pg_version: ${{ matrix.pg_version }}
sanitizers: ${{ inputs.sanitizers }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
# `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
# Attempt to stop tests gracefully to generate test reports
@@ -360,6 +359,7 @@ jobs:
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
SANITIZERS: ${{ inputs.sanitizers }}
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540

View File

@@ -1,103 +0,0 @@
name: Generate run metadata
on:
workflow_call:
inputs:
github-event-name:
type: string
required: true
outputs:
build-tag:
description: "Tag for the current workflow run"
value: ${{ jobs.tags.outputs.build-tag }}
previous-storage-release:
description: "Tag of the last storage release"
value: ${{ jobs.tags.outputs.storage }}
previous-proxy-release:
description: "Tag of the last proxy release"
value: ${{ jobs.tags.outputs.proxy }}
previous-compute-release:
description: "Tag of the last compute release"
value: ${{ jobs.tags.outputs.compute }}
run-kind:
description: "The kind of run we're currently in. Will be one of `pr`, `push-main`, `storage-rc`, `storage-release`, `proxy-rc`, `proxy-release`, `compute-rc`, `compute-release` or `merge_queue`"
value: ${{ jobs.tags.outputs.run-kind }}
permissions: {}
jobs:
tags:
runs-on: ubuntu-22.04
outputs:
build-tag: ${{ steps.build-tag.outputs.tag }}
compute: ${{ steps.previous-releases.outputs.compute }}
proxy: ${{ steps.previous-releases.outputs.proxy }}
storage: ${{ steps.previous-releases.outputs.storage }}
run-kind: ${{ steps.run-kind.outputs.run-kind }}
permissions:
contents: read
steps:
# Need `fetch-depth: 0` to count the number of commits in the branch
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Get run kind
id: run-kind
env:
RUN_KIND: >-
${{
false
|| (inputs.github-event-name == 'push' && github.ref_name == 'main') && 'push-main'
|| (inputs.github-event-name == 'push' && github.ref_name == 'release') && 'storage-release'
|| (inputs.github-event-name == 'push' && github.ref_name == 'release-compute') && 'compute-release'
|| (inputs.github-event-name == 'push' && github.ref_name == 'release-proxy') && 'proxy-release'
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release') && 'storage-rc-pr'
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-compute') && 'compute-rc-pr'
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr'
|| (inputs.github-event-name == 'pull_request') && 'pr'
|| 'unknown'
}}
run: |
echo "run-kind=$RUN_KIND" | tee -a $GITHUB_OUTPUT
- name: Get build tag
id: build-tag
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }}
CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
RUN_KIND: ${{ steps.run-kind.outputs.run-kind }}
run: |
case $RUN_KIND in
push-main)
echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
;;
storage-release)
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
;;
proxy-release)
echo "tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
;;
compute-release)
echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
;;
pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr)
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
;;
*)
echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!"
exit 1
esac
- name: Get the previous release-tags
id: previous-releases
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
gh api --paginate \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/${GITHUB_REPOSITORY}/releases" \
| jq -f .github/scripts/previous-releases.jq -r \
| tee -a "${GITHUB_OUTPUT}"

View File

@@ -11,12 +11,8 @@ on:
description: AWS region to log in to. Required when pushing to ECR.
required: false
type: string
aws-account-id:
description: AWS account ID to log in to for pushing to ECR. Required when pushing to ECR.
required: false
type: string
aws-role-to-assume:
description: AWS role to assume to for pushing to ECR. Required when pushing to ECR.
aws-account-ids:
description: Comma separated AWS account IDs to log in to for pushing to ECR. Required when pushing to ECR.
required: false
type: string
azure-client-id:
@@ -35,6 +31,16 @@ on:
description: ACR registry name. Required when pushing to ACR.
required: false
type: string
secrets:
docker-hub-username:
description: Docker Hub username. Required when pushing to Docker Hub.
required: false
docker-hub-password:
description: Docker Hub password. Required when pushing to Docker Hub.
required: false
aws-role-to-assume:
description: AWS role to assume. Required when pushing to ECR.
required: false
permissions: {}
@@ -47,11 +53,10 @@ jobs:
runs-on: ubuntu-22.04
permissions:
id-token: write # Required for aws/azure login
packages: write # required for pushing to GHCR
steps:
- uses: actions/checkout@v4
with:
sparse-checkout: .github/scripts/push_with_image_map.py
sparse-checkout: scripts/push_with_image_map.py
sparse-checkout-cone-mode: false
- name: Print image-map
@@ -62,14 +67,14 @@ jobs:
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: "${{ inputs.aws-region }}"
role-to-assume: "arn:aws:iam::${{ inputs.aws-account-id }}:role/${{ inputs.aws-role-to-assume }}"
role-to-assume: "${{ secrets.aws-role-to-assume }}"
role-duration-seconds: 3600
- name: Login to ECR
if: contains(inputs.image-map, 'amazonaws.com/')
uses: aws-actions/amazon-ecr-login@v2
with:
registries: "${{ inputs.aws-account-id }}"
registries: "${{ inputs.aws-account-ids }}"
- name: Configure Azure credentials
if: contains(inputs.image-map, 'azurecr.io/')
@@ -84,21 +89,13 @@ jobs:
run: |
az acr login --name=${{ inputs.acr-registry-name }}
- name: Login to GHCR
if: contains(inputs.image-map, 'ghcr.io/')
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
username: ${{ secrets.docker-hub-username }}
password: ${{ secrets.docker-hub-password }}
- name: Copy docker images to target registries
run: python3 .github/scripts/push_with_image_map.py
run: python scripts/push_with_image_map.py
env:
IMAGE_MAP: ${{ inputs.image-map }}

View File

@@ -140,9 +140,6 @@ jobs:
--ignore test_runner/performance/test_logical_replication.py
--ignore test_runner/performance/test_physical_replication.py
--ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py
--ignore test_runner/performance/test_cumulative_statistics_persistence.py
--ignore test_runner/performance/test_perf_many_relations.py
--ignore test_runner/performance/test_perf_oltp_large_tenant.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -174,61 +171,6 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
cumstats-test:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 17
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-staging"
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Verify that cumulative statistics are preserved
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_cumulative_statistics_persistence.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 3600
pg_version: ${{ env.DEFAULT_PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
replication-tests:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
permissions:
@@ -456,9 +398,6 @@ jobs:
runs-on: ${{ matrix.runner }}
container:
image: ${{ matrix.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
# Increase timeout to 8h, default timeout is 6h

View File

@@ -65,11 +65,38 @@ jobs:
token: ${{ secrets.GITHUB_TOKEN }}
filters: .github/file-filters.yaml
meta:
tag:
needs: [ check-permissions ]
uses: ./.github/workflows/_meta.yml
with:
github-event-name: ${{ github.event_name }}
runs-on: [ self-hosted, small ]
container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
steps:
# Need `fetch-depth: 0` to count the number of commits in the branch
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Get build tag
run: |
echo run:$GITHUB_RUN_ID
echo ref:$GITHUB_REF_NAME
echo rev:$(git rev-list --count HEAD)
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
fi
shell: bash
id: build-tag
build-build-tools-image:
needs: [ check-permissions ]
@@ -172,7 +199,7 @@ jobs:
secrets: inherit
build-and-test-locally:
needs: [ meta, build-build-tools-image ]
needs: [ tag, build-build-tools-image ]
strategy:
fail-fast: false
matrix:
@@ -186,7 +213,7 @@ jobs:
with:
arch: ${{ matrix.arch }}
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
build-tag: ${{ needs.meta.outputs.build-tag }}
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds.
# Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled.
@@ -470,24 +497,13 @@ jobs:
})
trigger-e2e-tests:
# Depends on jobs that can get skipped
if: >-
${{
(
!github.event.pull_request.draft
|| contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft')
|| contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind)
) && !failure() && !cancelled()
}}
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ]
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }}
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ]
uses: ./.github/workflows/trigger-e2e-tests.yml
with:
github-event-name: ${{ github.event_name }}
secrets: inherit
neon-image-arch:
needs: [ check-permissions, build-build-tools-image, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
needs: [ check-permissions, build-build-tools-image, tag ]
strategy:
matrix:
arch: [ x64, arm64 ]
@@ -523,7 +539,7 @@ jobs:
build-args: |
ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }}
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.meta.outputs.build-tag }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm
DEBIAN_VERSION=bookworm
provenance: false
@@ -533,11 +549,10 @@ jobs:
cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }}
tags: |
neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-${{ matrix.arch }}
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }}
neon-image:
needs: [ neon-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
needs: [ neon-image-arch, tag ]
runs-on: ubuntu-22.04
permissions:
id-token: write # aws-actions/configure-aws-credentials
@@ -552,14 +567,13 @@ jobs:
- name: Create multi-arch image
run: |
docker buildx imagetools create -t neondatabase/neon:${{ needs.meta.outputs.build-tag }} \
-t neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm \
neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-x64 \
neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64
docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
-t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64
compute-node-image-arch:
needs: [ check-permissions, build-build-tools-image, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
needs: [ check-permissions, build-build-tools-image, tag ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -617,7 +631,7 @@ jobs:
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
PG_VERSION=${{ matrix.version.pg }}
BUILD_TAG=${{ needs.meta.outputs.build-tag }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
DEBIAN_VERSION=${{ matrix.version.debian }}
provenance: false
@@ -627,7 +641,7 @@ jobs:
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
tags: |
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
- name: Build neon extensions test image
if: matrix.version.pg >= 'v16'
@@ -637,7 +651,7 @@ jobs:
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
PG_VERSION=${{ matrix.version.pg }}
BUILD_TAG=${{ needs.meta.outputs.build-tag }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
DEBIAN_VERSION=${{ matrix.version.debian }}
provenance: false
@@ -647,11 +661,10 @@ jobs:
target: extension-tests
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
tags: |
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.meta.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
compute-node-image:
needs: [ compute-node-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
needs: [ compute-node-image-arch, tag ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -679,28 +692,27 @@ jobs:
- name: Create multi-arch compute-node image
run: |
docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
-t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
- name: Create multi-arch neon-test-extensions image
if: matrix.version.pg >= 'v16'
run: |
docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
-t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
vm-compute-node-image-arch:
needs: [ check-permissions, meta, compute-node-image ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, large ]
strategy:
fail-fast: false
matrix:
arch: [ amd64, arm64 ]
version:
# see the comment for `compute-node-image-arch` job
- pg: v14
debian: bullseye
- pg: v15
@@ -710,14 +722,14 @@ jobs:
- pg: v17
debian: bookworm
env:
VM_BUILDER_VERSION: v0.42.2
VM_BUILDER_VERSION: v0.37.1
steps:
- uses: actions/checkout@v4
- name: Downloading vm-builder
run: |
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
chmod +x vm-builder
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
@@ -730,50 +742,22 @@ jobs:
# it won't have the proper authentication (written at v0.6.0)
- name: Pulling compute-node image
run: |
docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}
docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
- name: Build vm image
run: |
./vm-builder \
-size=2G \
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \
-target-arch=linux/${{ matrix.arch }}
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
- name: Pushing vm-compute-node image
run: |
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }}
vm-compute-node-image:
needs: [ vm-compute-node-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ubuntu-22.04
strategy:
matrix:
version:
# see the comment for `compute-node-image-arch` job
- pg: v14
- pg: v15
- pg: v16
- pg: v17
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch compute-node image
run: |
docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
test-images:
needs: [ check-permissions, meta, neon-image, compute-node-image ]
# Depends on jobs that can get skipped
if: "!failure() && !cancelled()"
needs: [ check-permissions, tag, neon-image, compute-node-image ]
strategy:
fail-fast: false
matrix:
@@ -791,6 +775,17 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Get the last compute release tag
id: get-last-compute-release-tag
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/${{ github.repository }}/releases")
echo tag=${tag} >> ${GITHUB_OUTPUT}
# `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
# Regular pageserver version string looks like
@@ -800,9 +795,8 @@ jobs:
# Ensure that we don't have bad versions.
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
run: |
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
echo "Pageserver version string: $pageserver_version"
@@ -819,24 +813,7 @@ jobs:
- name: Verify docker-compose example and test extensions
timeout-minutes: 20
env:
TAG: >-
${{
contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.previous-storage-release
|| needs.meta.outputs.build-tag
}}
COMPUTE_TAG: >-
${{
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.previous-compute-release
|| needs.meta.outputs.build-tag
}}
TEST_EXTENSIONS_TAG: >-
${{
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
&& 'latest'
|| needs.meta.outputs.build-tag
}}
TAG: ${{needs.tag.outputs.build-tag}}
TEST_VERSION_ONLY: ${{ matrix.pg_version }}
run: ./docker-compose/docker_compose_test.sh
@@ -848,17 +825,10 @@ jobs:
- name: Test extension upgrade
timeout-minutes: 20
if: ${{ contains(fromJSON('["pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
if: ${{ needs.tag.outputs.build-tag == github.run_id }}
env:
TAG: >-
${{
false
|| needs.meta.outputs.run-kind == 'pr' && needs.meta.outputs.build-tag
|| needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release
}}
TEST_EXTENSIONS_TAG: ${{ needs.meta.outputs.previous-compute-release }}
NEW_COMPUTE_TAG: ${{ needs.meta.outputs.build-tag }}
OLD_COMPUTE_TAG: ${{ needs.meta.outputs.previous-compute-release }}
NEWTAG: ${{ needs.tag.outputs.build-tag }}
OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
run: ./docker-compose/test_extensions_upgrade.sh
- name: Print logs and clean up
@@ -868,7 +838,7 @@ jobs:
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down
generate-image-maps:
needs: [ meta ]
needs: [ tag ]
runs-on: ubuntu-22.04
outputs:
neon-dev: ${{ steps.generate.outputs.neon-dev }}
@@ -878,14 +848,14 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
sparse-checkout: .github/scripts/generate_image_maps.py
sparse-checkout: scripts/generate_image_maps.py
sparse-checkout-cone-mode: false
- name: Generate Image Maps
id: generate
run: python3 .github/scripts/generate_image_maps.py
run: python scripts/generate_image_maps.py
env:
BUILD_TAG: "${{ needs.meta.outputs.build-tag }}"
BUILD_TAG: "${{ needs.tag.outputs.build-tag }}"
BRANCH: "${{ github.ref_name }}"
DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}"
PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}"
@@ -894,95 +864,88 @@ jobs:
AWS_REGION: "${{ vars.AWS_ECR_REGION }}"
push-neon-image-dev:
needs: [ meta, generate-image-maps, neon-image ]
if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }}
needs: [ generate-image-maps, neon-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
permissions:
id-token: write # Required for aws/azure login
packages: write # required for pushing to GHCR
with:
image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}'
aws-region: ${{ vars.AWS_ECR_REGION }}
aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
aws-role-to-assume: "gha-oidc-neon-admin"
aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
secrets: inherit
secrets:
aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
push-compute-image-dev:
needs: [ meta, generate-image-maps, vm-compute-node-image ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
needs: [ generate-image-maps, vm-compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
permissions:
id-token: write # Required for aws/azure login
packages: write # required for pushing to GHCR
with:
image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}'
aws-region: ${{ vars.AWS_ECR_REGION }}
aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
aws-role-to-assume: "gha-oidc-neon-admin"
aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
secrets: inherit
secrets:
aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
push-neon-image-prod:
needs: [ meta, generate-image-maps, neon-image, test-images ]
# Depends on jobs that can get skipped
if: ${{ !failure() && !cancelled() && contains(fromJSON('["storage-release", "proxy-release"]'), needs.meta.outputs.run-kind) }}
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
needs: [ generate-image-maps, neon-image, test-images ]
uses: ./.github/workflows/_push-to-container-registry.yml
permissions:
id-token: write # Required for aws/azure login
packages: write # required for pushing to GHCR
with:
image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}'
aws-region: ${{ vars.AWS_ECR_REGION }}
aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}"
aws-role-to-assume: "gha-oidc-neon-admin"
aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}"
azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
secrets: inherit
secrets:
aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
push-compute-image-prod:
needs: [ meta, generate-image-maps, vm-compute-node-image, test-images ]
# Depends on jobs that can get skipped
if: ${{ !failure() && !cancelled() && needs.meta.outputs.run-kind == 'compute-release' }}
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
needs: [ generate-image-maps, vm-compute-node-image, test-images ]
uses: ./.github/workflows/_push-to-container-registry.yml
permissions:
id-token: write # Required for aws/azure login
packages: write # required for pushing to GHCR
with:
image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}'
aws-region: ${{ vars.AWS_ECR_REGION }}
aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}"
aws-role-to-assume: "gha-oidc-neon-admin"
aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}"
azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
secrets: inherit
secrets:
aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
# This is a bit of a special case so we're not using a generated image map.
add-latest-tag-to-neon-extensions-test-image:
if: github.ref_name == 'main'
needs: [ meta, compute-node-image ]
needs: [ tag, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"]
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"]
}
secrets: inherit
secrets:
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
needs: [ check-permissions, tag ]
runs-on: ubuntu-22.04
permissions:
id-token: write # aws-actions/configure-aws-credentials
@@ -1017,7 +980,7 @@ jobs:
\"ci_job_name\": \"build-and-upload-extensions\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"compute_image_tag\": \"${{ needs.meta.outputs.build-tag }}\",
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"remote_branch_name\": \"${{ github.ref_name }}\"
}
}"
@@ -1061,9 +1024,9 @@ jobs:
exit 1
deploy:
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -1074,103 +1037,108 @@ jobs:
- uses: actions/checkout@v4
- name: Create git tag and GitHub release
if: ${{ contains(fromJSON('["storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) }}
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
uses: actions/github-script@v7
env:
TAG: "${{ needs.meta.outputs.build-tag }}"
BRANCH: "${{ github.ref_name }}"
PREVIOUS_RELEASE: >-
${{
false
|| needs.meta.outputs.run-kind == 'storage-release' && needs.meta.outputs.previous-storage-release
|| needs.meta.outputs.run-kind == 'proxy-release' && needs.meta.outputs.previous-proxy-release
|| needs.meta.outputs.run-kind == 'compute-release' && needs.meta.outputs.previous-compute-release
|| 'unknown'
}}
with:
retries: 5
script: |
const { TAG, BRANCH, PREVIOUS_RELEASE } = process.env
const tag = "${{ needs.tag.outputs.build-tag }}";
const branch = "${{ github.ref_name }}";
try {
const existingRef = await github.rest.git.getRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: `tags/${TAG}`,
ref: `tags/${tag}`,
});
if (existingRef.data.object.sha !== context.sha) {
throw new Error(`Tag ${TAG} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`);
throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`);
}
console.log(`Tag ${TAG} already exists and points to ${context.sha} as expected.`);
console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`);
} catch (error) {
if (error.status !== 404) {
throw error;
}
console.log(`Tag ${TAG} does not exist. Creating it...`);
console.log(`Tag ${tag} does not exist. Creating it...`);
await github.rest.git.createRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: `refs/tags/${TAG}`,
ref: `refs/tags/${tag}`,
sha: context.sha,
});
console.log(`Tag ${TAG} created successfully.`);
console.log(`Tag ${tag} created successfully.`);
}
try {
const existingRelease = await github.rest.repos.getReleaseByTag({
owner: context.repo.owner,
repo: context.repo.repo,
tag: TAG,
tag: tag,
});
console.log(`Release for tag ${TAG} already exists (ID: ${existingRelease.data.id}).`);
console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`);
} catch (error) {
if (error.status !== 404) {
throw error;
}
console.log(`Release for tag ${TAG} does not exist. Creating it...`);
console.log(`Release for tag ${tag} does not exist. Creating it...`);
// Find the PR number using the commit SHA
const pullRequests = await github.rest.pulls.list({
owner: context.repo.owner,
repo: context.repo.repo,
state: 'closed',
base: BRANCH,
base: branch,
});
const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha);
const prNumber = pr ? pr.number : null;
// Find the previous release on the branch
const releases = await github.rest.repos.listReleases({
owner: context.repo.owner,
repo: context.repo.repo,
per_page: 100,
});
const branchReleases = releases.data
.filter((release) => {
const regex = new RegExp(`^${branch}-\\d+$`);
return regex.test(release.tag_name) && !release.draft && !release.prerelease;
})
.sort((a, b) => new Date(b.created_at) - new Date(a.created_at));
const previousTag = branchReleases.length > 0 ? branchReleases[0].tag_name : null;
const releaseNotes = [
prNumber
? `Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.`
: 'Release PR not found.',
`Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${PREVIOUS_RELEASE}...${TAG}.`
previousTag
? `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${previousTag}...${tag}.`
: `No previous release found on branch ${branch}.`,
].join('\n\n');
await github.rest.repos.createRelease({
owner: context.repo.owner,
repo: context.repo.repo,
tag_name: TAG,
tag_name: tag,
body: releaseNotes,
});
console.log(`Release for tag ${TAG} created successfully.`);
console.log(`Release for tag ${tag} created successfully.`);
}
- name: Trigger deploy workflow
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
RUN_KIND: ${{ needs.meta.outputs.run-kind }}
run: |
case ${RUN_KIND} in
push-main)
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.meta.outputs.build-tag}} -f deployPreprodRegion=false
;;
storage-release)
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
@@ -1178,7 +1146,7 @@ jobs:
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}} \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
@@ -1186,9 +1154,8 @@ jobs:
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}}
;;
proxy-release)
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
@@ -1196,7 +1163,7 @@ jobs:
-f deployStorageBroker=false \
-f deployStorageController=false \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}} \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
@@ -1206,16 +1173,13 @@ jobs:
-f deployProxyScram=true \
-f deployProxyAuthBroker=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}}
;;
compute-release)
gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.meta.outputs.build-tag}}
;;
*)
echo "RUN_KIND (value '${RUN_KIND}') is not set to either 'push-main', 'storage-release', 'proxy-release' or 'compute-release'"
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}}
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'"
exit 1
;;
esac
fi
notify-storage-release-deploy-failure:
needs: [ deploy ]
@@ -1240,7 +1204,7 @@ jobs:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
# `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
# `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: github.ref_name == 'release' && !failure() && !cancelled()
runs-on: ubuntu-22.04
@@ -1330,8 +1294,7 @@ jobs:
pin-build-tools-image:
needs: [ build-build-tools-image, test-images, build-and-test-locally ]
# `!failure() && !cancelled()` is required because the job (transitively) depends on jobs that can be skipped
if: github.ref_name == 'main' && !failure() && !cancelled()
if: github.ref_name == 'main'
uses: ./.github/workflows/pin-build-tools-image.yml
with:
from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
@@ -1350,7 +1313,6 @@ jobs:
# Format `needs` differently to make the list more readable.
# Usually we do `needs: [...]`
needs:
- meta
- build-and-test-locally
- check-codestyle-python
- check-codestyle-rust
@@ -1374,7 +1336,7 @@ jobs:
|| needs.check-codestyle-python.result == 'skipped'
|| needs.check-codestyle-rust.result == 'skipped'
|| needs.files-changed.result == 'skipped'
|| (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|| (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind))
|| needs.push-compute-image-dev.result == 'skipped'
|| needs.push-neon-image-dev.result == 'skipped'
|| needs.test-images.result == 'skipped'
|| (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|| needs.trigger-custom-extensions-build-and-wait.result == 'skipped'

View File

@@ -38,9 +38,6 @@ jobs:
runs-on: us-east-2
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:

View File

@@ -52,9 +52,8 @@ jobs:
- name: Test extension upgrade
timeout-minutes: 20
env:
NEW_COMPUTE_TAG: latest
OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
NEWTAG: latest
OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
PG_VERSION: ${{ matrix.pg-version }}
FORCE_ALL_UPGRADE_TESTS: true
run: ./docker-compose/test_extensions_upgrade.sh

View File

@@ -1,147 +0,0 @@
name: large oltp benchmark
on:
# uncomment to run on push for debugging your PR
push:
branches: [ bodobolero/synthetic_oltp_workload ]
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
workflow_dispatch: # adds ability to run this manually
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow globally because we need dedicated resources which only exist once
group: large-oltp-bench-workflow
cancel-in-progress: true
jobs:
oltp:
strategy:
fail-fast: false # allow other variants to continue even if one fails
matrix:
include:
- target: new_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
- target: reuse_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h
TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
PG_VERSION: 16 # pre-determined by pre-determined project
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
PLATFORM: ${{ matrix.target }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
# Increase timeout to 8h, default timeout is 6h
timeout-minutes: 480
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials # necessary to download artefacts
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create Neon Branch for large tenant
if: ${{ matrix.target == 'new_branch' }}
id: create-neon-branch-oltp-target
uses: ./.github/actions/neon-branch-create
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Set up Connection String
id: set-up-connstr
run: |
case "${{ matrix.target }}" in
new_branch)
CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
;;
reuse_branch)
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
;;
*)
echo >&2 "Unknown target=${{ matrix.target }}"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- name: Benchmark pgbench with custom-scripts
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
pg_version: ${{ env.PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Delete Neon Branch for large tenant
if: ${{ always() && matrix.target == 'new_branch' }}
uses: ./.github/actions/neon-branch-delete
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
with:
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Periodic large oltp perf testing: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -71,7 +71,7 @@ jobs:
uses: ./.github/workflows/build-macos.yml
with:
pg_versions: ${{ needs.files-changed.outputs.postgres_changes }}
rebuild_rust_code: ${{ fromJson(needs.files-changed.outputs.rebuild_rust_code) }}
rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }}
rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }}
gather-rust-build-stats:

View File

@@ -78,10 +78,8 @@ jobs:
run: |
if [ -z "$INPUT_COMMIT_HASH" ]; then
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
else
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
fi
- name: Start Bench with run_id
@@ -91,7 +89,7 @@ jobs:
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer $API_KEY" \
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
- name: Poll Test Status
id: poll_step

View File

@@ -65,7 +65,6 @@ jobs:
permissions:
id-token: write # Required for aws/azure login
packages: write # required for pushing to GHCR
uses: ./.github/workflows/_push-to-container-registry.yml
with:
@@ -73,15 +72,12 @@ jobs:
{
"docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [
"docker.io/neondatabase/build-tools:pinned-bullseye",
"ghcr.io/neondatabase/build-tools:pinned-bullseye",
"${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye",
"${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye"
],
"docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [
"docker.io/neondatabase/build-tools:pinned-bookworm",
"docker.io/neondatabase/build-tools:pinned",
"ghcr.io/neondatabase/build-tools:pinned-bookworm",
"ghcr.io/neondatabase/build-tools:pinned",
"${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm",
"${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned",
"${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm",
@@ -89,10 +85,12 @@ jobs:
]
}
aws-region: ${{ vars.AWS_ECR_REGION }}
aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
aws-role-to-assume: "gha-oidc-neon-admin"
aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
secrets: inherit
secrets:
aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

View File

@@ -5,10 +5,6 @@ on:
types:
- ready_for_review
workflow_call:
inputs:
github-event-name:
type: string
required: true
defaults:
run:
@@ -23,7 +19,7 @@ jobs:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ inputs.github-event-name || github.event_name }}
github-event-name: ${{ github.event_name }}
cancel-previous-e2e-tests:
needs: [ check-permissions ]
@@ -39,29 +35,46 @@ jobs:
run cancel-previous-in-concurrency-group.yml \
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
meta:
uses: ./.github/workflows/_meta.yml
with:
github-event-name: ${{ inputs.github-event-name || github.event_name }}
tag:
needs: [ check-permissions ]
runs-on: ubuntu-22.04
outputs:
build-tag: ${{ steps.build-tag.outputs.tag }}
steps:
# Need `fetch-depth: 0` to count the number of commits in the branch
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Get build tag
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }}
CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
fi
id: build-tag
trigger-e2e-tests:
needs: [ meta ]
needs: [ tag ]
runs-on: ubuntu-22.04
env:
EVENT_ACTION: ${{ github.event.action }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
TAG: >-
${{
contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.previous-storage-release
|| needs.meta.outputs.build-tag
}}
COMPUTE_TAG: >-
${{
contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
&& needs.meta.outputs.previous-compute-release
|| needs.meta.outputs.build-tag
}}
TAG: ${{ needs.tag.outputs.build-tag }}
steps:
- name: Wait for `push-{neon,compute}-image-dev` job to finish
# It's important to have a timeout here, the script in the step can run infinitely
@@ -144,6 +157,6 @@ jobs:
--raw-field "commit_hash=$COMMIT_SHA" \
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
--raw-field "storage_image_tag=${TAG}" \
--raw-field "compute_image_tag=${COMPUTE_TAG}" \
--raw-field "compute_image_tag=${TAG}" \
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
--raw-field "e2e-platforms=${E2E_PLATFORMS}"

233
Cargo.lock generated
View File

@@ -783,28 +783,6 @@ dependencies = [
"tracing",
]
[[package]]
name = "axum-extra"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
dependencies = [
"axum",
"axum-core",
"bytes",
"futures-util",
"headers",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"mime",
"pin-project-lite",
"serde",
"tower 0.5.2",
"tower-layer",
"tower-service",
]
[[package]]
name = "azure_core"
version = "0.21.0"
@@ -947,9 +925,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
[[package]]
name = "base64"
version = "0.21.7"
version = "0.21.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105"
[[package]]
name = "base64"
@@ -1006,9 +984,9 @@ dependencies = [
[[package]]
name = "bindgen"
version = "0.71.1"
version = "0.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3"
checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
dependencies = [
"bitflags 2.8.0",
"cexpr",
@@ -1019,7 +997,7 @@ dependencies = [
"proc-macro2",
"quote",
"regex",
"rustc-hash 2.1.1",
"rustc-hash",
"shlex",
"syn 2.0.90",
]
@@ -1327,7 +1305,6 @@ dependencies = [
"aws-sdk-s3",
"aws-smithy-types",
"axum",
"axum-extra",
"base64 0.13.1",
"bytes",
"camino",
@@ -1339,7 +1316,8 @@ dependencies = [
"flate2",
"futures",
"http 1.1.0",
"jsonwebtoken",
"hyper 1.4.1",
"hyper-util",
"metrics",
"nix 0.27.1",
"notify",
@@ -1366,9 +1344,7 @@ dependencies = [
"tokio-util",
"tower 0.5.2",
"tower-http",
"tower-otel",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"url",
@@ -1572,17 +1548,6 @@ dependencies = [
"itertools 0.10.5",
]
[[package]]
name = "cron"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5877d3fbf742507b66bc2a1945106bd30dd8504019d596901ddd012a4dd01740"
dependencies = [
"chrono",
"once_cell",
"winnow",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.8"
@@ -2321,7 +2286,7 @@ name = "framed-websockets"
version = "0.1.0"
source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
"bytemuck",
"bytes",
"futures-core",
@@ -2434,9 +2399,9 @@ checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
[[package]]
name = "futures-timer"
version = "3.0.3"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24"
checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
[[package]]
name = "futures-util"
@@ -2539,27 +2504,6 @@ version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "governor"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0"
dependencies = [
"cfg-if",
"dashmap 6.1.0",
"futures-sink",
"futures-timer",
"futures-util",
"no-std-compat",
"nonzero_ext",
"parking_lot 0.12.1",
"portable-atomic",
"quanta",
"rand 0.8.5",
"smallvec",
"spinning_top",
]
[[package]]
name = "group"
version = "0.12.1"
@@ -2677,7 +2621,7 @@ version = "7.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
"byteorder",
"crossbeam-channel",
"flate2",
@@ -2685,30 +2629,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "headers"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9"
dependencies = [
"base64 0.21.7",
"bytes",
"headers-core",
"http 1.1.0",
"httpdate",
"mime",
"sha1",
]
[[package]]
name = "headers-core"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4"
dependencies = [
"http 1.1.0",
]
[[package]]
name = "heck"
version = "0.5.0"
@@ -2846,9 +2766,12 @@ name = "http-utils"
version = "0.1.0"
dependencies = [
"anyhow",
"backtrace",
"bytes",
"fail",
"flate2",
"hyper 0.14.30",
"inferno 0.12.0",
"itertools 0.10.5",
"jemalloc_pprof",
"metrics",
@@ -3347,9 +3270,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "jemalloc_pprof"
version = "0.7.0"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5622af6d21ff86ed7797ef98e11b8f302da25ec69a7db9f6cde8e2e1c8df9992"
checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb"
dependencies = [
"anyhow",
"libc",
@@ -3433,7 +3356,7 @@ version = "9.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
"js-sys",
"pem",
"ring",
@@ -3548,9 +3471,9 @@ dependencies = [
[[package]]
name = "mappings"
version = "0.7.0"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e434981a332777c2b3062652d16a55f8e74fa78e6b1882633f0d77399c84fc2a"
checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e"
dependencies = [
"anyhow",
"libc",
@@ -3603,7 +3526,7 @@ dependencies = [
"measured-derive",
"memchr",
"parking_lot 0.12.1",
"rustc-hash 1.1.0",
"rustc-hash",
"ryu",
]
@@ -3791,12 +3714,6 @@ dependencies = [
"memoffset 0.9.0",
]
[[package]]
name = "no-std-compat"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c"
[[package]]
name = "nom"
version = "7.1.3"
@@ -3807,12 +3724,6 @@ dependencies = [
"minimal-lexical",
]
[[package]]
name = "nonzero_ext"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21"
[[package]]
name = "notify"
version = "8.0.0"
@@ -4263,6 +4174,7 @@ dependencies = [
"pageserver_client",
"pageserver_compaction",
"pin-project-lite",
"postgres",
"postgres-protocol",
"postgres-types",
"postgres_backend",
@@ -4349,6 +4261,7 @@ dependencies = [
"futures",
"http-utils",
"pageserver_api",
"postgres",
"reqwest",
"serde",
"thiserror 1.0.69",
@@ -4385,9 +4298,9 @@ dependencies = [
[[package]]
name = "papaya"
version = "0.2.0"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd"
checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
dependencies = [
"equivalent",
"seize",
@@ -4515,7 +4428,7 @@ version = "3.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
"serde",
]
@@ -4564,18 +4477,18 @@ dependencies = [
[[package]]
name = "pin-project"
version = "1.1.9"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfe2e71e1471fe07709406bf725f710b02927c9c54b2b5b2ec0e8087d97c327d"
checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.9"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67"
checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
dependencies = [
"proc-macro2",
"quote",
@@ -4669,12 +4582,6 @@ dependencies = [
"never-say-never",
]
[[package]]
name = "portable-atomic"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
[[package]]
name = "postgres"
version = "0.19.7"
@@ -4769,6 +4676,7 @@ dependencies = [
"anyhow",
"itertools 0.10.5",
"once_cell",
"postgres",
"tokio-postgres",
"url",
]
@@ -4839,14 +4747,12 @@ dependencies = [
[[package]]
name = "pprof_util"
version = "0.7.0"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fa015c78eed2130951e22c58d2095849391e73817ab2e74f71b0b9f63dd8416"
checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781"
dependencies = [
"anyhow",
"backtrace",
"flate2",
"inferno 0.12.0",
"num",
"paste",
"prost",
@@ -5098,7 +5004,7 @@ dependencies = [
"reqwest-tracing",
"rsa",
"rstest",
"rustc-hash 1.1.0",
"rustc-hash",
"rustls 0.23.18",
"rustls-native-certs 0.8.0",
"rustls-pemfile 2.1.1",
@@ -5138,21 +5044,6 @@ dependencies = [
"zerocopy",
]
[[package]]
name = "quanta"
version = "0.12.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e"
dependencies = [
"crossbeam-utils",
"libc",
"once_cell",
"raw-cpuid",
"wasi 0.11.0+wasi-snapshot-preview1",
"web-sys",
"winapi",
]
[[package]]
name = "quick-xml"
version = "0.26.0"
@@ -5283,15 +5174,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "raw-cpuid"
version = "11.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "rayon"
version = "1.7.0"
@@ -5740,12 +5622,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc-hash"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
[[package]]
name = "rustc_version"
version = "0.4.0"
@@ -5862,7 +5738,7 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
]
[[package]]
@@ -5871,7 +5747,7 @@ version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
"rustls-pki-types",
]
@@ -5942,6 +5818,7 @@ dependencies = [
"once_cell",
"pageserver_api",
"parking_lot 0.12.1",
"postgres",
"postgres-protocol",
"postgres_backend",
"postgres_ffi",
@@ -6110,9 +5987,9 @@ dependencies = [
[[package]]
name = "seize"
version = "0.5.0"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7"
checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
dependencies = [
"libc",
"windows-sys 0.52.0",
@@ -6505,15 +6382,6 @@ version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "spinning_top"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300"
dependencies = [
"lock_api",
]
[[package]]
name = "spki"
version = "0.6.0"
@@ -6584,13 +6452,11 @@ dependencies = [
"chrono",
"clap",
"control_plane",
"cron",
"diesel",
"diesel-async",
"diesel_migrations",
"fail",
"futures",
"governor",
"hex",
"http-utils",
"humantime",
@@ -7405,12 +7271,10 @@ version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
dependencies = [
"base64 0.22.1",
"bitflags 2.8.0",
"bytes",
"http 1.1.0",
"http-body 1.0.0",
"mime",
"pin-project-lite",
"tower-layer",
"tower-service",
@@ -7424,20 +7288,6 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
[[package]]
name = "tower-otel"
version = "0.2.0"
source = "git+https://github.com/mattiapenati/tower-otel?rev=56a7321053bcb72443888257b622ba0d43a11fcd#56a7321053bcb72443888257b622ba0d43a11fcd"
dependencies = [
"http 1.1.0",
"opentelemetry",
"pin-project",
"tower-layer",
"tower-service",
"tracing",
"tracing-opentelemetry",
]
[[package]]
name = "tower-service"
version = "0.3.3"
@@ -7764,6 +7614,7 @@ dependencies = [
"anyhow",
"arc-swap",
"async-compression",
"backtrace",
"bincode",
"byteorder",
"bytes",
@@ -8293,9 +8144,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "winnow"
version = "0.6.26"
version = "0.6.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28"
checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1"
dependencies = [
"memchr",
]
@@ -8317,7 +8168,7 @@ dependencies = [
"ahash",
"anyhow",
"base64 0.13.1",
"base64 0.21.7",
"base64 0.21.1",
"base64ct",
"bytes",
"camino",

View File

@@ -43,7 +43,7 @@ members = [
]
[workspace.package]
edition = "2024"
edition = "2021"
license = "Apache-2.0"
## All dependency versions, used in the project
@@ -53,6 +53,7 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
backtrace = "0.3.74"
flate2 = "1.0.26"
assert-json-diff = "2"
async-stream = "0.3"
@@ -67,17 +68,15 @@ aws-credential-types = "1.2.0"
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
aws-types = "1.3"
axum = { version = "0.8.1", features = ["ws"] }
axum-extra = { version = "0.10.0", features = ["typed-header"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.71"
bindgen = "0.70"
bit_field = "0.10.2"
bstr = "1.0"
byteorder = "1.4"
bytes = "1.9"
camino = "1.1.6"
cfg-if = "1.0.0"
cron = "0.15"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive", "env"] }
clashmap = { version = "1.0", features = ["raw-api"] }
@@ -95,7 +94,6 @@ futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
git-version = "0.3"
governor = "0.8"
hashbrown = "0.14"
hashlink = "0.9.1"
hdrhistogram = "7.5.2"
@@ -114,10 +112,11 @@ hyper-util = "0.1"
tokio-tungstenite = "0.21.0"
indexmap = "2"
indoc = "2"
inferno = "0.12.0"
ipnet = "2.10.0"
itertools = "0.10"
itoa = "1.0.11"
jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
jemalloc_pprof = "0.6"
jsonwebtoken = "9"
lasso = "0.7"
libc = "0.2"
@@ -192,11 +191,7 @@ toml = "0.8"
toml_edit = "0.22"
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
tower = { version = "0.5.2", default-features = false }
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
# This revision uses opentelemetry 0.27. There's no tag for it.
tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" }
tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
tower-service = "0.3.3"
tracing = "0.1"
tracing-error = "0.2"

View File

@@ -11,16 +11,15 @@ ICU_PREFIX_DIR := /usr/local/icu
#
BUILD_TYPE ?= debug
WITH_SANITIZERS ?= no
PG_CFLAGS = -fsigned-char
ifeq ($(BUILD_TYPE),release)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
PG_CFLAGS += -O2 -g3 $(CFLAGS)
PG_CFLAGS = -O2 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS += -O0 -g3 $(CFLAGS)
PG_CFLAGS = -O0 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
@@ -160,8 +159,6 @@ postgres-%: postgres-configure-% \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install
+@echo "Compiling pageinspect $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+@echo "Compiling pg_trgm $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
+@echo "Compiling test_decoding $*"

View File

@@ -162,7 +162,7 @@ FROM build-deps AS pg-build
ARG PG_VERSION
COPY vendor/postgres-${PG_VERSION:?} postgres
RUN cd postgres && \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
--with-icu --with-libxml --with-libxslt --with-lz4" && \
if [ "${PG_VERSION:?}" != "v14" ]; then \
# zstd is available only from PG15
@@ -1458,11 +1458,9 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
FROM build-deps AS pg_mooncake-src
ARG PG_VERSION
WORKDIR /ext-src
COPY compute/patches/duckdb_v113.patch .
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \
echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \
mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
cd third_party/duckdb && patch -p1 < /ext-src/duckdb_v113.patch && cd ../.. && \
echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \
chmod a+x neon-test.sh
@@ -1482,7 +1480,6 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \
FROM build-deps AS pg_duckdb-src
WORKDIR /ext-src
COPY compute/patches/pg_duckdb_v031.patch .
COPY compute/patches/duckdb_v120.patch .
# pg_duckdb build requires source dir to be a git repo to get submodules
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# - extension management function duckdb.install_extension()
@@ -1490,9 +1487,7 @@ COPY compute/patches/duckdb_v120.patch .
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
cd pg_duckdb-src && \
git submodule update --init --recursive && \
patch -p1 < /ext-src/pg_duckdb_v031.patch && \
cd third_party/duckdb && \
patch -p1 < /ext-src/duckdb_v120.patch
patch -p1 < /ext-src/pg_duckdb_v031.patch
FROM pg-build AS pg_duckdb-build
ARG PG_VERSION
@@ -1681,7 +1676,11 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
# Disabled temporarily, because it clashed with pg_mooncake. pg_mooncake
# also depends on libduckdb, but a different version.
#COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1818,7 +1817,7 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute
FROM pg-build AS extension-tests
ARG PG_VERSION
COPY docker-compose/ext-src/ /ext-src/
RUN mkdir /ext-src
COPY --from=pg-build /postgres /postgres
#COPY --from=postgis-src /ext-src/ /ext-src/

View File

@@ -1,25 +0,0 @@
diff --git a/libduckdb.map b/libduckdb.map
new file mode 100644
index 0000000000..3b56f00cd7
--- /dev/null
+++ b/libduckdb.map
@@ -0,0 +1,6 @@
+DUCKDB_1.1.3 {
+ global:
+ *duckdb*;
+ local:
+ *;
+};
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3e757a4bcc..88ab4005b9 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -135,6 +135,8 @@ else()
target_link_libraries(duckdb ${DUCKDB_LINK_LIBS})
link_threads(duckdb)
link_extension_libraries(duckdb)
+ target_link_options(duckdb PRIVATE
+ -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb.map)
add_library(duckdb_static STATIC ${ALL_OBJECT_FILES})
target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS})

View File

@@ -1,67 +0,0 @@
diff --git a/libduckdb_pg_duckdb.map b/libduckdb_pg_duckdb.map
new file mode 100644
index 0000000000..0872978b48
--- /dev/null
+++ b/libduckdb_pg_duckdb.map
@@ -0,0 +1,6 @@
+DUCKDB_1.2.0 {
+ global:
+ *duckdb*;
+ local:
+ *;
+};
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 58adef3fc0..2c522f91be 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -59,7 +59,7 @@ endfunction()
if(AMALGAMATION_BUILD)
- add_library(duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp")
+ add_library(duckdb_pg_duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp")
target_link_libraries(duckdb ${DUCKDB_SYSTEM_LIBS})
link_threads(duckdb)
link_extension_libraries(duckdb)
@@ -109,7 +109,7 @@ else()
duckdb_yyjson
duckdb_zstd)
- add_library(duckdb SHARED ${ALL_OBJECT_FILES})
+ add_library(duckdb_pg_duckdb SHARED ${ALL_OBJECT_FILES})
if(WIN32 AND NOT MINGW)
ensure_variable_is_number(DUCKDB_MAJOR_VERSION RC_MAJOR_VERSION)
@@ -131,9 +131,11 @@ else()
target_sources(duckdb PRIVATE version.rc)
endif()
- target_link_libraries(duckdb ${DUCKDB_LINK_LIBS})
- link_threads(duckdb)
- link_extension_libraries(duckdb)
+ target_link_libraries(duckdb_pg_duckdb ${DUCKDB_LINK_LIBS})
+ link_threads(duckdb_pg_duckdb)
+ link_extension_libraries(duckdb_pg_duckdb)
+ target_link_options(duckdb_pg_duckdb PRIVATE
+ -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb_pg_duckdb.map)
add_library(duckdb_static STATIC ${ALL_OBJECT_FILES})
target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS})
@@ -141,7 +143,7 @@ else()
link_extension_libraries(duckdb_static)
target_include_directories(
- duckdb PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+ duckdb_pg_duckdb PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_include_directories(
@@ -161,7 +163,7 @@ else()
endif()
install(
- TARGETS duckdb duckdb_static
+ TARGETS duckdb_pg_duckdb duckdb_static
EXPORT "${DUCKDB_EXPORT_SET}"
LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"

View File

@@ -1,25 +1,3 @@
diff --git a/Makefile b/Makefile
index 3235cc8..6b892bc 100644
--- a/Makefile
+++ b/Makefile
@@ -32,7 +32,7 @@ else
DUCKDB_BUILD_TYPE = release
endif
-DUCKDB_LIB = libduckdb$(DLSUFFIX)
+DUCKDB_LIB = libduckdb_pg_duckdb$(DLSUFFIX)
FULL_DUCKDB_LIB = third_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src/$(DUCKDB_LIB)
ERROR_ON_WARNING ?=
@@ -54,7 +54,7 @@ override PG_CXXFLAGS += -std=c++17 ${DUCKDB_BUILD_CXX_FLAGS} ${COMPILER_FLAGS} -
# changes to the vendored code in one place.
override PG_CFLAGS += -Wno-declaration-after-statement
-SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb -lstdc++ -llz4
+SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb_pg_duckdb -lstdc++ -llz4
include Makefile.global
diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql
index d777d76..af60106 100644
--- a/sql/pg_duckdb--0.2.0--0.3.0.sql

View File

@@ -5,16 +5,6 @@ commands:
user: root
sysvInitAction: sysinit
shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
# restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for
# running it as root.
- name: chmod-resize-swap
user: root
sysvInitAction: sysinit
shell: 'chmod 711 /neonvm/bin/resize-swap'
- name: chmod-set-disk-quota
user: root
sysvInitAction: sysinit
shell: 'chmod 711 /neonvm/bin/set-disk-quota'
- name: pgbouncer
user: postgres
sysvInitAction: respawn
@@ -44,17 +34,9 @@ shutdownHook: |
files:
- filename: compute_ctl-sudoers
content: |
# Reverse hostname lookup doesn't currently work, and isn't needed anyway when all
# the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to
# resolve host" log messages that they generate.
Defaults !fqdn
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
# regardless of hostname (ALL)
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
# Allow postgres user to shut down the VM. The fast_import job does that
# when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/poweroff
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -116,10 +98,7 @@ merge: |
&& echo 'root - nofile 1048576' >>/etc/security/limits.conf \
)
# Allow postgres user (compute_ctl) to run swap resizer.
# Need to install sudo in order to allow this.
#
# Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe.
# Install sudo so that the fast_import can do "sudo poweroff"
RUN set -e \
&& apt update \
&& apt install --no-install-recommends -y \

View File

@@ -5,16 +5,6 @@ commands:
user: root
sysvInitAction: sysinit
shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
# restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for
# running it as root.
- name: chmod-resize-swap
user: root
sysvInitAction: sysinit
shell: 'chmod 711 /neonvm/bin/resize-swap'
- name: chmod-set-disk-quota
user: root
sysvInitAction: sysinit
shell: 'chmod 711 /neonvm/bin/set-disk-quota'
- name: pgbouncer
user: postgres
sysvInitAction: respawn
@@ -44,17 +34,9 @@ shutdownHook: |
files:
- filename: compute_ctl-sudoers
content: |
# Reverse hostname lookup doesn't currently work, and isn't needed anyway when all
# the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to
# resolve host" log messages that they generate.
Defaults !fqdn
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
# regardless of hostname (ALL)
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
# Allow postgres user to shut down the VM. The fast_import job does that
# when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/poweroff
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -112,10 +94,7 @@ merge: |
&& echo 'root - nofile 1048576' >>/etc/security/limits.conf \
)
# Allow postgres user (compute_ctl) to run swap resizer.
# Need to install sudo in order to allow this.
#
# Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe.
# Install sudo so that the fast_import can do "sudo poweroff"
RUN set -e \
&& apt update \
&& apt install --no-install-recommends -y \

View File

@@ -1,7 +1,7 @@
[package]
name = "compute_tools"
version = "0.1.0"
edition = "2024"
edition.workspace = true
license.workspace = true
[features]
@@ -17,7 +17,6 @@ aws-sdk-kms.workspace = true
aws-smithy-types.workspace = true
anyhow.workspace = true
axum = { workspace = true, features = [] }
axum-extra.workspace = true
camino.workspace = true
chrono.workspace = true
cfg-if.workspace = true
@@ -26,7 +25,8 @@ fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
jsonwebtoken.workspace = true
hyper-util.workspace = true
hyper.workspace = true
metrics.workspace = true
nix.workspace = true
notify.workspace = true
@@ -48,9 +48,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-util.workspace = true
tokio-stream.workspace = true
tower-otel.workspace = true
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
thiserror.workspace = true

View File

@@ -33,28 +33,40 @@
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! ```
use std::collections::HashMap;
use std::ffi::OsString;
use std::fs::File;
use std::path::Path;
use std::process::exit;
use std::sync::mpsc;
use std::thread;
use std::time::Duration;
use std::str::FromStr;
use std::sync::atomic::Ordering;
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Parser;
use compute_api::responses::ComputeCtlConfig;
use compute_tools::http::server::Server;
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
use compute_tools::neonvmd_client::{resize_swap, set_disk_quota};
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info, warn};
use url::Url;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal};
use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version_string;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
use rlimit::{Resource, setrlimit};
use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
use signal_hook::iterator::Signals;
use tracing::{error, info};
use url::Url;
use rlimit::{setrlimit, Resource};
use utils::failpoint_support;
// this is an arbitrary build tag. Fine as a default / for testing purposes
@@ -117,6 +129,7 @@ struct Cli {
#[arg(long, action = clap::ArgAction::SetTrue)]
pub resize_swap_on_bind: bool,
/// This is no longer used for anything. It's kept for now just for backwards-compatibility.
#[arg(long)]
pub set_disk_quota_for_fs: Option<String>,
@@ -136,8 +149,6 @@ struct Cli {
fn main() -> Result<()> {
let cli = Cli::parse();
let scenario = failpoint_support::init();
// For historical reasons, the main thread that processes the spec and launches postgres
// is synchronous, but we always have this tokio runtime available and we "enter" it so
// that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
@@ -149,44 +160,34 @@ fn main() -> Result<()> {
let build_tag = runtime.block_on(init())?;
let scenario = failpoint_support::init();
// enable core dumping for all child processes
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
let (pg_handle, start_pg_result) = {
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();
let cli_spec = try_spec_from_cli(&cli)?;
let cli_spec = try_spec_from_cli(&cli)?;
let compute_node = ComputeNode::new(
ComputeNodeParams {
compute_id: cli.compute_id,
connstr,
pgdata: cli.pgdata.clone(),
pgbin: cli.pgbin.clone(),
pgversion: get_pg_version_string(&cli.pgbin),
external_http_port: cli.external_http_port,
internal_http_port: cli.internal_http_port,
ext_remote_storage: cli.remote_ext_config.clone(),
resize_swap_on_bind: cli.resize_swap_on_bind,
set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
#[cfg(target_os = "linux")]
filecache_connstr: cli.filecache_connstr,
#[cfg(target_os = "linux")]
cgroup: cli.cgroup,
#[cfg(target_os = "linux")]
vm_monitor_addr: cli.vm_monitor_addr,
build_tag,
let compute = wait_spec(build_tag, &cli, cli_spec)?;
live_config_allowed: cli_spec.live_config_allowed,
},
cli_spec.spec,
cli_spec.compute_ctl_config,
)?;
start_postgres(&cli, compute)?
let exit_code = compute_node.run()?;
// Startup is finished, exit the startup tracing span
};
// PostgreSQL is now running, if startup was successful. Wait until it exits.
let wait_pg_result = wait_postgres(pg_handle)?;
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
maybe_delay_exit(delay_exit);
scenario.teardown();
deinit_and_exit(exit_code);
deinit_and_exit(wait_pg_result);
}
async fn init() -> Result<String> {
@@ -207,6 +208,56 @@ async fn init() -> Result<String> {
Ok(build_tag)
}
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
//
// This is used to propagate the context for the 'start_compute' operation
// from the neon control plane. This allows linking together the wider
// 'start_compute' operation that creates the compute container, with the
// startup actions here within the container.
//
// There is no standard for passing context in env variables, but a lot of
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
//
// Switch to the startup context here, and exit it once the startup has
// completed and Postgres is up and running.
//
// If this pod is pre-created without binding it to any particular endpoint
// yet, this isn't the right place to enter the startup context. In that
// case, the control plane should pass the tracing context as part of the
// /configure API call.
//
// NOTE: This is supposed to only cover the *startup* actions. Once
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
//
// XXX: If the pod is restarted, we perform the startup actions in the same
// context as the original startup actions, which probably doesn't make
// sense.
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
if let Ok(val) = std::env::var("TRACEPARENT") {
startup_tracing_carrier.insert("traceparent".to_string(), val);
}
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry_sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new()
.extract(&startup_tracing_carrier)
.attach();
info!("startup tracing context attached");
Some(guard)
} else {
None
}
}
fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
// First, try to get cluster spec from the cli argument
if let Some(ref spec_json) = cli.spec_json {
@@ -257,7 +308,340 @@ struct CliSpecParams {
live_config_allowed: bool,
}
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
fn wait_spec(
build_tag: String,
cli: &Cli,
CliSpecParams {
spec,
live_config_allowed,
compute_ctl_config: _,
}: CliSpecParams,
) -> Result<Arc<ComputeNode>> {
let mut new_state = ComputeState::new();
let spec_set;
if let Some(spec) = spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
info!("new pspec.spec: {:?}", pspec.spec);
new_state.pspec = Some(pspec);
spec_set = true;
} else {
spec_set = false;
}
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
let conn_conf = postgres::config::Config::from_str(connstr.as_str())
.context("cannot build postgres config from connstr")?;
let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
.context("cannot build tokio postgres config from connstr")?;
let compute_node = ComputeNode {
compute_id: cli.compute_id.clone(),
connstr,
conn_conf,
tokio_conn_conf,
pgdata: cli.pgdata.clone(),
pgbin: cli.pgbin.clone(),
pgversion: get_pg_version_string(&cli.pgbin),
external_http_port: cli.external_http_port,
internal_http_port: cli.internal_http_port,
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_remote_storage: cli.remote_ext_config.clone(),
ext_download_progress: RwLock::new(HashMap::new()),
build_tag,
};
let compute = Arc::new(compute_node);
// If this is a pooled VM, prewarm before starting HTTP server and becoming
// available for binding. Prewarming helps Postgres start quicker later,
// because QEMU will already have its memory allocated from the host, and
// the necessary binaries will already be cached.
if !spec_set {
compute.prewarm_postgres()?;
}
// Launch the external HTTP server first, so that we can serve control plane
// requests while configuration is still in progress.
Server::External(cli.external_http_port).launch(&compute);
// The internal HTTP server could be launched later, but there isn't much
// sense in waiting.
Server::Internal(cli.internal_http_port).launch(&compute);
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();
if state.status == ComputeStatus::ConfigurationPending {
info!("got spec, continue configuration");
// Spec is already set by the http server handler.
break;
}
}
// Record for how long we slept waiting for the spec.
let now = Utc::now();
state.metrics.wait_for_spec_ms = now
.signed_duration_since(state.start_time)
.to_std()
.unwrap()
.as_millis() as u64;
// Reset start time, so that the total startup time that is calculated later will
// not include the time that we waited for the spec.
state.start_time = now;
}
launch_lsn_lease_bg_task_for_static(&compute);
Ok(compute)
}
fn start_postgres(
cli: &Cli,
compute: Arc<ComputeNode>,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
let mut state = compute.state.lock().unwrap();
state.set_status(ComputeStatus::Init, &compute.state_changed);
info!(
"running compute with features: {:?}",
state.pspec.as_ref().unwrap().spec.features
);
// before we release the mutex, fetch some parameters for later.
let &ComputeSpec {
swap_size_bytes,
disk_quota_bytes,
#[cfg(target_os = "linux")]
disable_lfc_resizing,
..
} = &state.pspec.as_ref().unwrap().spec;
drop(state);
// Launch remaining service threads
let _monitor_handle = launch_monitor(&compute);
let _configurator_handle = launch_configurator(&compute);
let mut prestartup_failed = false;
let mut delay_exit = false;
// Resize swap to the desired size if the compute spec says so
if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) {
// To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
// *before* starting postgres.
//
// In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
// carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
// OOM-killed during startup because swap wasn't available yet.
match resize_swap(size_bytes) {
Ok(()) => {
let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%size_bytes, %size_mib, "resized swap");
}
Err(err) => {
let err = err.context("failed to resize swap");
error!("{err:#}");
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
compute.set_failed_status(err);
delay_exit = true;
}
}
}
// Set disk quota if the compute spec says so
if let Some(disk_quota_bytes) = disk_quota_bytes {
match set_disk_quota(disk_quota_bytes) {
Ok(()) => {
let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%disk_quota_bytes, %size_mib, "set disk quota");
}
Err(err) => {
let err = err.context("failed to set disk quota");
error!("{err:#}");
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
compute.set_failed_status(err);
delay_exit = true;
}
}
}
// Start Postgres
let mut pg = None;
if !prestartup_failed {
pg = match compute.start_compute() {
Ok(pg) => {
info!(postmaster_pid = %pg.0.id(), "Postgres was started");
Some(pg)
}
Err(err) => {
error!("could not start the compute node: {:#}", err);
compute.set_failed_status(err);
delay_exit = true;
None
}
};
} else {
warn!("skipping postgres startup because pre-startup step failed");
}
// Start the vm-monitor if directed to. The vm-monitor only runs on linux
// because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
use std::env;
use tokio_util::sync::CancellationToken;
// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();
// don't pass postgres connection string to vm-monitor if we don't want it to resize LFC
let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
None
} else {
Some(cli.filecache_connstr.clone())
};
let vm_monitor = if env::var_os("AUTOSCALING").is_some() {
let vm_monitor = tokio::spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: Some(cli.cgroup.clone()),
pgconnstr,
addr: cli.vm_monitor_addr.clone(),
})),
token.clone(),
));
Some(vm_monitor)
} else {
None
};
}
}
Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
type PostgresHandle = (std::process::Child, tokio::task::JoinHandle<Result<()>>);
struct StartPostgresResult {
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
PG_PID.store(0, Ordering::SeqCst);
// Process has exited. Wait for the log collecting task to finish.
let _ = tokio::runtime::Handle::current()
.block_on(logs_handle)
.map_err(|e| tracing::error!("log task panicked: {:?}", e));
info!("Postgres exited with code {}, shutting down", ecode);
exit_code = ecode.code()
}
Ok(WaitPostgresResult { exit_code })
}
struct WaitPostgresResult {
exit_code: Option<i32>,
}
fn cleanup_after_postgres_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
}: StartPostgresResult,
) -> Result<bool> {
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
if let Some(handle) = vm_monitor {
// Kills all threads spawned by the monitor
token.cancel();
// Kills the actual task running the monitor
handle.abort();
}
}
}
// Maybe sync safekeepers again, to speed up next startup
let compute_state = compute.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
info!("syncing safekeepers on shutdown");
let storage_auth_token = pspec.storage_auth_token.clone();
let lsn = compute.sync_safekeepers(storage_auth_token)?;
info!("synced safekeepers at lsn {lsn}");
}
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::TerminationPending {
state.status = ComputeStatus::Terminated;
compute.state_changed.notify_all();
// we were asked to terminate gracefully, don't exit to avoid restart
delay_exit = true
}
drop(state);
if let Err(err) = compute.check_for_core_dumps() {
error!("error while checking for core dumps: {err:?}");
}
Ok(delay_exit)
}
fn maybe_delay_exit(delay_exit: bool) {
// If launch failed, keep serving HTTP requests for a while, so the cloud
// control plane can get the actual error.
if delay_exit {
info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30));
}
}
fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may
// hang for quite some time, see, for example:

View File

@@ -25,13 +25,13 @@
//! docker push localhost:3030/localregistry/compute-node-v14:latest
//! ```
use anyhow::{Context, bail};
use anyhow::{bail, Context};
use aws_config::BehaviorVersion;
use camino::{Utf8Path, Utf8PathBuf};
use clap::{Parser, Subcommand};
use compute_tools::extension_server::{PostgresMajorVersion, get_pg_version};
use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
use nix::unistd::Pid;
use tracing::{Instrument, error, info, info_span, warn};
use tracing::{error, info, info_span, warn, Instrument};
use utils::fs_ext::is_directory_empty;
#[path = "fast_import/aws_s3_sync.rs"]
@@ -558,9 +558,7 @@ async fn cmd_dumprestore(
decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext)
.await?
} else {
bail!(
"destination connection string must be provided in spec for dump_restore command"
);
bail!("destination connection string must be provided in spec for dump_restore command");
};
(source, dest)

View File

@@ -1,10 +1,11 @@
use camino::{Utf8Path, Utf8PathBuf};
use tokio::task::JoinSet;
use tracing::{info, warn};
use walkdir::WalkDir;
use super::s3_uri::S3Uri;
use tracing::{info, warn};
const MAX_PARALLEL_UPLOADS: usize = 10;
/// Upload all files from 'local' to 'remote'

View File

@@ -1,6 +1,5 @@
use std::str::FromStr;
use anyhow::Result;
use std::str::FromStr;
/// Struct to hold parsed S3 components
#[derive(Debug, Clone, PartialEq, Eq)]

View File

@@ -1,20 +1,18 @@
use std::path::Path;
use std::process::Stdio;
use std::result::Result;
use std::sync::Arc;
use compute_api::responses::CatalogObjects;
use futures::Stream;
use postgres::NoTls;
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::Command;
use tokio::spawn;
use std::{path::Path, process::Stdio, result::Result, sync::Arc};
use tokio::{
io::{AsyncBufReadExt, BufReader},
process::Command,
spawn,
};
use tokio_stream::{self as stream, StreamExt};
use tokio_util::codec::{BytesCodec, FramedRead};
use tracing::warn;
use crate::compute::ComputeNode;
use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgres_conf_for_db};
use compute_api::responses::CatalogObjects;
pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles"));
@@ -57,15 +55,15 @@ pub enum SchemaDumpError {
pub async fn get_database_schema(
compute: &Arc<ComputeNode>,
dbname: &str,
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>> + use<>, SchemaDumpError> {
let pgbin = &compute.params.pgbin;
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
let pgbin = &compute.pgbin;
let basepath = Path::new(pgbin).parent().unwrap();
let pgdump = basepath.join("pg_dump");
// Replace the DB in the connection string and disable it to parts.
// This is the only option to handle DBs with special characters.
let conf = postgres_conf_for_db(&compute.params.connstr, dbname)
.map_err(|_| SchemaDumpError::Unexpected)?;
let conf =
postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?;
let host = conf
.get_hosts()
.first()

View File

@@ -1,4 +1,4 @@
use anyhow::{Ok, Result, anyhow};
use anyhow::{anyhow, Ok, Result};
use tokio_postgres::NoTls;
use tracing::{error, instrument, warn};

File diff suppressed because it is too large Load Diff

View File

@@ -1,14 +1,13 @@
use std::fmt::Write as FmtWrite;
use std::fs::{File, OpenOptions};
use std::io;
use std::io::Write;
use std::io::prelude::*;
use std::path::Path;
use anyhow::Result;
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize, escape_conf_value};
use crate::pg_helpers::escape_conf_value;
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -57,20 +56,10 @@ pub fn write_postgres_conf(
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if !spec.safekeeper_connstrings.is_empty() {
let mut neon_safekeepers_value = String::new();
tracing::info!(
"safekeepers_connstrings is not zero, gen: {:?}",
spec.safekeepers_generation
);
// If generation is given, prepend sk list with g#number:
if let Some(generation) = spec.safekeepers_generation {
write!(neon_safekeepers_value, "g#{}:", generation)?;
}
neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(","));
writeln!(
file,
"neon.safekeepers={}",
escape_conf_value(&neon_safekeepers_value)
escape_conf_value(&spec.safekeeper_connstrings.join(","))
)?;
}
if let Some(s) = &spec.tenant_id {

View File

@@ -1,9 +1,10 @@
use std::sync::Arc;
use std::thread;
use compute_api::responses::ComputeStatus;
use tracing::{error, info, instrument};
use compute_api::responses::ComputeStatus;
use crate::compute::ComputeNode;
#[instrument(skip_all)]

View File

@@ -1,27 +0,0 @@
use anyhow::Context;
use tracing::instrument;
pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota";
/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set.
#[instrument]
pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> {
let size_kb = size_bytes / 1024;
// run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`
let child_result = std::process::Command::new("/usr/bin/sudo")
.arg(DISK_QUOTA_BIN)
.arg(size_kb.to_string())
.arg(fs_mountpoint)
.spawn();
child_result
.context("spawn() failed")
.and_then(|mut child| child.wait().context("wait() failed"))
.and_then(|status| match status.success() {
true => Ok(()),
false => Err(anyhow::anyhow!("process exited with {status}")),
})
// wrap any prior error with the overall context that we couldn't run the command
.with_context(|| format!("could not run `/usr/bin/sudo {DISK_QUOTA_BIN}`"))
}

View File

@@ -71,15 +71,15 @@ More specifically, here is an example ext_index.json
}
}
*/
use std::path::Path;
use std::str;
use anyhow::{Context, Result, bail};
use anyhow::Result;
use anyhow::{bail, Context};
use bytes::Bytes;
use compute_api::spec::RemoteExtSpec;
use regex::Regex;
use remote_storage::*;
use reqwest::StatusCode;
use std::path::Path;
use std::str;
use tar::Archive;
use tracing::info;
use tracing::log::warn;
@@ -244,10 +244,7 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
info!("writing file {:?}{:?}", control_path, control_content);
std::fs::write(control_path, control_content).unwrap();
} else {
warn!(
"control file {:?} exists both locally and remotely. ignoring the remote version.",
control_path
);
warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path);
}
}
}

View File

@@ -1,7 +1,6 @@
use std::ops::{Deref, DerefMut};
use axum::extract::rejection::JsonRejection;
use axum::extract::{FromRequest, Request};
use axum::extract::{rejection::JsonRejection, FromRequest, Request};
use compute_api::responses::GenericAPIError;
use http::StatusCode;

View File

@@ -1,9 +1,7 @@
pub(crate) mod json;
pub(crate) mod path;
pub(crate) mod query;
pub(crate) mod request_id;
pub(crate) use json::Json;
pub(crate) use path::Path;
pub(crate) use query::Query;
pub(crate) use request_id::RequestId;

View File

@@ -1,10 +1,8 @@
use std::ops::{Deref, DerefMut};
use axum::extract::FromRequestParts;
use axum::extract::rejection::PathRejection;
use axum::extract::{rejection::PathRejection, FromRequestParts};
use compute_api::responses::GenericAPIError;
use http::StatusCode;
use http::request::Parts;
use http::{request::Parts, StatusCode};
/// Custom `Path` extractor, so that we can format errors into
/// `JsonResponse<GenericAPIError>`.

View File

@@ -1,10 +1,8 @@
use std::ops::{Deref, DerefMut};
use axum::extract::FromRequestParts;
use axum::extract::rejection::QueryRejection;
use axum::extract::{rejection::QueryRejection, FromRequestParts};
use compute_api::responses::GenericAPIError;
use http::StatusCode;
use http::request::Parts;
use http::{request::Parts, StatusCode};
/// Custom `Query` extractor, so that we can format errors into
/// `JsonResponse<GenericAPIError>`.

View File

@@ -1,86 +0,0 @@
use std::{
fmt::Display,
ops::{Deref, DerefMut},
};
use axum::{extract::FromRequestParts, response::IntoResponse};
use http::{StatusCode, request::Parts};
use crate::http::{JsonResponse, headers::X_REQUEST_ID};
/// Extract the request ID from the `X-Request-Id` header.
#[derive(Debug, Clone, Default)]
pub(crate) struct RequestId(pub String);
#[derive(Debug)]
/// Rejection used for [`RequestId`].
///
/// Contains one variant for each way the [`RequestId`] extractor can
/// fail.
pub(crate) enum RequestIdRejection {
/// The request is missing the header.
MissingRequestId,
/// The value of the header is invalid UTF-8.
InvalidUtf8,
}
impl RequestIdRejection {
pub fn status(&self) -> StatusCode {
match self {
RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR,
RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST,
}
}
pub fn message(&self) -> String {
match self {
RequestIdRejection::MissingRequestId => "request ID is missing",
RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8",
}
.to_string()
}
}
impl IntoResponse for RequestIdRejection {
fn into_response(self) -> axum::response::Response {
JsonResponse::error(self.status(), self.message())
}
}
impl<S> FromRequestParts<S> for RequestId
where
S: Send + Sync,
{
type Rejection = RequestIdRejection;
async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
match parts.headers.get(X_REQUEST_ID) {
Some(value) => match value.to_str() {
Ok(request_id) => Ok(Self(request_id.to_string())),
Err(_) => Err(RequestIdRejection::InvalidUtf8),
},
None => Err(RequestIdRejection::MissingRequestId),
}
}
}
impl Deref for RequestId {
type Target = String;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl DerefMut for RequestId {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl Display for RequestId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.0)
}
}

View File

@@ -1,2 +0,0 @@
/// Constant for `X-Request-Id` header.
pub const X_REQUEST_ID: &str = "x-request-id";

View File

@@ -1,145 +0,0 @@
use std::{collections::HashSet, net::SocketAddr};
use anyhow::{Result, anyhow};
use axum::{RequestExt, body::Body, extract::ConnectInfo};
use axum_extra::{
TypedHeader,
headers::{Authorization, authorization::Bearer},
};
use futures::future::BoxFuture;
use http::{Request, Response, StatusCode};
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
use serde::Deserialize;
use tower_http::auth::AsyncAuthorizeRequest;
use tracing::warn;
use crate::http::{JsonResponse, extract::RequestId};
#[derive(Clone, Debug, Deserialize)]
pub(in crate::http) struct Claims {
compute_id: String,
}
#[derive(Clone, Debug)]
pub(in crate::http) struct Authorize {
compute_id: String,
jwks: JwkSet,
validation: Validation,
}
impl Authorize {
pub fn new(compute_id: String, jwks: JwkSet) -> Self {
let mut validation = Validation::new(Algorithm::EdDSA);
// Nothing is currently required
validation.required_spec_claims = HashSet::new();
validation.validate_exp = true;
// Unused by the control plane
validation.validate_aud = false;
// Unused by the control plane
validation.validate_nbf = false;
Self {
compute_id,
jwks,
validation,
}
}
}
impl AsyncAuthorizeRequest<Body> for Authorize {
type RequestBody = Body;
type ResponseBody = Body;
type Future = BoxFuture<'static, Result<Request<Body>, Response<Self::ResponseBody>>>;
fn authorize(&mut self, mut request: Request<Body>) -> Self::Future {
let compute_id = self.compute_id.clone();
let jwks = self.jwks.clone();
let validation = self.validation.clone();
Box::pin(async move {
let request_id = request.extract_parts::<RequestId>().await.unwrap();
// TODO: Remove this check after a successful rollout
if jwks.keys.is_empty() {
warn!(%request_id, "Authorization has not been configured");
return Ok(request);
}
let connect_info = request
.extract_parts::<ConnectInfo<SocketAddr>>()
.await
.unwrap();
// In the event the request is coming from the loopback interface,
// allow all requests
if connect_info.ip().is_loopback() {
warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
return Ok(request);
}
let TypedHeader(Authorization(bearer)) = request
.extract_parts::<TypedHeader<Authorization<Bearer>>>()
.await
.map_err(|_| {
JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token")
})?;
let data = match Self::verify(&jwks, bearer.token(), &validation) {
Ok(claims) => claims,
Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)),
};
if data.claims.compute_id != compute_id {
return Err(JsonResponse::error(
StatusCode::UNAUTHORIZED,
"invalid claims in authorization token",
));
}
// Make claims available to any subsequent middleware or request
// handlers
request.extensions_mut().insert(data.claims);
Ok(request)
})
}
}
impl Authorize {
/// Verify the token using the JSON Web Key set and return the token data.
fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
debug_assert!(!jwks.keys.is_empty());
for jwk in jwks.keys.iter() {
let decoding_key = match DecodingKey::from_jwk(jwk) {
Ok(key) => key,
Err(e) => {
warn!(
"Failed to construct decoding key from {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
continue;
}
};
match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
Ok(data) => return Ok(data),
Err(e) => {
warn!(
"Failed to decode authorization token using {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
continue;
}
}
}
Err(anyhow!("Failed to verify authorization token"))
}
}

View File

@@ -1 +0,0 @@
pub(in crate::http) mod authorize;

View File

@@ -1,14 +1,10 @@
use axum::body::Body;
use axum::response::Response;
use axum::{body::Body, response::Response};
use compute_api::responses::{ComputeStatus, GenericAPIError};
use http::StatusCode;
use http::header::CONTENT_TYPE;
use http::{header::CONTENT_TYPE, StatusCode};
use serde::Serialize;
use tracing::error;
mod extract;
mod headers;
mod middleware;
mod routes;
pub mod server;

View File

@@ -1,13 +1,10 @@
use std::sync::Arc;
use axum::extract::State;
use axum::response::Response;
use axum::{extract::State, response::Response};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use crate::checker::check_writability;
use crate::compute::ComputeNode;
use crate::http::JsonResponse;
use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse};
/// Check that the compute is currently running.
pub(in crate::http) async fn is_writable(State(compute): State<Arc<ComputeNode>>) -> Response {

View File

@@ -1,16 +1,18 @@
use std::sync::Arc;
use axum::extract::State;
use axum::response::Response;
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
use axum::{extract::State, response::Response};
use compute_api::{
requests::ConfigurationRequest,
responses::{ComputeStatus, ComputeStatusResponse},
};
use http::StatusCode;
use tokio::task;
use tracing::info;
use crate::compute::{ComputeNode, ParsedSpec};
use crate::http::JsonResponse;
use crate::http::extract::Json;
use crate::{
compute::{ComputeNode, ParsedSpec},
http::{extract::Json, JsonResponse},
};
// Accept spec in JSON format and request compute configuration. If anything
// goes wrong after we set the compute status to `ConfigurationPending` and
@@ -22,7 +24,7 @@ pub(in crate::http) async fn configure(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigurationRequest>,
) -> Response {
if !compute.params.live_config_allowed {
if !compute.live_config_allowed {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"live configuration is not allowed for this compute node".to_string(),
@@ -45,18 +47,13 @@ pub(in crate::http) async fn configure(
return JsonResponse::invalid_status(state.status);
}
// Pass the tracing span to the main thread that performs the startup,
// so that the start_compute operation is considered a child of this
// configure request for tracing purposes.
state.startup_span = Some(tracing::Span::current());
state.pspec = Some(pspec);
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
drop(state);
}
// Spawn a blocking thread to wait for compute to become Running. This is
// needed to not block the main pool of workers and to be able to serve
// needed to do not block the main pool of workers and be able to serve
// other requests while some particular request is waiting for compute to
// finish configuration.
let c = compute.clone();

View File

@@ -1,16 +1,14 @@
use std::sync::Arc;
use axum::body::Body;
use axum::extract::State;
use axum::response::Response;
use http::StatusCode;
use http::header::CONTENT_TYPE;
use axum::{body::Body, extract::State, response::Response};
use http::{header::CONTENT_TYPE, StatusCode};
use serde::Deserialize;
use crate::catalog::{SchemaDumpError, get_database_schema};
use crate::compute::ComputeNode;
use crate::http::JsonResponse;
use crate::http::extract::Query;
use crate::{
catalog::{get_database_schema, SchemaDumpError},
compute::ComputeNode,
http::{extract::Query, JsonResponse},
};
#[derive(Debug, Clone, Deserialize)]
pub(in crate::http) struct DatabaseSchemaParams {

View File

@@ -1,12 +1,9 @@
use std::sync::Arc;
use axum::extract::State;
use axum::response::Response;
use axum::{extract::State, response::Response};
use http::StatusCode;
use crate::catalog::get_dbs_and_roles;
use crate::compute::ComputeNode;
use crate::http::JsonResponse;
use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse};
/// Get the databases and roles from the compute.
pub(in crate::http) async fn get_catalog_objects(

View File

@@ -1,13 +1,19 @@
use std::sync::Arc;
use axum::extract::State;
use axum::response::{IntoResponse, Response};
use axum::{
extract::State,
response::{IntoResponse, Response},
};
use http::StatusCode;
use serde::Deserialize;
use crate::compute::ComputeNode;
use crate::http::JsonResponse;
use crate::http::extract::{Path, Query};
use crate::{
compute::ComputeNode,
http::{
extract::{Path, Query},
JsonResponse,
},
};
#[derive(Debug, Clone, Deserialize)]
pub(in crate::http) struct ExtensionServerParams {
@@ -18,11 +24,11 @@ pub(in crate::http) struct ExtensionServerParams {
/// Download a remote extension.
pub(in crate::http) async fn download_extension(
Path(filename): Path<String>,
ext_server_params: Query<ExtensionServerParams>,
params: Query<ExtensionServerParams>,
State(compute): State<Arc<ComputeNode>>,
) -> Response {
// Don't even try to download extensions if no remote storage is configured
if compute.params.ext_remote_storage.is_none() {
if compute.ext_remote_storage.is_none() {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"remote storage is not configured",
@@ -46,9 +52,9 @@ pub(in crate::http) async fn download_extension(
remote_extensions.get_ext(
&filename,
ext_server_params.is_library,
&compute.params.build_tag,
&compute.params.pgversion,
params.is_library,
&compute.build_tag,
&compute.pgversion,
)
};

View File

@@ -1,14 +1,16 @@
use std::sync::Arc;
use axum::extract::State;
use axum::response::Response;
use compute_api::requests::ExtensionInstallRequest;
use compute_api::responses::{ComputeStatus, ExtensionInstallResponse};
use axum::{extract::State, response::Response};
use compute_api::{
requests::ExtensionInstallRequest,
responses::{ComputeStatus, ExtensionInstallResponse},
};
use http::StatusCode;
use crate::compute::ComputeNode;
use crate::http::JsonResponse;
use crate::http::extract::Json;
use crate::{
compute::ComputeNode,
http::{extract::Json, JsonResponse},
};
/// Install a extension.
pub(in crate::http) async fn install_extension(

View File

@@ -17,8 +17,7 @@ pub struct FailpointConfig {
pub actions: String,
}
use crate::http::JsonResponse;
use crate::http::extract::Json;
use crate::http::{extract::Json, JsonResponse};
/// Configure failpoints for testing purposes.
pub(in crate::http) async fn configure_failpoints(

View File

@@ -1,14 +1,16 @@
use std::sync::Arc;
use axum::extract::State;
use axum::response::Response;
use compute_api::requests::SetRoleGrantsRequest;
use compute_api::responses::{ComputeStatus, SetRoleGrantsResponse};
use axum::{extract::State, response::Response};
use compute_api::{
requests::SetRoleGrantsRequest,
responses::{ComputeStatus, SetRoleGrantsResponse},
};
use http::StatusCode;
use crate::compute::ComputeNode;
use crate::http::JsonResponse;
use crate::http::extract::Json;
use crate::{
compute::ComputeNode,
http::{extract::Json, JsonResponse},
};
/// Add grants for a role.
pub(in crate::http) async fn add_grant(

View File

@@ -1,12 +1,10 @@
use std::sync::Arc;
use axum::extract::State;
use axum::response::Response;
use axum::{extract::State, response::Response};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use crate::compute::ComputeNode;
use crate::http::JsonResponse;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Collect current Postgres usage insights.
pub(in crate::http) async fn get_insights(State(compute): State<Arc<ComputeNode>>) -> Response {

View File

@@ -1,12 +1,10 @@
use axum::body::Body;
use axum::response::Response;
use http::StatusCode;
use axum::{body::Body, response::Response};
use http::header::CONTENT_TYPE;
use http::StatusCode;
use metrics::proto::MetricFamily;
use metrics::{Encoder, TextEncoder};
use crate::http::JsonResponse;
use crate::metrics::collect;
use crate::{http::JsonResponse, metrics::collect};
/// Expose Prometheus metrics.
pub(in crate::http) async fn get_metrics() -> Response {

View File

@@ -1,11 +1,9 @@
use std::sync::Arc;
use axum::extract::State;
use axum::response::Response;
use axum::{extract::State, response::Response};
use http::StatusCode;
use crate::compute::ComputeNode;
use crate::http::JsonResponse;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Get startup metrics.
pub(in crate::http) async fn get_metrics(State(compute): State<Arc<ComputeNode>>) -> Response {

View File

@@ -1,13 +1,9 @@
use std::ops::Deref;
use std::sync::Arc;
use std::{ops::Deref, sync::Arc};
use axum::extract::State;
use axum::http::StatusCode;
use axum::response::Response;
use axum::{extract::State, http::StatusCode, response::Response};
use compute_api::responses::ComputeStatusResponse;
use crate::compute::ComputeNode;
use crate::http::JsonResponse;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Retrieve the state of the comute.
pub(in crate::http) async fn get_status(State(compute): State<Arc<ComputeNode>>) -> Response {

View File

@@ -1,14 +1,18 @@
use std::sync::Arc;
use axum::extract::State;
use axum::response::{IntoResponse, Response};
use axum::{
extract::State,
response::{IntoResponse, Response},
};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use tokio::task;
use tracing::info;
use crate::compute::{ComputeNode, forward_termination_signal};
use crate::http::JsonResponse;
use crate::{
compute::{forward_termination_signal, ComputeNode},
http::JsonResponse,
};
/// Terminate the compute.
pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {

View File

@@ -1,67 +1,60 @@
use std::fmt::Display;
use std::net::{IpAddr, Ipv6Addr, SocketAddr};
use std::sync::Arc;
use std::time::Duration;
use std::{
fmt::Display,
net::{IpAddr, Ipv6Addr, SocketAddr},
sync::Arc,
time::Duration,
};
use anyhow::Result;
use axum::Router;
use axum::extract::Request;
use axum::middleware::{self, Next};
use axum::response::{IntoResponse, Response};
use axum::routing::{get, post};
use axum::{
extract::Request,
middleware::{self, Next},
response::{IntoResponse, Response},
routing::{get, post},
Router,
};
use http::StatusCode;
use jsonwebtoken::jwk::JwkSet;
use tokio::net::TcpListener;
use tower::ServiceBuilder;
use tower_http::{
auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer,
};
use tracing::{Span, error, info};
use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer};
use tracing::{debug, error, info, Span};
use uuid::Uuid;
use super::{
headers::X_REQUEST_ID,
middleware::authorize::Authorize,
routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, insights, metrics, metrics_json, status, terminate,
},
use super::routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, insights, metrics, metrics_json, status, terminate,
};
use crate::compute::ComputeNode;
const X_REQUEST_ID: &str = "x-request-id";
/// `compute_ctl` has two servers: internal and external. The internal server
/// binds to the loopback interface and handles communication from clients on
/// the compute. The external server is what receives communication from the
/// control plane, the metrics scraper, etc. We make the distinction because
/// certain routes in `compute_ctl` only need to be exposed to local processes
/// like Postgres via the neon extension and local_proxy.
#[derive(Clone, Debug)]
#[derive(Clone, Copy, Debug)]
pub enum Server {
Internal {
port: u16,
},
External {
port: u16,
jwks: JwkSet,
compute_id: String,
},
Internal(u16),
External(u16),
}
impl Display for Server {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Server::Internal { .. } => f.write_str("internal"),
Server::External { .. } => f.write_str("external"),
Server::Internal(_) => f.write_str("internal"),
Server::External(_) => f.write_str("external"),
}
}
}
impl From<&Server> for Router<Arc<ComputeNode>> {
fn from(server: &Server) -> Self {
impl From<Server> for Router<Arc<ComputeNode>> {
fn from(server: Server) -> Self {
let mut router = Router::<Arc<ComputeNode>>::new();
router = match server {
Server::Internal { .. } => {
Server::Internal(_) => {
router = router
.route(
"/extension_server/{*filename}",
@@ -79,71 +72,58 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
router
}
Server::External {
jwks, compute_id, ..
} => {
let unauthenticated_router =
Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate))
.layer(AsyncRequireAuthorizationLayer::new(Authorize::new(
compute_id.clone(),
jwks.clone(),
)));
router
.merge(unauthenticated_router)
.merge(authenticated_router)
}
Server::External(_) => router
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))
.route("/metrics", get(metrics::get_metrics))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate)),
};
router
.fallback(Server::handle_404)
.method_not_allowed_fallback(Server::handle_405)
.layer(
ServiceBuilder::new()
.layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
// Add this middleware since we assume the request ID exists
.layer(middleware::from_fn(maybe_add_request_id_header))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {
let request_id = request
router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
ServiceBuilder::new()
// Add this middleware since we assume the request ID exists
.layer(middleware::from_fn(maybe_add_request_id_header))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {
let request_id = request
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
match request.uri().path() {
"/metrics" => {
debug!(%request_id, "{} {}", request.method(), request.uri())
}
_ => info!(%request_id, "{} {}", request.method(), request.uri()),
};
})
.on_response(
|response: &http::Response<_>, latency: Duration, _span: &Span| {
let request_id = response
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
info!(%request_id, "{} {}", request.method(), request.uri());
})
.on_response(
|response: &http::Response<_>, latency: Duration, _span: &Span| {
let request_id = response
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
info!(
%request_id,
code = response.status().as_u16(),
latency = latency.as_millis()
);
},
),
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
info!(
%request_id,
code = response.status().as_u16(),
latency = latency.as_millis()
)
},
),
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
}
}
@@ -167,15 +147,15 @@ impl Server {
match self {
// TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
// allow binding to localhost
Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
}
}
fn port(&self) -> u16 {
fn port(self) -> u16 {
match self {
Server::Internal { port, .. } => *port,
Server::External { port, .. } => *port,
Server::Internal(port) => port,
Server::External(port) => port,
}
}
@@ -202,9 +182,7 @@ impl Server {
);
}
let router = Router::from(&self)
.with_state(compute)
.into_make_service_with_connect_info::<SocketAddr>();
let router = Router::from(self).with_state(compute);
if let Err(e) = axum::serve(listener, router).await {
error!("compute_ctl {} HTTP server error: {}", self, e);

View File

@@ -1,7 +1,7 @@
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use std::collections::HashMap;
use anyhow::Result;
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use postgres::{Client, NoTls};
use crate::metrics::INSTALLED_EXTENSIONS;

View File

@@ -11,7 +11,6 @@ pub mod http;
pub mod logger;
pub mod catalog;
pub mod compute;
pub mod disk_quota;
pub mod extension_server;
pub mod installed_extensions;
pub mod local_proxy;
@@ -19,9 +18,9 @@ pub mod lsn_lease;
pub mod metrics;
mod migration;
pub mod monitor;
pub mod neonvmd_client;
pub mod params;
pub mod pg_helpers;
pub mod spec;
mod spec_apply;
pub mod swap;
pub mod sync_sk;

View File

@@ -1,5 +1,3 @@
use std::collections::HashMap;
use tracing::info;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
@@ -44,50 +42,3 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
pub fn inlinify(s: &str) -> String {
s.replace('\n', "\u{200B}")
}
pub fn startup_context_from_env() -> Option<opentelemetry::Context> {
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
//
// This is used to propagate the context for the 'start_compute' operation
// from the neon control plane. This allows linking together the wider
// 'start_compute' operation that creates the compute container, with the
// startup actions here within the container.
//
// There is no standard for passing context in env variables, but a lot of
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
//
// Switch to the startup context here, and exit it once the startup has
// completed and Postgres is up and running.
//
// If this pod is pre-created without binding it to any particular endpoint
// yet, this isn't the right place to enter the startup context. In that
// case, the control plane should pass the tracing context as part of the
// /configure API call.
//
// NOTE: This is supposed to only cover the *startup* actions. Once
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
//
// XXX: If the pod is restarted, we perform the startup actions in the same
// context as the original startup actions, which probably doesn't make
// sense.
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
if let Ok(val) = std::env::var("TRACEPARENT") {
startup_tracing_carrier.insert("traceparent".to_string(), val);
}
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry_sdk::propagation::TraceContextPropagator;
info!("got startup tracing context from env variables");
Some(TraceContextPropagator::new().extract(&startup_tracing_carrier))
} else {
None
}
}

View File

@@ -1,15 +1,17 @@
use std::str::FromStr;
use std::sync::Arc;
use std::thread;
use std::time::{Duration, SystemTime};
use anyhow::{Result, bail};
use compute_api::spec::ComputeMode;
use anyhow::bail;
use anyhow::Result;
use postgres::{NoTls, SimpleQueryMessage};
use std::time::SystemTime;
use std::{str::FromStr, sync::Arc, thread, time::Duration};
use utils::id::TenantId;
use utils::id::TimelineId;
use compute_api::spec::ComputeMode;
use tracing::{info, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::shard::{ShardCount, ShardNumber, TenantShardId};
use utils::{
lsn::Lsn,
shard::{ShardCount, ShardNumber, TenantShardId},
};
use crate::compute::ComputeNode;

View File

@@ -1,6 +1,6 @@
use metrics::core::Collector;
use metrics::proto::MetricFamily;
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec};
use once_cell::sync::Lazy;
pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {

View File

@@ -1,14 +1,13 @@
use std::sync::Arc;
use std::thread;
use std::time::Duration;
use std::{thread, time::Duration};
use chrono::{DateTime, Utc};
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeFeature;
use postgres::{Client, NoTls};
use tracing::{debug, error, info, warn};
use crate::compute::ComputeNode;
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeFeature;
const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
@@ -18,7 +17,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
// should be handled gracefully.
fn watch_compute_activity(compute: &ComputeNode) {
// Suppose that `connstr` doesn't change
let connstr = compute.params.connstr.clone();
let connstr = compute.connstr.clone();
let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));
// During startup and configuration we connect to every Postgres database,

View File

@@ -0,0 +1,102 @@
use anyhow::Context;
use hyper::client::conn;
use hyper::client::conn::http1::SendRequest;
use hyper::{Request, StatusCode};
use hyper_util::rt::TokioIo;
use tracing::warn;
const NEONVM_DAEMON_CONTROL_SOCKET_PATH: &str = "/run/neonvm-daemon-socket";
/// Open a connection to neonvm-daemon's control socket, prepare to send
/// requests to it with hyper.
async fn connect_neonvm_daemon<B>() -> anyhow::Result<SendRequest<B>>
where
B: hyper::body::Body + 'static + Send,
B::Data: Send,
B::Error: Into<Box<dyn std::error::Error + Send + Sync>>,
{
let mut attempts = 0;
let stream = loop {
match tokio::net::UnixStream::connect(NEONVM_DAEMON_CONTROL_SOCKET_PATH).await {
Ok(stream) => break stream,
Err(err) if err.kind() == std::io::ErrorKind::NotFound && attempts < 50 => {
// Retry
warn!("neonvm-daemon control socket not found, retrying...");
attempts += 1;
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
}
Err(err) => Err(err).context("opening neonvm-daemon control socket")?,
}
};
let io = TokioIo::new(stream);
let (request_sender, connection) = conn::http1::handshake(io).await.unwrap();
// spawn a task to poll the connection and drive the HTTP state
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("Error in connection: {}", e);
}
});
Ok(request_sender)
}
pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
let rt = tokio::runtime::Handle::current();
rt.block_on(resize_swap_async(size_bytes))
}
pub async fn resize_swap_async(size_bytes: u64) -> anyhow::Result<()> {
let mut neonvmd = connect_neonvm_daemon().await?;
// Passing 'once' causes neonvm-daemon to reject any future resize requests
let request = Request::builder()
.method("POST")
.uri("/resize-swap-once")
.header("Host", "localhost") // hyper requires Host, even though the server won't care
.body(format!("{}", size_bytes))
.unwrap();
let resp = neonvmd.send_request(request).await?;
let status = resp.status();
match status {
StatusCode::OK => Ok(()),
StatusCode::CONFLICT => {
// 409 Conflict means that the swap was already resized. That happens if the
// compute_ctl restarts within the VM. That's considered OK.
warn!("Swap was already resized");
Ok(())
}
_ => Err(anyhow::anyhow!(
"error resizing swap: {}",
status.to_string()
)),
}
}
pub fn set_disk_quota(size_bytes: u64) -> anyhow::Result<()> {
let rt = tokio::runtime::Handle::current();
rt.block_on(set_disk_quota_async(size_bytes))
}
/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
pub async fn set_disk_quota_async(size_bytes: u64) -> anyhow::Result<()> {
let mut neonvmd = connect_neonvm_daemon().await?;
let request = Request::builder()
.method("POST")
.uri("/set-disk-quota")
.header("Host", "localhost") // hyper requires Host, even though the server won't care
.body(format!("{}", size_bytes))
.unwrap();
let resp = neonvmd.send_request(request).await?;
let status = resp.status();
match status {
StatusCode::OK => Ok(()),
_ => Err(anyhow::anyhow!(
"error setting disk quota: {}",
status.to_string()
)),
}
}

View File

@@ -9,8 +9,7 @@ use std::process::Child;
use std::str::FromStr;
use std::time::{Duration, Instant};
use anyhow::{Result, bail};
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
use anyhow::{bail, Result};
use futures::StreamExt;
use ini::Ini;
use notify::{RecursiveMode, Watcher};
@@ -22,6 +21,8 @@ use tokio_postgres;
use tokio_postgres::NoTls;
use tracing::{debug, error, info, instrument};
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
/// Escape a string for including it in a SQL literal.

View File

@@ -1,21 +1,21 @@
use anyhow::{anyhow, bail, Result};
use reqwest::StatusCode;
use std::fs::File;
use std::path::Path;
use anyhow::{Result, anyhow, bail};
use compute_api::responses::{
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
};
use compute_api::spec::ComputeSpec;
use reqwest::StatusCode;
use tokio_postgres::Client;
use tracing::{error, info, instrument, warn};
use crate::config;
use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS};
use crate::metrics::{CPlaneRequestRPC, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
use crate::migration::MigrationRunner;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
use compute_api::responses::{
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
};
use compute_api::spec::ComputeSpec;
// Do control plane request and return response if any. In case of error it
// returns a bool flag indicating whether it makes sense to retry the request
// and a string with error message.
@@ -141,6 +141,7 @@ pub fn get_spec_from_control_plane(
/// Check `pg_hba.conf` and update if needed to allow external connections.
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
// XXX: consider making it a part of spec.json
info!("checking pg_hba.conf");
let pghba_path = pgdata_path.join("pg_hba.conf");
if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
@@ -155,11 +156,12 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
/// Create a standby.signal file
pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
// XXX: consider making it a part of spec.json
info!("adding standby.signal");
let signalfile = pgdata_path.join("standby.signal");
if !signalfile.exists() {
File::create(signalfile)?;
info!("created standby.signal");
File::create(signalfile)?;
} else {
info!("reused pre-existing standby.signal");
}
@@ -168,6 +170,7 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
#[instrument(skip_all)]
pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade");
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
client.simple_query(query).await?;

View File

@@ -1,416 +1,18 @@
use std::collections::{HashMap, HashSet};
use std::fmt::{Debug, Formatter};
use std::future::Future;
use std::iter::{empty, once};
use std::iter::empty;
use std::iter::once;
use std::sync::Arc;
use anyhow::{Context, Result};
use compute_api::responses::ComputeStatus;
use crate::compute::construct_superuser_query;
use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt};
use anyhow::Result;
use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role};
use futures::future::join_all;
use tokio::sync::RwLock;
use tokio_postgres::Client;
use tokio_postgres::error::SqlState;
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
use crate::compute::{ComputeNode, ComputeState, construct_superuser_query};
use crate::pg_helpers::{
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async,
get_existing_roles_async,
};
use crate::spec_apply::ApplySpecPhase::{
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon,
CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
RunInEachDatabase,
};
use crate::spec_apply::PerDatabasePhase::{
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension,
};
impl ComputeNode {
/// Apply the spec to the running PostgreSQL instance.
/// The caller can decide to run with multiple clients in parallel, or
/// single mode. Either way, the commands executed will be the same, and
/// only commands run in different databases are parallelized.
#[instrument(skip_all)]
pub fn apply_spec_sql(
&self,
spec: Arc<ComputeSpec>,
conf: Arc<tokio_postgres::Config>,
concurrency: usize,
) -> Result<()> {
info!("Applying config with max {} concurrency", concurrency);
debug!("Config: {:?}", spec);
let rt = tokio::runtime::Handle::current();
rt.block_on(async {
// Proceed with post-startup configuration. Note, that order of operations is important.
let client = Self::get_maintenance_client(&conf).await?;
let spec = spec.clone();
let databases = get_existing_dbs_async(&client).await?;
let roles = get_existing_roles_async(&client)
.await?
.into_iter()
.map(|role| (role.name.clone(), role))
.collect::<HashMap<String, Role>>();
// Check if we need to drop subscriptions before starting the endpoint.
//
// It is important to do this operation exactly once when endpoint starts on a new branch.
// Otherwise, we may drop not inherited, but newly created subscriptions.
//
// We cannot rely only on spec.drop_subscriptions_before_start flag,
// because if for some reason compute restarts inside VM,
// it will start again with the same spec and flag value.
//
// To handle this, we save the fact of the operation in the database
// in the neon.drop_subscriptions_done table.
// If the table does not exist, we assume that the operation was never performed, so we must do it.
// If table exists, we check if the operation was performed on the current timelilne.
//
let mut drop_subscriptions_done = false;
if spec.drop_subscriptions_before_start {
let timeline_id = self.get_timeline_id().context("timeline_id must be set")?;
let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id);
info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);
drop_subscriptions_done = match
client.simple_query(&query).await {
Ok(result) => {
matches!(&result[0], postgres::SimpleQueryMessage::Row(_))
},
Err(e) =>
{
match e.code() {
Some(&SqlState::UNDEFINED_TABLE) => false,
_ => {
// We don't expect any other error here, except for the schema/table not existing
error!("Error checking if drop subscription operation was already performed: {}", e);
return Err(e.into());
}
}
}
}
};
let jwks_roles = Arc::new(
spec.as_ref()
.local_proxy_config
.iter()
.flat_map(|it| &it.jwks)
.flatten()
.flat_map(|setting| &setting.role_names)
.cloned()
.collect::<HashSet<_>>(),
);
let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext {
roles,
dbs: databases,
}));
// Apply special pre drop database phase.
// NOTE: we use the code of RunInEachDatabase phase for parallelism
// and connection management, but we don't really run it in *each* database,
// only in databases, we're about to drop.
info!("Applying PerDatabase (pre-dropdb) phase");
let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
// Run the phase for each database that we're about to drop.
let db_processes = spec
.delta_operations
.iter()
.flatten()
.filter_map(move |op| {
if op.action.as_str() == "delete_db" {
Some(op.name.clone())
} else {
None
}
})
.map(|dbname| {
let spec = spec.clone();
let ctx = ctx.clone();
let jwks_roles = jwks_roles.clone();
let mut conf = conf.as_ref().clone();
let concurrency_token = concurrency_token.clone();
// We only need dbname field for this phase, so set other fields to dummy values
let db = DB::UserDB(Database {
name: dbname.clone(),
owner: "cloud_admin".to_string(),
options: None,
restrict_conn: false,
invalid: false,
});
debug!("Applying per-database phases for Database {:?}", &db);
match &db {
DB::SystemDB => {}
DB::UserDB(db) => {
conf.dbname(db.name.as_str());
}
}
let conf = Arc::new(conf);
let fut = Self::apply_spec_sql_db(
spec.clone(),
conf,
ctx.clone(),
jwks_roles.clone(),
concurrency_token.clone(),
db,
[DropLogicalSubscriptions].to_vec(),
);
Ok(tokio::spawn(fut))
})
.collect::<Vec<Result<_, anyhow::Error>>>();
for process in db_processes.into_iter() {
let handle = process?;
if let Err(e) = handle.await? {
// Handle the error case where the database does not exist
// We do not check whether the DB exists or not in the deletion phase,
// so we shouldn't be strict about it in pre-deletion cleanup as well.
if e.to_string().contains("does not exist") {
warn!("Error dropping subscription: {}", e);
} else {
return Err(e);
}
};
}
for phase in [
CreateSuperUser,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
RenameAndDeleteDatabases,
CreateAndAlterDatabases,
CreateSchemaNeon,
] {
info!("Applying phase {:?}", &phase);
apply_operations(
spec.clone(),
ctx.clone(),
jwks_roles.clone(),
phase,
|| async { Ok(&client) },
)
.await?;
}
info!("Applying RunInEachDatabase2 phase");
let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
let db_processes = spec
.cluster
.databases
.iter()
.map(|db| DB::new(db.clone()))
// include
.chain(once(DB::SystemDB))
.map(|db| {
let spec = spec.clone();
let ctx = ctx.clone();
let jwks_roles = jwks_roles.clone();
let mut conf = conf.as_ref().clone();
let concurrency_token = concurrency_token.clone();
let db = db.clone();
debug!("Applying per-database phases for Database {:?}", &db);
match &db {
DB::SystemDB => {}
DB::UserDB(db) => {
conf.dbname(db.name.as_str());
}
}
let conf = Arc::new(conf);
let mut phases = vec![
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
];
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
phases.push(DropLogicalSubscriptions);
}
let fut = Self::apply_spec_sql_db(
spec.clone(),
conf,
ctx.clone(),
jwks_roles.clone(),
concurrency_token.clone(),
db,
phases,
);
Ok(tokio::spawn(fut))
})
.collect::<Vec<Result<_, anyhow::Error>>>();
for process in db_processes.into_iter() {
let handle = process?;
handle.await??;
}
let mut phases = vec![
HandleOtherExtensions,
HandleNeonExtension, // This step depends on CreateSchemaNeon
CreateAvailabilityCheck,
DropRoles,
];
// This step depends on CreateSchemaNeon
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
phases.push(FinalizeDropLogicalSubscriptions);
}
for phase in phases {
debug!("Applying phase {:?}", &phase);
apply_operations(
spec.clone(),
ctx.clone(),
jwks_roles.clone(),
phase,
|| async { Ok(&client) },
)
.await?;
}
Ok::<(), anyhow::Error>(())
})?;
Ok(())
}
/// Apply SQL migrations of the RunInEachDatabase phase.
///
/// May opt to not connect to databases that don't have any scheduled
/// operations. The function is concurrency-controlled with the provided
/// semaphore. The caller has to make sure the semaphore isn't exhausted.
async fn apply_spec_sql_db(
spec: Arc<ComputeSpec>,
conf: Arc<tokio_postgres::Config>,
ctx: Arc<tokio::sync::RwLock<MutableApplyContext>>,
jwks_roles: Arc<HashSet<String>>,
concurrency_token: Arc<tokio::sync::Semaphore>,
db: DB,
subphases: Vec<PerDatabasePhase>,
) -> Result<()> {
let _permit = concurrency_token.acquire().await?;
let mut client_conn = None;
for subphase in subphases {
apply_operations(
spec.clone(),
ctx.clone(),
jwks_roles.clone(),
RunInEachDatabase {
db: db.clone(),
subphase,
},
// Only connect if apply_operation actually wants a connection.
// It's quite possible this database doesn't need any queries,
// so by not connecting we save time and effort connecting to
// that database.
|| async {
if client_conn.is_none() {
let db_client = Self::get_maintenance_client(&conf).await?;
client_conn.replace(db_client);
}
let client = client_conn.as_ref().unwrap();
Ok(client)
},
)
.await?;
}
drop(client_conn);
Ok::<(), anyhow::Error>(())
}
/// Choose how many concurrent connections to use for applying the spec changes.
pub fn max_service_connections(
&self,
compute_state: &ComputeState,
spec: &ComputeSpec,
) -> usize {
// If the cluster is in Init state we don't have to deal with user connections,
// and can thus use all `max_connections` connection slots. However, that's generally not
// very efficient, so we generally still limit it to a smaller number.
if compute_state.status == ComputeStatus::Init {
// If the settings contain 'max_connections', use that as template
if let Some(config) = spec.cluster.settings.find("max_connections") {
config.parse::<usize>().ok()
} else {
// Otherwise, try to find the setting in the postgresql_conf string
spec.cluster
.postgresql_conf
.iter()
.flat_map(|conf| conf.split("\n"))
.filter_map(|line| {
if !line.contains("max_connections") {
return None;
}
let (key, value) = line.split_once("=")?;
let key = key
.trim_start_matches(char::is_whitespace)
.trim_end_matches(char::is_whitespace);
let value = value
.trim_start_matches(char::is_whitespace)
.trim_end_matches(char::is_whitespace);
if key != "max_connections" {
return None;
}
value.parse::<usize>().ok()
})
.next()
}
// If max_connections is present, use at most 1/3rd of that.
// When max_connections is lower than 30, try to use at least 10 connections, but
// never more than max_connections.
.map(|limit| match limit {
0..10 => limit,
10..30 => 10,
30.. => limit / 3,
})
// If we didn't find max_connections, default to 10 concurrent connections.
.unwrap_or(10)
} else {
// state == Running
// Because the cluster is already in the Running state, we should assume users are
// already connected to the cluster, and high concurrency could negatively
// impact user connectivity. Therefore, we can limit concurrency to the number of
// reserved superuser connections, which users wouldn't be able to use anyway.
spec.cluster
.settings
.find("superuser_reserved_connections")
.iter()
.filter_map(|val| val.parse::<usize>().ok())
.map(|val| if val > 1 { val - 1 } else { 1 })
.last()
.unwrap_or(3)
}
}
}
use tracing::{debug, info_span, warn, Instrument};
#[derive(Clone)]
pub enum DB {
@@ -872,10 +474,7 @@ async fn get_operations<'a>(
let edb = match databases.get(&db.name) {
Some(edb) => edb,
None => {
warn!(
"skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL",
subphase, db.name
);
warn!("skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", subphase, db.name);
return Ok(Box::new(empty()));
}
};

View File

@@ -1,46 +0,0 @@
use std::path::Path;
use anyhow::{Context, anyhow};
use tracing::{instrument, warn};
pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap";
#[instrument]
pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
// run `/neonvm/bin/resize-swap --once {size_bytes}`
//
// Passing '--once' causes resize-swap to delete itself after successful completion, which
// means that if compute_ctl restarts later, we won't end up calling 'swapoff' while
// postgres is running.
//
// NOTE: resize-swap is not very clever. If present, --once MUST be the first arg.
let child_result = std::process::Command::new("/usr/bin/sudo")
.arg(RESIZE_SWAP_BIN)
.arg("--once")
.arg(size_bytes.to_string())
.spawn();
child_result
.context("spawn() failed")
.and_then(|mut child| child.wait().context("wait() failed"))
.and_then(|status| match status.success() {
true => Ok(()),
false => {
// The command failed. Maybe it was because the resize-swap file doesn't exist?
// The --once flag causes it to delete itself on success so we don't disable swap
// while postgres is running; maybe this is fine.
match Path::new(RESIZE_SWAP_BIN).try_exists() {
Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
// The path doesn't exist; we're actually ok
Ok(false) => {
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
Ok(())
},
}
}
})
// wrap any prior error with the overall context that we couldn't run the command
.with_context(|| {
format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`")
})
}

View File

@@ -1,7 +1,7 @@
#[cfg(test)]
mod config_tests {
use std::fs::{File, remove_file};
use std::fs::{remove_file, File};
use std::io::{Read, Write};
use std::path::Path;

View File

@@ -25,7 +25,7 @@ use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use nix::errno::Errno;
use nix::fcntl::{FcntlArg, FdFlag};
use nix::sys::signal::{Signal, kill};
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use utils::pid_file::{self, PidFileRead};

View File

@@ -5,16 +5,7 @@
//! easier to work with locally. The python tests in `test_runner`
//! rely on `neon_local` to set up the environment for each test.
//!
use std::borrow::Cow;
use std::collections::{BTreeSet, HashMap};
use std::fs::File;
use std::os::fd::AsRawFd;
use std::path::PathBuf;
use std::process::exit;
use std::str::FromStr;
use std::time::Duration;
use anyhow::{Context, Result, anyhow, bail};
use anyhow::{anyhow, bail, Context, Result};
use clap::Parser;
use compute_api::spec::ComputeMode;
use control_plane::endpoint::ComputeControlPlane;
@@ -28,7 +19,7 @@ use control_plane::storage_controller::{
NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
};
use control_plane::{broker, local_env};
use nix::fcntl::{FlockArg, flock};
use nix::fcntl::{flock, FlockArg};
use pageserver_api::config::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
@@ -40,18 +31,27 @@ use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInf
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use safekeeper_api::membership::SafekeeperGeneration;
use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
};
use std::borrow::Cow;
use std::collections::{BTreeSet, HashMap};
use std::fs::File;
use std::os::fd::AsRawFd;
use std::path::PathBuf;
use std::process::exit;
use std::str::FromStr;
use std::time::Duration;
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use tokio::task::JoinSet;
use url::Host;
use utils::auth::{Claims, Scope};
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
use utils::lsn::Lsn;
use utils::project_git_version;
use utils::{
auth::{Claims, Scope},
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
lsn::Lsn,
project_git_version,
};
// Default id of a safekeeper node, if not specified on the command line.
const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1);
@@ -597,15 +597,7 @@ struct EndpointStartCmdArgs {
#[clap(long = "pageserver-id")]
endpoint_pageserver_id: Option<NodeId>,
#[clap(
long,
help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
)]
safekeepers_generation: Option<u32>,
#[clap(
long,
help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
)]
#[clap(long)]
safekeepers: Option<String>,
#[clap(
@@ -626,9 +618,9 @@ struct EndpointStartCmdArgs {
)]
allow_multiple: bool,
#[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
#[arg(default_value = "90s")]
start_timeout: Duration,
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
}
#[derive(clap::Args)]
@@ -895,6 +887,20 @@ fn print_timeline(
Ok(())
}
/// Returns a map of timeline IDs to timeline_id@lsn strings.
/// Connects to the pageserver to query this information.
async fn get_timeline_infos(
env: &local_env::LocalEnv,
tenant_shard_id: &TenantShardId,
) -> Result<HashMap<TimelineId, TimelineInfo>> {
Ok(get_default_pageserver(env)
.timeline_list(tenant_shard_id)
.await?
.into_iter()
.map(|timeline_info| (timeline_info.timeline_id, timeline_info))
.collect())
}
/// Helper function to get tenant id from an optional --tenant_id option or from the config file
fn get_tenant_id(
tenant_id_arg: Option<TenantId>,
@@ -929,9 +935,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
let init_conf: NeonLocalInitConf = if let Some(config_path) = &args.config {
// User (likely the Python test suite) provided a description of the environment.
if args.num_pageservers.is_some() {
bail!(
"Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"
);
bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead");
}
// load and parse the file
let contents = std::fs::read_to_string(config_path).with_context(|| {
@@ -1247,6 +1251,12 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
// where shard 0 is attached, and query there.
let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
.await
.unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
HashMap::new()
});
let timeline_name_mappings = env.timeline_name_mappings();
@@ -1275,9 +1285,12 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
lsn.to_string()
}
_ => {
// As the LSN here refers to the one that the compute is started with,
// we display nothing as it is a primary/hot standby compute.
"---".to_string()
// -> primary endpoint or hot replica
// Use the LSN at the end of the timeline.
timeline_infos
.get(&endpoint.timeline_id)
.map(|bi| bi.last_record_lsn.to_string())
.unwrap_or_else(|| "?".to_string())
}
};
@@ -1325,14 +1338,10 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
match (mode, args.hot_standby) {
(ComputeMode::Static(_), true) => {
bail!(
"Cannot start a node in hot standby mode when it is already configured as a static replica"
)
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
}
(ComputeMode::Primary, true) => {
bail!(
"Cannot start a node as a hot standby replica, it is already configured as primary node"
)
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
}
_ => {}
}
@@ -1359,7 +1368,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
let pageserver_id = args.endpoint_pageserver_id;
let remote_ext_config = &args.remote_ext_config;
let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new);
// If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env.
let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? {
@@ -1435,13 +1443,11 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
endpoint
.start(
&auth_token,
safekeepers_generation,
safekeepers,
pageservers,
remote_ext_config.as_ref(),
stripe_size.0 as usize,
args.create_test_user,
args.start_timeout,
)
.await?;
}

View File

@@ -8,6 +8,7 @@
use std::time::Duration;
use anyhow::Context;
use camino::Utf8PathBuf;
use crate::{background_process, local_env};

View File

@@ -37,23 +37,29 @@
//! ```
//!
use std::collections::BTreeMap;
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
use std::net::IpAddr;
use std::net::Ipv4Addr;
use std::net::SocketAddr;
use std::net::TcpStream;
use std::path::PathBuf;
use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use std::time::Duration;
use std::time::SystemTime;
use std::time::UNIX_EPOCH;
use anyhow::{Context, Result, anyhow, bail};
use anyhow::{anyhow, bail, Context, Result};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
use compute_api::spec::{
Cluster, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role,
};
use nix::sys::signal::{Signal, kill};
use compute_api::responses::ComputeCtlConfig;
use compute_api::spec::Database;
use compute_api::spec::PgIdent;
use compute_api::spec::RemoteExtSpec;
use compute_api::spec::Role;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use pageserver_api::shard::ShardStripeSize;
use reqwest::header::CONTENT_TYPE;
use safekeeper_api::membership::SafekeeperGeneration;
use serde::{Deserialize, Serialize};
use tracing::debug;
use url::Host;
@@ -63,6 +69,9 @@ use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;
use crate::storage_controller::StorageController;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
// contents of a endpoint.json file
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct EndpointConf {
@@ -228,9 +237,7 @@ impl ComputeControlPlane {
});
if let Some((key, _)) = duplicates.next() {
bail!(
"attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported."
);
bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
}
}
Ok(())
@@ -577,17 +584,14 @@ impl Endpoint {
Ok(safekeeper_connstrings)
}
#[allow(clippy::too_many_arguments)]
pub async fn start(
&self,
auth_token: &Option<String>,
safekeepers_generation: Option<SafekeeperGeneration>,
safekeepers: Vec<NodeId>,
pageservers: Vec<(Host, u16)>,
remote_ext_config: Option<&String>,
shard_stripe_size: usize,
create_test_user: bool,
start_timeout: Duration,
) -> Result<()> {
if self.status() == EndpointStatus::Running {
anyhow::bail!("The endpoint is already running");
@@ -659,7 +663,6 @@ impl Endpoint {
timeline_id: Some(self.timeline_id),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
@@ -775,18 +778,17 @@ impl Endpoint {
std::fs::write(pidfile_path, pid.to_string())?;
// Wait for it to start
let mut attempt = 0;
const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
let start_at = Instant::now();
const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min
loop {
attempt += 1;
match self.get_status().await {
Ok(state) => {
match state.status {
ComputeStatus::Init => {
if Instant::now().duration_since(start_at) > start_timeout {
bail!(
"compute startup timed out {:?}; still in Init state",
start_timeout
);
if attempt == MAX_ATTEMPTS {
bail!("compute startup timed out; still in Init state");
}
// keep retrying
}
@@ -813,11 +815,8 @@ impl Endpoint {
}
}
Err(e) => {
if Instant::now().duration_since(start_at) > start_timeout {
return Err(e).context(format!(
"timed out {:?} waiting to connect to compute_ctl HTTP",
start_timeout,
));
if attempt == MAX_ATTEMPTS {
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
}
}
}

View File

@@ -3,22 +3,28 @@
//! Now it also provides init method which acts like a stub for proper installation
//! script which will use local paths.
use std::collections::HashMap;
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::time::Duration;
use std::{env, fs};
use anyhow::{bail, Context};
use anyhow::{Context, bail};
use clap::ValueEnum;
use postgres_backend::AuthType;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use utils::auth::{Claims, encode_from_key_file};
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
use std::collections::HashMap;
use std::env;
use std::fs;
use std::net::IpAddr;
use std::net::Ipv4Addr;
use std::net::SocketAddr;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::time::Duration;
use utils::{
auth::{encode_from_key_file, Claims},
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
};
use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
use crate::pageserver::PageServerNode;
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
use crate::safekeeper::SafekeeperNode;
pub const DEFAULT_PG_VERSION: u32 = 16;
@@ -459,9 +465,7 @@ impl LocalEnv {
if old_timeline_id == &timeline_id {
Ok(())
} else {
bail!(
"branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}"
);
bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
}
} else {
existing_values.push((tenant_id, timeline_id));

View File

@@ -7,6 +7,7 @@
//! ```
//!
use std::collections::HashMap;
use std::io;
use std::io::Write;
use std::num::NonZeroU64;
@@ -14,19 +15,22 @@ use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
use anyhow::{Context, bail};
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
use postgres_connection::{PgConnectionConfig, parse_host_port};
use postgres_connection::{parse_host_port, PgConnectionConfig};
use utils::auth::{Claims, Scope};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::id::NodeId;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::background_process;
use crate::local_env::{LocalEnv, NeonLocalInitPageserverConf, PageServerConf};
use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf};
use crate::{background_process, local_env::LocalEnv};
/// Directory within .neon which will be used by default for LocalFs remote storage.
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
@@ -77,11 +81,7 @@ impl PageServerNode {
&self,
conf: NeonLocalInitPageserverConf,
) -> anyhow::Result<toml_edit::DocumentMut> {
assert_eq!(
&PageServerConf::from(&conf),
&self.conf,
"during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"
);
assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
// TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)

View File

@@ -1,6 +1,3 @@
use std::collections::HashMap;
use std::fmt;
///
/// Module for parsing postgresql.conf file.
///
@@ -9,6 +6,8 @@ use std::fmt;
/// funny stuff like include-directives or funny escaping.
use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::HashMap;
use std::fmt;
/// In-memory representation of a postgresql.conf file
#[derive(Default, Debug)]

View File

@@ -14,15 +14,18 @@ use std::{io, result};
use anyhow::Context;
use camino::Utf8PathBuf;
use http_utils::error::HttpErrorBody;
use postgres_connection::PgConnectionConfig;
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use http_utils::error::HttpErrorBody;
use utils::auth::{Claims, Scope};
use utils::id::NodeId;
use crate::background_process;
use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::{
background_process,
local_env::{LocalEnv, SafekeeperConf},
};
#[derive(Error, Debug)]
pub enum SafekeeperHttpError {

View File

@@ -1,39 +1,44 @@
use std::ffi::OsStr;
use std::fs;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::process::ExitStatus;
use std::str::FromStr;
use std::sync::OnceLock;
use std::time::{Duration, Instant};
use crate::{
background_process,
local_env::{LocalEnv, NeonStorageControllerConf},
};
use camino::{Utf8Path, Utf8PathBuf};
use hyper0::Uri;
use nix::unistd::Pid;
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest,
TenantShardMigrateResponse,
use pageserver_api::{
controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest,
TenantShardMigrateResponse,
},
models::{
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
},
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_api::models::{
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use reqwest::Method;
use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{
ffi::OsStr,
fs,
net::SocketAddr,
path::PathBuf,
process::ExitStatus,
str::FromStr,
sync::OnceLock,
time::{Duration, Instant},
};
use tokio::process::Command;
use tracing::instrument;
use url::Url;
use utils::auth::{Claims, Scope, encode_from_key_file};
use utils::id::{NodeId, TenantId};
use utils::{
auth::{encode_from_key_file, Claims, Scope},
id::{NodeId, TenantId},
};
use whoami::username;
use crate::background_process;
use crate::local_env::{LocalEnv, NeonStorageControllerConf};
pub struct StorageController {
env: LocalEnv,
private_key: Option<Vec<u8>>,
@@ -91,8 +96,7 @@ pub struct AttachHookRequest {
#[derive(Serialize, Deserialize)]
pub struct AttachHookResponse {
#[serde(rename = "gen")]
pub generation: Option<u32>,
pub gen: Option<u32>,
}
#[derive(Serialize, Deserialize)]
@@ -775,7 +779,7 @@ impl StorageController {
)
.await?;
Ok(response.generation)
Ok(response.gen)
}
#[instrument(skip(self))]

View File

@@ -1,28 +1,35 @@
use std::collections::{HashMap, HashSet};
use std::str::FromStr;
use std::time::Duration;
use futures::StreamExt;
use std::{
collections::{HashMap, HashSet},
str::FromStr,
time::Duration,
};
use clap::{Parser, Subcommand};
use futures::StreamExt;
use pageserver_api::controller_api::{
AvailabilityZone, NodeAvailabilityWrapper, NodeConfigureRequest, NodeDescribeResponse,
NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse, PlacementPolicy,
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest,
TenantDescribeResponse, TenantPolicyRequest, TenantShardMigrateRequest,
TenantShardMigrateResponse,
use pageserver_api::{
controller_api::{
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy,
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
},
models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest,
TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_api::models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, ShardParameters,
TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest,
TenantShardSplitResponse,
};
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
use pageserver_client::mgmt_api::{self};
use reqwest::{Method, StatusCode, Url};
use storage_controller_client::control_api::Client;
use utils::id::{NodeId, TenantId, TimelineId};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use storage_controller_client::control_api::Client;
#[derive(Subcommand, Debug)]
enum Command {
/// Register a pageserver with the storage controller. This shouldn't usually be necessary,
@@ -914,9 +921,7 @@ async fn main() -> anyhow::Result<()> {
}
Command::TenantDrop { tenant_id, unclean } => {
if !unclean {
anyhow::bail!(
"This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed."
)
anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed.")
}
storcon_client
.dispatch::<(), ()>(
@@ -928,9 +933,7 @@ async fn main() -> anyhow::Result<()> {
}
Command::NodeDrop { node_id, unclean } => {
if !unclean {
anyhow::bail!(
"This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed."
)
anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed.")
}
storcon_client
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)

View File

@@ -186,7 +186,7 @@ services:
neon-test-extensions:
profiles: ["test-extensions"]
image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
environment:
- PGPASSWORD=cloud_admin
entrypoint:

View File

@@ -51,6 +51,8 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
done
if [ $pg_version -ge 16 ]; then
docker cp ext-src $TEST_CONTAINER_NAME:/
docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl"
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
echo Adding dummy config

View File

@@ -7,7 +7,7 @@ index f255fe6..0a0fa65 100644
GENERATED_SCHEDULE_DEPS = $(TB_DIR)/all_tests $(TB_DIR)/exclude_tests
REGRESS = --schedule $(TB_DIR)/run.sch # Set this again just to be safe
-REGRESS_OPTS = --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF)
+REGRESS_OPTS = --use-existing --dbname=contrib_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF)
+REGRESS_OPTS = --use-existing --dbname=pgtap_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF)
SETUP_SCH = test/schedule/main.sch # schedule to use for test setup; this can be forcibly changed by some targets!
IGNORE_TESTS = $(notdir $(EXCLUDE_TEST_FILES:.sql=))
PARALLEL_TESTS = $(filter-out $(IGNORE_TESTS),$(filter-out $(SERIAL_TESTS),$(ALL_TESTS)))

View File

@@ -6,16 +6,12 @@ generate_id() {
local -n resvar=$1
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
}
echo "${OLD_COMPUTE_TAG}"
echo "${NEW_COMPUTE_TAG}"
echo "${TEST_EXTENSIONS_TAG}"
if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then
echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set
if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEWTAG}" ]; then
echo OLDTAG and NEWTAG must be defined
exit 1
fi
export PG_VERSION=${PG_VERSION:-16}
export PG_TEST_VERSION=${PG_VERSION}
# Waits for compute node is ready
function wait_for_ready {
TIME=0
while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do
@@ -27,45 +23,11 @@ function wait_for_ready {
exit 2
fi
}
# Creates extensions. Gets a string with space-separated extensions as a parameter
function create_extensions() {
for ext in ${1}; do
docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE"
done
}
# Creates a new timeline. Gets the parent ID and an extension name as parameters.
# Saves the timeline ID in the variable EXT_TIMELINE
function create_timeline() {
generate_id new_timeline_id
PARAMS=(
-sbf
-X POST
-H "Content-Type: application/json"
-d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${1}\"}"
"http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/"
)
result=$(curl "${PARAMS[@]}")
echo $result | jq .
EXT_TIMELINE[${2}]=${new_timeline_id}
}
# Checks if the timeline ID of the compute node is expected. Gets the timeline ID as a parameter
function check_timeline() {
TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
if [ "${TID}" != "${1}" ]; then
echo Timeline mismatch
exit 1
fi
}
# Restarts the compute node with the required compute tag and timeline.
# Accepts the tag for the compute node and the timeline as parameters.
function restart_compute() {
docker compose down compute compute_is_ready
COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready
wait_for_ready
check_timeline ${2}
}
declare -A EXT_TIMELINE
EXTENSIONS='[
{"extname": "plv8", "extdir": "plv8-src"},
{"extname": "vector", "extdir": "pgvector-src"},
@@ -85,7 +47,7 @@ EXTENSIONS='[
{"extname": "pg_repack", "extdir": "pg_repack-src"}
]'
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d
wait_for_ready
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
@@ -93,14 +55,13 @@ create_extensions "${EXTNAMES}"
query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')"
new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
docker compose --profile test-extensions down
COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
TAG=${OLDTAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
wait_for_ready
docker compose cp ext-src neon-test-extensions:/
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id")
EXT_TIMELINE["main"]=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
create_timeline "${EXT_TIMELINE["main"]}" init
restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE["init"]}"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE pgtap_regression"
docker compose exec neon-test-extensions psql -d pgtap_regression -c "CREATE EXTENSION pgtap"
create_extensions "${EXTNAMES}"
if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then
exts="${EXTNAMES}"
@@ -111,13 +72,29 @@ fi
if [ -z "${exts}" ]; then
echo "No extensions were upgraded"
else
tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id")
timeline_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
for ext in ${exts}; do
echo Testing ${ext}...
create_timeline "${EXT_TIMELINE["main"]}" ${ext}
EXTDIR=$(echo ${EXTENSIONS} | jq -r '.[] | select(.extname=="'${ext}'") | .extdir')
restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}"
docker compose exec neon-test-extensions psql -d contrib_regression -c "CREATE EXTENSION ${ext} CASCADE"
restart_compute "${NEW_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}"
generate_id new_timeline_id
PARAMS=(
-sbf
-X POST
-H "Content-Type: application/json"
-d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${timeline_id}\"}"
"http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/"
)
result=$(curl "${PARAMS[@]}")
echo $result | jq .
TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} TAG=${OLDTAG} docker compose down compute compute_is_ready
COMPUTE_TAG=${NEWTAG} TAG=${OLDTAG} TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} docker compose up --quiet-pull -d --build compute compute_is_ready
wait_for_ready
TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
if [ ${TID} != ${new_timeline_id} ]; then
echo Timeline mismatch
exit 1
fi
docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then
docker compose exec neon-test-extensions cat /ext-src/${EXTDIR}/regression.diffs

View File

@@ -1,72 +0,0 @@
# Neon changes to Postgres EXPLAIN command
## Why do we need to include more information in EXPLAIN?
Neon contains two components: prefetch and LFC (local file cache) which may have critical impact on query performance.
Both are trying to solve the problem with relatively large round-trip between compute and page server (much larger than average access time for modern SSDs).
This is why Neon can provide comparable performance only if all data set is present at local node.
Certainly the fastest case of accessing data in Postgres is when it is present in Postgres cache (shared buffers).
Unfortunately size of shared buffer can not be changed on the flight: it requires Postgres restart. It is not acceptable for autoscaling.
This is why we have relatively small shared buffers and dynamically resized local file cache (LFC).
It is intended that LFC fits in memory, although it can improve performance even if data is read from local disk.
See https://neondb.slack.com/archives/C03QLRH7PPD/p1718714926044699
To minimize in-memory footprint of LFC and to improve sequential scan, LFC uses chunks which size is larger than size of Postgres page (right now chunk size is 1Mb).
LFC as any other cache is useless after cool restart. Also some data sets can not fit in local disk.
This is where another approach can help: prefetching. If we are able to predict which pages will be needed soon,
compute can send prefetch requests to page server before this page is actually requested by executor.
Prefetch is also used by Postgres (using `fadvise`), but only for vacuum and bitmap scan. Neon provides prefetch for more execution plan nodes:
sequential scan, index scan (prefetch of referenced heap pages), index-only scan (prefetch B-Tree leaves).
As far as work of prefetch and LFC may have critical impact on query performance, we need to provide this information to the users.
The most convenient and natural way is to include it in EXPLAIN. Two new keyword are added by Neon to EXPLAIN options: `prefetch` and `filecache`.
## prefetch
The following information is available about prefetch:
* `hits` - number of pages which are received from page server before actually requested by executor. Prefetch distance is controlled by `effective_io_concurrency` GUC. The larger it is, the more chances that page server will be able to complete request before it is needed. But it should not be larger than `neon.prefetch_buffer_size`.
* `misses` - number of accessed pages which were not prefetched. Prefetch is not implemented for all plan nodes. And even for those nodes for which it is implemented (i.e. sequential scan) some mispredictions are possible. Please notice that `hits + misses != accessed pages`. If prefetch request for the page was issued but not yet completed before the page is requested, then such access is not considered as prefetch hit or miss.
* `expired` - page can be updated by backend since the moment of sending prefetch request to page server. Or result of prefetch just not used because executor doesn't need this page (for example because of presence of `LIMIT` clause in the query). In both cases such requests are considered as expired.
* `duplicates` - multiple prefetch requests for the same page. For some nodes predicting next pages is trivial, i.e. for sequential scan. But in case of index scan we need to prefetch referenced heap pages. And definitely index entries can have multiple references to the same heap page. Such non-unique prefetch requests are considered as duplicates.
## filecache
The following information is available about file cache (LFC):
* `hits` - number of accessed pages found in LFC.
* `misses` - number of accessed pages not found in LFC.
# LFC statistic
While `filecache` option of EXPLAIN command provides information about LFC usage in the particular query, there is also available global statistic about LFC usage.
It is provided by `neon` extension.
## `neon_lfc_stats` view
This view provides information as key-value pairs (so that new information can be added without changing neon extension interface).
The following keys are provided:
* `file_cache_hits` - total number of LFC hits (for all backends and queries since server startup).
* `file_cache_misses` - total number of LFC misses.
* `file_cache_used` - number chunks used in LFC
* `file_cache_writes` - number of pages written to LFC
* `file_cache_size` - current cache size in chunks (can not be larger than `neon.file_cache_size_limit`)
* `file_cache_used_pages` - number is used pages. As far as not all pages of the chunk can be filled with data it can be smaller than `file_cache_used*128` (128 is number of 8kB pages in 1MB chunk)
* `file_cache_evicted_pages` - number of pages evicted from LFC because working set doesn't fit in LFC.
* `file_cache_limit` - current limit of LFC size (in chunks)
## `local_cache` view
This view is similar with `pg_buffercache` view and contains the following columns:
```
(pageoffs int8,
relfilenode oid,
reltablespace oid,
reldatabase oid,
relforknumber int2,
relblocknumber int8,
accesscount int4)
```

View File

@@ -1,7 +1,7 @@
[package]
name = "compute_api"
version = "0.1.0"
edition = "2024"
edition.workspace = true
license.workspace = true
[dependencies]

View File

@@ -1,10 +1,11 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
use crate::{
privilege::Privilege,
responses::ComputeCtlConfig,
spec::{ComputeSpec, ExtVersion, PgIdent},
};
use serde::{Deserialize, Serialize};
use crate::privilege::Privilege;
use crate::responses::ComputeCtlConfig;
use crate::spec::{ComputeSpec, ExtVersion, PgIdent};
/// Request of the /configure API
///
/// We now pass only `spec` in the configuration request, but later we can

View File

@@ -6,8 +6,10 @@ use chrono::{DateTime, Utc};
use jsonwebtoken::jwk::JwkSet;
use serde::{Deserialize, Serialize, Serializer};
use crate::privilege::Privilege;
use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role};
use crate::{
privilege::Privilege,
spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role},
};
#[derive(Serialize, Debug, Deserialize)]
pub struct GenericAPIError {
@@ -134,10 +136,8 @@ pub struct CatalogObjects {
pub databases: Vec<Database>,
}
#[derive(Clone, Debug, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize)]
pub struct ComputeCtlConfig {
/// Set of JSON web keys that the compute can use to authenticate
/// communication from the control plane.
pub jwks: JwkSet,
}

View File

@@ -5,12 +5,13 @@
//! and connect it to the storage nodes.
use std::collections::HashMap;
use regex::Regex;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use regex::Regex;
use remote_storage::RemotePath;
/// String type alias representing Postgres identifier and
/// intended to be used for DB / role names.
pub type PgIdent = String;
@@ -101,17 +102,6 @@ pub struct ComputeSpec {
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,
/// Safekeeper membership config generation. It is put in
/// neon.safekeepers GUC and serves two purposes:
/// 1) Non zero value forces walproposer to use membership configurations.
/// 2) If walproposer wants to update list of safekeepers to connect to
/// taking them from some safekeeper mconf, it should check what value
/// is newer by comparing the generation.
///
/// Note: it could be SafekeeperGeneration, but this needs linking
/// compute_ctl with postgres_ffi.
#[serde(default)]
pub safekeepers_generation: Option<u32>,
#[serde(default)]
pub safekeeper_connstrings: Vec<String>,
@@ -349,9 +339,8 @@ pub struct JwksSettings {
#[cfg(test)]
mod tests {
use std::fs::File;
use super::*;
use std::fs::File;
#[test]
fn allow_installing_remote_extensions() {

View File

@@ -1,7 +1,7 @@
[package]
name = "consumption_metrics"
version = "0.1.0"
edition = "2024"
edition = "2021"
license = "Apache-2.0"
[dependencies]

View File

@@ -1,5 +1,4 @@
use std::collections::VecDeque;
use std::sync::Arc;
use std::{collections::VecDeque, sync::Arc};
use parking_lot::{Mutex, MutexGuard};

View File

@@ -1,7 +1,11 @@
use std::panic::AssertUnwindSafe;
use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU32, Ordering};
use std::sync::{Arc, OnceLock, mpsc};
use std::thread::JoinHandle;
use std::{
panic::AssertUnwindSafe,
sync::{
atomic::{AtomicBool, AtomicU32, AtomicU8, Ordering},
mpsc, Arc, OnceLock,
},
thread::JoinHandle,
};
use tracing::{debug, error, trace};

View File

@@ -1,19 +1,26 @@
use std::cmp::Ordering;
use std::collections::{BinaryHeap, VecDeque};
use std::fmt::{self, Debug};
use std::ops::DerefMut;
use std::sync::{Arc, mpsc};
use std::{
cmp::Ordering,
collections::{BinaryHeap, VecDeque},
fmt::{self, Debug},
ops::DerefMut,
sync::{mpsc, Arc},
};
use parking_lot::lock_api::{MappedMutexGuard, MutexGuard};
use parking_lot::{Mutex, RawMutex};
use parking_lot::{
lock_api::{MappedMutexGuard, MutexGuard},
Mutex, RawMutex,
};
use rand::rngs::StdRng;
use tracing::debug;
use super::chan::Chan;
use super::proto::AnyMessage;
use crate::executor::{self, ThreadContext};
use crate::options::NetworkOptions;
use crate::proto::{NetEvent, NodeEvent};
use crate::{
executor::{self, ThreadContext},
options::NetworkOptions,
proto::NetEvent,
proto::NodeEvent,
};
use super::{chan::Chan, proto::AnyMessage};
pub struct NetworkTask {
options: Arc<NetworkOptions>,

Some files were not shown because too many files have changed in this diff Show More