Compare commits

..

1 Commits

Author SHA1 Message Date
fcdm
3a07c2d488 wip: merged compute image 2024-02-19 22:42:12 +00:00
308 changed files with 5552 additions and 16300 deletions

View File

@@ -16,9 +16,9 @@ assignees: ''
## Implementation ideas
## Tasks
```[tasklist]
- [ ] Example Task
### Tasks
```

View File

@@ -39,7 +39,7 @@ runs:
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${PR_NUMBER}" != "null" ]; then
BRANCH_OR_PR=pr-${PR_NUMBER}
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
# Shortcut for special branches
BRANCH_OR_PR=${GITHUB_REF_NAME}
else
@@ -76,8 +76,8 @@ runs:
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.27.0
ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777
ALLURE_VERSION: 2.24.0
ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90
# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
- name: Acquire lock

View File

@@ -19,7 +19,7 @@ runs:
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${PR_NUMBER}" != "null" ]; then
BRANCH_OR_PR=pr-${PR_NUMBER}
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
# Shortcut for special branches
BRANCH_OR_PR=${GITHUB_REF_NAME}
else

View File

@@ -16,14 +16,8 @@ concurrency:
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name}}
actionlint:
needs: [ check-permissions ]
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

View File

@@ -62,7 +62,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -214,7 +214,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
# Increase timeout to 8h, default timeout is 6h
@@ -362,7 +362,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -461,7 +461,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -558,7 +558,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:

View File

@@ -1,105 +0,0 @@
name: Build build-tools image
on:
workflow_call:
inputs:
image-tag:
description: "build-tools image tag"
required: true
type: string
outputs:
image-tag:
description: "build-tools tag"
value: ${{ inputs.image-tag }}
image:
description: "build-tools image"
value: neondatabase/build-tools:${{ inputs.image-tag }}
defaults:
run:
shell: bash -euo pipefail {0}
concurrency:
group: build-build-tools-image-${{ inputs.image-tag }}
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
check-image:
uses: ./.github/workflows/check-build-tools-image.yml
# This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions)
build-image:
needs: [ check-image ]
if: needs.check-image.outputs.found == 'false'
strategy:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
env:
IMAGE_TAG: ${{ inputs.image-tag }}
steps:
- name: Check `input.tag` is correct
env:
INPUTS_IMAGE_TAG: ${{ inputs.image-tag }}
CHECK_IMAGE_TAG : ${{ needs.check-image.outputs.image-tag }}
run: |
if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then
echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'inputs.image-tag' (${CHECK_IMAGE_TAG})"
exit 1
fi
- uses: actions/checkout@v3
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p /tmp/.docker-custom
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/build-push-action@v4
with:
context: .
provenance: false
push: true
pull: true
file: Dockerfile.build-tools
cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
- name: Remove custom docker config directory
run: |
rm -rf /tmp/.docker-custom
merge-images:
needs: [ build-image ]
runs-on: ubuntu-latest
env:
IMAGE_TAG: ${{ inputs.image-tag }}
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch image
run: |
docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
neondatabase/build-tools:${IMAGE_TAG}-x64 \
neondatabase/build-tools:${IMAGE_TAG}-arm64

View File

@@ -0,0 +1,124 @@
name: Build and Push Docker Image
on:
workflow_call:
inputs:
dockerfile-path:
required: true
type: string
image-name:
required: true
type: string
outputs:
build-tools-tag:
description: "tag generated for build tools"
value: ${{ jobs.tag.outputs.build-tools-tag }}
jobs:
check-if-build-tools-dockerfile-changed:
runs-on: ubuntu-latest
outputs:
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
steps:
- name: Check if Dockerfile.buildtools has changed
id: dockerfile
run: |
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
exit
fi
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
fi
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
tag:
runs-on: ubuntu-latest
needs: [ check-if-build-tools-dockerfile-changed ]
outputs:
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
steps:
- name: Get buildtools tag
env:
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
run: |
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
IMAGE_TAG=$GITHUB_RUN_ID
else
IMAGE_TAG=pinned
fi
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
shell: bash
id: buildtools-tag
kaniko:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
kaniko-arm:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, arm64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
manifest:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
name: 'manifest'
runs-on: [ self-hosted, dev, x64 ]
needs:
- tag
- kaniko
- kaniko-arm
- check-if-build-tools-dockerfile-changed
steps:
- name: Create manifest
run: |
docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
- name: Push manifest
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}

View File

@@ -5,7 +5,6 @@ on:
branches:
- main
- release
- release-proxy
pull_request:
defaults:
@@ -28,9 +27,24 @@ env:
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name}}
runs-on: ubuntu-latest
steps:
- name: Disallow PRs from forks
if: |
github.event_name == 'pull_request' &&
github.event.pull_request.head.repo.full_name != github.repository
run: |
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
else
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
fi
echo >&2 "We don't run CI for PRs from forks"
echo >&2 "${MESSAGE}"
exit 1
cancel-previous-e2e-tests:
needs: [ check-permissions ]
@@ -68,8 +82,6 @@ jobs:
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
@@ -77,25 +89,19 @@ jobs:
shell: bash
id: build-tag
check-build-tools-image:
build-buildtools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
dockerfile-path: Dockerfile.buildtools
image-name: build-tools
secrets: inherit
check-codestyle-python:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -124,13 +130,10 @@ jobs:
run: poetry run mypy .
check-codestyle-rust:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -194,13 +197,10 @@ jobs:
run: cargo deny check --hide-inclusion-graph
build-neon:
needs: [ check-permissions, tag, build-build-tools-image ]
needs: [ check-permissions, tag, build-buildtools-image ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# Raise locked memory limit for tokio-epoll-uring.
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
# io_uring will account the memory of the CQ and SQ as locked.
@@ -438,13 +438,10 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
@@ -475,7 +472,6 @@ jobs:
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
PAGESERVER_GET_VECTORED_IMPL: vectored
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
@@ -488,13 +484,10 @@ jobs:
get-benchmarks-durations:
outputs:
json: ${{ steps.get-benchmark-durations.outputs.json }}
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
steps:
@@ -521,13 +514,10 @@ jobs:
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
benchmarks:
needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -559,15 +549,12 @@ jobs:
# while coverage is currently collected for the debug ones
create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -608,13 +595,10 @@ jobs:
})
coverage-report:
needs: [ check-permissions, regress-tests, build-build-tools-image ]
needs: [ check-permissions, regress-tests, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
strategy:
fail-fast: false
@@ -712,146 +696,166 @@ jobs:
})
trigger-e2e-tests:
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
needs: [ check-permissions, promote-images, tag ]
uses: ./.github/workflows/trigger-e2e-tests.yml
secrets: inherit
neon-image:
needs: [ check-permissions, build-build-tools-image, tag ]
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
- name: Configure ECR and Docker Hub login
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
cat <<-EOF > /kaniko/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
},
"credHelpers": {
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
}
}
EOF
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Kaniko build neon
run:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
- uses: docker/build-push-action@v5
with:
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
provenance: false
push: true
pull: true
file: Dockerfile
cache-from: type=registry,ref=neondatabase/neon:cache
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
tags: |
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
neondatabase/neon:${{needs.tag.outputs.build-tag}}
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
- name: Remove custom docker config directory
if: always()
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
needs: [ check-permissions, build-buildtools-image, tag ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
- name: Configure ECR and Docker Hub login
run: |
rm -rf .docker-custom
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
cat <<-EOF > /kaniko/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
},
"credHelpers": {
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
}
}
EOF
- name: Kaniko build compute tools
run:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-tools
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
--destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
compute-node-image:
needs: [ check-permissions, build-build-tools-image, tag ]
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: gcr.io/kaniko-project/executor:v1.9.2-debug
# Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
# Should be prevented by https://github.com/neondatabase/neon/issues/4281
options: --add-host=download.osgeo.org:140.211.15.30
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
- name: Configure ECR and Docker Hub login
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
with:
# Disable parallelism for docker buildkit.
# As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
config-inline: |
[worker.oci]
max-parallelism = 1
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
cat <<-EOF > /kaniko/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
},
"credHelpers": {
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
}
}
EOF
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Kaniko build compute node with extensions
run:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg PG_VERSION=${{ matrix.version }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-node
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--cleanup
- name: Build compute-node image
uses: docker/build-push-action@v5
with:
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
PG_VERSION=${{ matrix.version }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
provenance: false
push: true
pull: true
file: Dockerfile.compute-node
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
tags: |
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
if: ${{ matrix.version == 'v16' }}
uses: docker/build-push-action@v5
with:
target: compute-tools-image
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
provenance: false
push: true
pull: true
file: Dockerfile.compute-node
tags: |
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
@@ -895,7 +899,7 @@ jobs:
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ check-permissions, tag, neon-image, compute-node-image ]
needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
runs-on: [ self-hosted, gen3, small ]
steps:
@@ -929,8 +933,7 @@ jobs:
fi
- name: Verify docker-compose example
timeout-minutes: 20
run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
- name: Print logs and clean up
if: always()
@@ -963,7 +966,9 @@ jobs:
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
- name: Add latest tag to images
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -975,7 +980,9 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
- name: Push images to production ECR
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
@@ -999,7 +1006,9 @@ jobs:
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
- name: Push latest tags to Docker Hub
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -1089,7 +1098,7 @@ jobs:
deploy:
needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1124,26 +1133,14 @@ jobs:
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
- name: Create git tag
if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
if: github.ref_name == 'release'
uses: actions/github-script@v7
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
@@ -1156,7 +1153,6 @@ jobs:
sha: context.sha,
})
# TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
- name: Create GitHub release
if: github.ref_name == 'release'
uses: actions/github-script@v7
@@ -1209,10 +1205,79 @@ jobs:
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done
pin-build-tools-image:
needs: [ build-build-tools-image, promote-images, regress-tests ]
if: github.ref_name == 'main'
uses: ./.github/workflows/pin-build-tools-image.yml
with:
from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
secrets: inherit
compute-node-image-merged-base:
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure Docker Hub login
run: |
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
cat <<-EOF > ~/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
}
}
EOF
- name: Build merged image base
run: |
docker image build . -f Dockerfile.compute-node-simple -t neondatabase/tmp-compute-node-merged-base-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--build-arg PG_VERSION=${{ matrix.version }} \
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
docker image push neondatabase/tmp-compute-node-merged-base-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
compute-node-image-merged:
needs: [ tag, compute-node-image-merged-base ]
runs-on: ubuntu-latest
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure Docker Hub login
run: |
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
echo ~
cat <<-EOF > ~/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
}
}
EOF
- name: Build merged image
run: |
docker image build . -f Dockerfile.compute-node-merged -t neondatabase/tmp-compute-node-merged:${{needs.tag.outputs.build-tag}} \
--build-arg TAG=${{needs.tag.outputs.build-tag}}
docker image push neondatabase/tmp-compute-node-merged:${{needs.tag.outputs.build-tag}}

View File

@@ -1,58 +0,0 @@
name: Check build-tools image
on:
workflow_call:
outputs:
image-tag:
description: "build-tools image tag"
value: ${{ jobs.check-image.outputs.tag }}
found:
description: "Whether the image is found in the registry"
value: ${{ jobs.check-image.outputs.found }}
defaults:
run:
shell: bash -euo pipefail {0}
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
check-image:
runs-on: ubuntu-latest
outputs:
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
found: ${{ steps.check-image.outputs.found }}
steps:
- name: Get build-tools image tag for the current commit
id: get-build-tools-tag
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
LAST_BUILD_TOOLS_SHA=$(
gh api \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
--method GET \
--field path=Dockerfile.build-tools \
--field sha=${COMMIT_SHA} \
--field per_page=1 \
--jq ".[0].sha" \
"/repos/${GITHUB_REPOSITORY}/commits"
)
echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
- name: Check if such tag found in the registry
id: check-image
env:
IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
run: |
if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
found=true
else
found=false
fi
echo "found=${found}" | tee -a $GITHUB_OUTPUT

View File

@@ -1,36 +0,0 @@
name: Check Permissions
on:
workflow_call:
inputs:
github-event-name:
required: true
type: string
defaults:
run:
shell: bash -euo pipefail {0}
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
check-permissions:
runs-on: ubuntu-latest
steps:
- name: Disallow CI runs on PRs from forks
if: |
inputs.github-event-name == 'pull_request' &&
github.event.pull_request.head.repo.full_name != github.repository
run: |
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
else
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
fi
# TODO: use actions/github-script to post this message as a PR comment
echo >&2 "We don't run CI for PRs from forks"
echo >&2 "${MESSAGE}"
exit 1

View File

@@ -1,32 +0,0 @@
# A workflow from
# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries
name: cleanup caches by a branch
on:
pull_request:
types:
- closed
jobs:
cleanup:
runs-on: ubuntu-latest
steps:
- name: Cleanup
run: |
gh extension install actions/gh-actions-cache
echo "Fetching list of cache key"
cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 )
## Setting this to not fail the workflow while deleting cache keys.
set +e
echo "Deleting caches..."
for cacheKey in $cacheKeysForPR
do
gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
done
echo "Done"
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge

View File

@@ -20,25 +20,7 @@ env:
COPT: '-Werror'
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name}}
check-build-tools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit
check-macos-build:
needs: [ check-permissions ]
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
@@ -134,8 +116,8 @@ jobs:
run: ./run_clippy.sh
check-linux-arm-build:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: [ self-hosted, dev, arm64 ]
env:
@@ -148,10 +130,7 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init
steps:
@@ -258,15 +237,12 @@ jobs:
cargo nextest run --package remote_storage --test test_real_azure
check-codestyle-rust-arm:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: [ self-hosted, dev, arm64 ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -333,17 +309,13 @@ jobs:
run: cargo deny check
gather-rust-build-stats:
needs: [ check-permissions, build-build-tools-image ]
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
env:

View File

@@ -1,72 +0,0 @@
name: 'Pin build-tools image'
on:
workflow_dispatch:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
workflow_call:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
defaults:
run:
shell: bash -euo pipefail {0}
concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }}
permissions: {}
jobs:
tag-image:
runs-on: ubuntu-latest
env:
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: pinned
steps:
- name: Check if we really need to pin the image
id: check-manifests
run: |
docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json
if diff ${FROM_TAG}.json ${TO_TAG}.json; then
skip=true
else
skip=false
fi
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}
- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}

View File

@@ -2,31 +2,12 @@ name: Create Release Branch
on:
schedule:
# It should be kept in sync with if-condition in jobs
- cron: '0 6 * * MON' # Storage release
- cron: '0 6 * * THU' # Proxy release
- cron: '0 6 * * 1'
workflow_dispatch:
inputs:
create-storage-release-branch:
type: boolean
description: 'Create Storage release PR'
required: false
create-proxy-release-branch:
type: boolean
description: 'Create Proxy release PR'
required: false
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
defaults:
run:
shell: bash -euo pipefail {0}
jobs:
create-storage-release-branch:
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
runs-on: ubuntu-latest
create_release_branch:
runs-on: [ ubuntu-latest ]
permissions:
contents: write # for `git push`
@@ -37,67 +18,27 @@ jobs:
with:
ref: main
- name: Set environment variables
run: |
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
- name: Get current date
id: date
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
- name: Create release branch
run: git checkout -b $RELEASE_BRANCH
run: git checkout -b releases/${{ steps.date.outputs.date }}
- name: Push new branch
run: git push origin $RELEASE_BRANCH
run: git push origin releases/${{ steps.date.outputs.date }}
- name: Create pull request into release
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
## Release ${RELEASE_DATE}
## Release ${{ steps.date.outputs.date }}
**Please merge this Pull Request using 'Create a merge commit' button**
**Please merge this PR using 'Create a merge commit'!**
EOF
gh pr create --title "Release ${RELEASE_DATE}" \
gh pr create --title "Release ${{ steps.date.outputs.date }}" \
--body-file "body.md" \
--head "${RELEASE_BRANCH}" \
--head "releases/${{ steps.date.outputs.date }}" \
--base "release"
create-proxy-release-branch:
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
runs-on: ubuntu-latest
permissions:
contents: write # for `git push`
steps:
- name: Check out code
uses: actions/checkout@v4
with:
ref: main
- name: Set environment variables
run: |
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
- name: Create release branch
run: git checkout -b $RELEASE_BRANCH
- name: Push new branch
run: git push origin $RELEASE_BRANCH
- name: Create pull request into release
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
## Proxy release ${RELEASE_DATE}
**Please merge this Pull Request using 'Create a merge commit' button**
EOF
gh pr create --title "Proxy release ${RELEASE_DATE}" \
--body-file "body.md" \
--head "${RELEASE_BRANCH}" \
--base "release-proxy"

View File

@@ -51,8 +51,6 @@ jobs:
echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')

View File

@@ -0,0 +1,70 @@
name: 'Update build tools image tag'
# This workflow it used to update tag of build tools in ECR.
# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
on:
workflow_dispatch:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
to-tag:
description: 'Destination tag'
required: true
type: string
default: 'pinned'
defaults:
run:
shell: bash -euo pipefail {0}
permissions: {}
jobs:
tag-image:
runs-on: [ self-hosted, gen3, small ]
env:
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
steps:
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v2
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- uses: actions/setup-go@v5
with:
go-version: '1.21'
- name: Install crane
run: |
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
- name: Copy images
run: |
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom

1
.gitignore vendored
View File

@@ -9,7 +9,6 @@ test_output/
neon.iml
/.neon
/integration_tests/.neon
compaction-suite-results.*
# Coverage
*.profraw

View File

@@ -1,10 +1,10 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/control_plane/attachment_service @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/control_plane/ @neondatabase/compute @neondatabase/storage
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
/pageserver/ @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/proxy

View File

@@ -74,11 +74,16 @@ We're using the following approach to make it work:
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
## How do I make build-tools image "pinned"
## How do I add the "pinned" tag to an buildtools image?
We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow.
You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
or using GitHub CLI:
```bash
gh workflow -R neondatabase/neon run pin-build-tools-image.yml \
-f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e
```
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-f from-tag=6254913013 \
-f to-tag=pinned \
# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
```

84
Cargo.lock generated
View File

@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "ahash"
version = "0.8.9"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
dependencies = [
"cfg-if",
"const-random",
@@ -284,7 +284,6 @@ dependencies = [
"diesel_migrations",
"futures",
"git-version",
"humantime",
"hyper",
"metrics",
"once_cell",
@@ -1389,9 +1388,9 @@ dependencies = [
[[package]]
name = "crc32c"
version = "0.6.5"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
dependencies = [
"rustc_version",
]
@@ -2959,9 +2958,9 @@ dependencies = [
[[package]]
name = "mio"
version = "0.8.11"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
dependencies = [
"libc",
"log",
@@ -3498,7 +3497,6 @@ dependencies = [
"num_cpus",
"once_cell",
"pageserver_api",
"pageserver_compaction",
"pin-project-lite",
"postgres",
"postgres-protocol",
@@ -3553,9 +3551,7 @@ dependencies = [
"const_format",
"enum-map",
"hex",
"humantime",
"humantime-serde",
"itertools",
"postgres_ffi",
"rand 0.8.5",
"serde",
@@ -3589,53 +3585,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_compaction"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"async-stream",
"async-trait",
"byteorder",
"bytes",
"chrono",
"clap",
"const_format",
"consumption_metrics",
"criterion",
"crossbeam-utils",
"either",
"fail",
"flate2",
"futures",
"git-version",
"hex",
"hex-literal",
"humantime",
"humantime-serde",
"itertools",
"metrics",
"once_cell",
"pageserver_api",
"pin-project-lite",
"rand 0.8.5",
"smallvec",
"svg_fmt",
"sync_wrapper",
"thiserror",
"tokio",
"tokio-io-timeout",
"tokio-util",
"tracing",
"tracing-error",
"tracing-subscriber",
"url",
"utils",
"walkdir",
"workspace_hack",
]
[[package]]
name = "parking"
version = "2.1.1"
@@ -4216,6 +4165,7 @@ dependencies = [
"thiserror",
"tikv-jemalloc-ctl",
"tikv-jemallocator",
"tls-listener",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
@@ -5794,10 +5744,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.36.0"
name = "tls-listener"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
checksum = "81294c017957a1a69794f506723519255879e15a870507faf45dfed288b763dd"
dependencies = [
"futures-util",
"hyper",
"pin-project-lite",
"thiserror",
"tokio",
"tokio-rustls",
]
[[package]]
name = "tokio"
version = "1.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9"
dependencies = [
"backtrace",
"bytes",

View File

@@ -5,7 +5,6 @@ members = [
"control_plane",
"control_plane/attachment_service",
"pageserver",
"pageserver/compaction",
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
@@ -156,6 +155,7 @@ test-context = "0.1"
thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
@@ -199,7 +199,6 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }

View File

@@ -786,22 +786,6 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
#########################################################################################
#
# Layer "pg_partman"
# compile pg_partman extension
#
#########################################################################################
FROM build-deps AS pg-partman-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
#########################################################################################
#
@@ -845,7 +829,6 @@ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -891,17 +874,7 @@ ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Final compute-tools image
#
#########################################################################################
FROM debian:bullseye-slim AS compute-tools-image
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#

View File

@@ -0,0 +1,83 @@
ARG TAG
FROM neondatabase/tmp-compute-node-merged-base-v14:$TAG as pg14
FROM neondatabase/tmp-compute-node-merged-base-v15:$TAG as pg15
FROM neondatabase/tmp-compute-node-merged-base-v16:$TAG as pg16
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM neondatabase/build-tools:pinned AS compute-tools
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
ARG TAG
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
mkdir /var/db/postgres/pgbouncer && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute && \
chmod 0750 /var/db/postgres/pgbouncer && \
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
# create folder for file cache
mkdir -p -m 777 /neon/cache
COPY --from=pg14 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v14
COPY --from=pg15 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v15
COPY --from=pg16 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v16
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check)
# liblz4-1 for lz4
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost* for rdkit
# ca-certificates for communicating with s3 by compute_ctl
RUN apt update && \
apt install --no-install-recommends -y \
gdb \
libicu67 \
liblz4-1 \
libreadline8 \
libboost-iostreams1.74.0 \
libboost-regex1.74.0 \
libboost-serialization1.74.0 \
libboost-system1.74.0 \
libossp-uuid16 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
libprotobuf-c1 \
libsfcgal1 \
libxml2 \
libxslt1.1 \
libzstd1 \
libcurl4-openssl-dev \
locales \
procps \
ca-certificates && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
ENV LANG en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -0,0 +1,159 @@
ARG PG_VERSION
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
#########################################################################################
#
# Layer "pg-build"
# Build Postgres from the neon postgres repository.
#
#########################################################################################
FROM build-deps AS pg-build
ARG PG_VERSION
COPY vendor/postgres-${PG_VERSION} postgres
RUN cd postgres && \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
--with-icu --with-libxml --with-libxslt --with-lz4" && \
if [ "${PG_VERSION}" != "v14" ]; then \
# zstd is available only from PG15
export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
fi && \
eval $CONFIGURE_CMD && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
# Enable some of contrib extensions
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
# In vanilla postgres this function is limited to Postgres role superuser.
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
# so we do it here.
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
# the first loop is for pg_stat_statement extension version <= 1.6
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
filename=$(basename "$file"); \
if echo "$old_list" | grep -q -F "$filename"; then \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
fi; \
done; \
# the second loop is for pg_stat_statement extension versions >= 1.7,
# where pg_stat_statement_reset() got 3 additional arguments
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
filename=$(basename "$file"); \
if ! echo "$old_list" | grep -q -F "$filename"; then \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
fi; \
done
#########################################################################################
#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_test_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_rmgr \
-s install && \
case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/hnsw \
-s install
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Clean up postgres folder before inclusion
#
#########################################################################################
FROM neon-pg-ext-build AS postgres-cleanup-layer
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
RUN cd /usr/local/pgsql/bin && rm ecpg
# Remove headers that we won't need anymore - we've completed installation of all extensions
RUN rm -r /usr/local/pgsql/include
# Remove static postgresql libraries - all compilation is finished, so we
# can now remove these files - they must be included in other binaries by now
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local/pgsql

32
Dockerfile.compute-tools Normal file
View File

@@ -0,0 +1,32 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
WORKDIR /home/nonroot
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
ARG RUSTC_WRAPPER=cachepot
ENV AWS_REGION=eu-central-1
ENV CACHEPOT_S3_KEY_PREFIX=cachepot
ARG CACHEPOT_BUCKET=neon-github-dev
#ARG AWS_ACCESS_KEY_ID
#ARG AWS_SECRET_ACCESS_KEY
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
COPY . .
RUN set -e \
&& mold -run cargo build -p compute_tools --locked --release \
&& cachepot -s
# Final image that only has one binary
FROM debian:bullseye-slim
COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl

View File

@@ -5,7 +5,7 @@
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
## Quick start
Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
Alternatively, compile and run the project [locally](#running-local-installation).
@@ -230,12 +230,6 @@ postgres=# select * from t;
> cargo neon stop
```
More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).
#### Handling build failures
If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.
## Running tests
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
@@ -265,12 +259,6 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th
> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
## Cleanup
For cleaning up the source tree from build artifacts, run `make clean` in the source directory.
For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned!
## Documentation
[docs](/docs) Contains a top-level overview of all available markdown documentation.

View File

@@ -3,10 +3,3 @@ disallowed-methods = [
# Allow this for now, to deny it later once we stop using Handle::block_on completely
# "tokio::runtime::Handle::block_on",
]
disallowed-macros = [
# use std::pin::pin
"futures::pin_mut",
# cannot disallow this, because clippy finds used from tokio macros
#"tokio::pin",
]

View File

@@ -45,6 +45,7 @@ use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use nix::sys::signal::{kill, Signal};
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info};
@@ -52,9 +53,7 @@ use url::Url;
use compute_api::responses::ComputeStatus;
use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
};
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version;
use compute_tools::http::api::launch_http_server;
@@ -395,15 +394,6 @@ fn main() -> Result<()> {
info!("synced safekeepers at lsn {lsn}");
}
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::TerminationPending {
state.status = ComputeStatus::Terminated;
compute.state_changed.notify_all();
// we were asked to terminate gracefully, don't exit to avoid restart
delay_exit = true
}
drop(state);
if let Err(err) = compute.check_for_core_dumps() {
error!("error while checking for core dumps: {err:?}");
}
@@ -533,7 +523,16 @@ fn cli() -> clap::Command {
/// wait for termination which would be easy then.
fn handle_exit_signal(sig: i32) {
info!("received {sig} termination signal");
forward_termination_signal();
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
if ss_pid != 0 {
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
kill(ss_pid, Signal::SIGTERM).ok();
}
let pg_pid = PG_PID.load(Ordering::SeqCst);
if pg_pid != 0 {
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
kill(pg_pid, Signal::SIGTERM).ok();
}
exit(1);
}

View File

@@ -17,8 +17,9 @@ use chrono::{DateTime, Utc};
use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::error::SqlState;
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
use tracing::{debug, error, info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -27,8 +28,6 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
use utils::measured_stream::MeasuredReader;
use nix::sys::signal::{kill, Signal};
use remote_storage::{DownloadError, RemotePath};
use crate::checker::create_availability_check_data;
@@ -764,26 +763,6 @@ impl ComputeNode {
Ok((pg, logs_handle))
}
/// Do post configuration of the already started Postgres. This function spawns a background thread to
/// configure the database after applying the compute spec. Currently, it upgrades the neon extension
/// version. In the future, it may upgrade all 3rd-party extensions.
#[instrument(skip_all)]
pub fn post_apply_config(&self) -> Result<()> {
let connstr = self.connstr.clone();
thread::spawn(move || {
let func = || {
let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_neon_extension_upgrade(&mut client)
.context("handle_neon_extension_upgrade")?;
Ok::<_, anyhow::Error>(())
};
if let Err(err) = func() {
error!("error while post_apply_config: {err:#}");
}
});
Ok(())
}
/// Do initial configuration of the already started Postgres.
#[instrument(skip_all)]
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
@@ -795,34 +774,27 @@ impl ComputeNode {
// but we can create a new one and grant it all privileges.
let connstr = self.connstr.clone();
let mut client = match Client::connect(connstr.as_str(), NoTls) {
Err(e) => match e.code() {
Some(&SqlState::INVALID_PASSWORD)
| Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => {
// connect with zenith_admin if cloud_admin could not authenticate
info!(
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
e
);
let mut zenith_admin_connstr = connstr.clone();
Err(e) => {
info!(
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
e
);
let mut zenith_admin_connstr = connstr.clone();
zenith_admin_connstr
.set_username("zenith_admin")
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
zenith_admin_connstr
.set_username("zenith_admin")
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
let mut client =
Client::connect(zenith_admin_connstr.as_str(), NoTls)
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
// Disable forwarding so that users don't get a cloud_admin role
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
// Disable forwarding so that users don't get a cloud_admin role
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
// reconnect with connstring with expected name
Client::connect(connstr.as_str(), NoTls)?
}
_ => return Err(e.into()),
},
// reconnect with connstring with expected name
Client::connect(connstr.as_str(), NoTls)?
}
Ok(client) => client,
};
@@ -1018,21 +990,18 @@ impl ComputeNode {
let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
let config_time = Utc::now();
if pspec.spec.mode == ComputeMode::Primary {
if !pspec.spec.skip_pg_catalog_updates {
let pgdata_path = Path::new(&self.pgdata);
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are applying config:
// creating new extensions, roles, etc...
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
self.pg_reload_conf()?;
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
let pgdata_path = Path::new(&self.pgdata);
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are applying config:
// creating new extensions, roles, etc...
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
self.pg_reload_conf()?;
self.apply_config(&compute_state)?;
self.apply_config(&compute_state)?;
config::compute_ctl_temp_override_remove(pgdata_path)?;
self.pg_reload_conf()?;
}
self.post_apply_config()?;
config::compute_ctl_temp_override_remove(pgdata_path)?;
self.pg_reload_conf()?;
}
let startup_end_time = Utc::now();
@@ -1353,17 +1322,3 @@ LIMIT 100",
Ok(remote_ext_metrics)
}
}
pub fn forward_termination_signal() {
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
if ss_pid != 0 {
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
kill(ss_pid, Signal::SIGTERM).ok();
}
let pg_pid = PG_PID.load(Ordering::SeqCst);
if pg_pid != 0 {
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
// use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
kill(pg_pid, Signal::SIGQUIT).ok();
}
}

View File

@@ -82,12 +82,6 @@ pub fn write_postgres_conf(
ComputeMode::Replica => {
// hot_standby is 'on' by default, but let's be explicit
writeln!(file, "hot_standby=on")?;
// Inform the replica about the primary state
// Default is 'false'
if let Some(primary_is_running) = spec.primary_is_running {
writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
}
}
}

View File

@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
}
}
*/
use anyhow::Result;
use anyhow::{self, Result};
use anyhow::{bail, Context};
use bytes::Bytes;
use compute_api::spec::RemoteExtSpec;

View File

@@ -5,7 +5,6 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
@@ -13,6 +12,8 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use num_cpus;
use serde_json;
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;
@@ -122,17 +123,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
(&Method::POST, "/terminate") => {
info!("serving /terminate POST request");
match handle_terminate_request(compute).await {
Ok(()) => Response::new(Body::empty()),
Err((msg, code)) => {
error!("error handling /terminate request: {msg}");
render_json_error(&msg, code)
}
}
}
// download extension files from remote extension storage on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
@@ -307,49 +297,6 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
.unwrap()
}
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
{
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::Terminated {
return Ok(());
}
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for termination request: {:?}",
state.status.clone()
);
return Err((msg, StatusCode::PRECONDITION_FAILED));
}
state.status = ComputeStatus::TerminationPending;
compute.state_changed.notify_all();
drop(state);
}
forward_termination_signal();
info!("sent signal and notified waiters");
// Spawn a blocking thread to wait for compute to become Terminated.
// This is needed to do not block the main pool of workers and
// be able to serve other requests while some particular request
// is waiting for compute to finish configuration.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Terminated {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become Terminated, current status: {:?}",
state.status
);
}
Ok(())
})
.await
.unwrap()?;
info!("terminated Postgres");
Ok(())
}
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(port: u16, state: Arc<ComputeNode>) {

View File

@@ -168,29 +168,6 @@ paths:
schema:
$ref: "#/components/schemas/GenericError"
/terminate:
post:
tags:
- Terminate
summary: Terminate Postgres and wait for it to exit
description: ""
operationId: terminate
responses:
200:
description: Result
412:
description: "wrong state"
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: "Unexpected error"
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:
JWT:

View File

@@ -655,9 +655,6 @@ pub fn handle_grants(
// remove this code if possible. The worst thing that could happen is that
// user won't be able to use public schema in NEW databases created in the
// very OLD project.
//
// Also, alter default permissions so that relations created by extensions can be
// used by neon_superuser without permission issues.
let grant_query = "DO $$\n\
BEGIN\n\
IF EXISTS(\n\
@@ -676,15 +673,6 @@ pub fn handle_grants(
GRANT CREATE ON SCHEMA public TO web_access;\n\
END IF;\n\
END IF;\n\
IF EXISTS(\n\
SELECT nspname\n\
FROM pg_catalog.pg_namespace\n\
WHERE nspname = 'public'\n\
)\n\
THEN\n\
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
END IF;\n\
END\n\
$$;"
.to_string();
@@ -744,17 +732,7 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
// - extension was just installed
// - extension was already installed and is up to date
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
client.simple_query(query)?;
Ok(())
}
#[instrument(skip_all)]
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade");
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
info!("update neon extension schema with query: {}", query);
client.simple_query(query)?;
Ok(())
@@ -799,12 +777,6 @@ BEGIN
END
$$;"#,
"GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
// Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else.
"",
"",
"",
"",
// Add new migrations below.
];
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
@@ -831,13 +803,8 @@ $$;"#,
client.simple_query(query)?;
while current_migration < migrations.len() {
let migration = &migrations[current_migration];
if migration.is_empty() {
info!("Skip migration id={}", current_migration);
} else {
info!("Running migration:\n{}\n", migration);
client.simple_query(migration)?;
}
info!("Running migration:\n{}\n", migrations[current_migration]);
client.simple_query(migrations[current_migration])?;
current_migration += 1;
}
let setval = format!(

View File

@@ -1,26 +0,0 @@
# Control Plane and Neon Local
This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
## Example: Start with Postgres 16
To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands.
```shell
cargo neon init --pg-version 16
cargo neon start
cargo neon tenant create --set-default --pg-version 16
cargo neon endpoint create main --pg-version 16
cargo neon endpoint start main
```
## Example: Create Test User and Database
By default, `cargo neon` starts an endpoint with `cloud_admin` and `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint.
```shell
cargo neon endpoint create main --pg-version 16 --update-catalog true
cargo neon endpoint start main --create-test-user true
```
The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command.

View File

@@ -18,7 +18,6 @@ clap.workspace = true
futures.workspace = true
git-version.workspace = true
hyper.workspace = true
humantime.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true

View File

@@ -1,2 +0,0 @@
ALTER TABLE tenant_shards ALTER generation SET NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL;

View File

@@ -1,4 +0,0 @@
ALTER TABLE tenant_shards ALTER generation DROP NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL;

View File

@@ -1,9 +0,0 @@
use utils::auth::{AuthError, Claims, Scope};
pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
if claims.scope != required_scope {
return Err(AuthError("Scope mismatch. Permission denied".into()));
}
Ok(())
}

View File

@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
use control_plane::local_env::LocalEnv;
use hyper::{Method, StatusCode};
use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
@@ -19,66 +19,8 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
pub(crate) const API_CONCURRENCY: usize = 32;
struct ShardedComputeHookTenant {
stripe_size: ShardStripeSize,
shard_count: ShardCount,
shards: Vec<(ShardNumber, NodeId)>,
}
enum ComputeHookTenant {
Unsharded(NodeId),
Sharded(ShardedComputeHookTenant),
}
impl ComputeHookTenant {
/// Construct with at least one shard's information
fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self {
if tenant_shard_id.shard_count.count() > 1 {
Self::Sharded(ShardedComputeHookTenant {
shards: vec![(tenant_shard_id.shard_number, node_id)],
stripe_size,
shard_count: tenant_shard_id.shard_count,
})
} else {
Self::Unsharded(node_id)
}
}
/// Set one shard's location. If stripe size or shard count have changed, Self is reset
/// and drops existing content.
fn update(
&mut self,
tenant_shard_id: TenantShardId,
stripe_size: ShardStripeSize,
node_id: NodeId,
) {
match self {
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
*existing_node_id = node_id
}
Self::Sharded(sharded_tenant)
if sharded_tenant.stripe_size == stripe_size
&& sharded_tenant.shard_count == tenant_shard_id.shard_count =>
{
if let Some(existing) = sharded_tenant
.shards
.iter()
.position(|s| s.0 == tenant_shard_id.shard_number)
{
sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id;
} else {
sharded_tenant
.shards
.push((tenant_shard_id.shard_number, node_id));
sharded_tenant.shards.sort_by_key(|s| s.0)
}
}
_ => {
// Shard count changed: reset struct.
*self = Self::new(tenant_shard_id, stripe_size, node_id);
}
}
}
pub(super) struct ComputeHookTenant {
shards: Vec<(ShardIndex, NodeId)>,
}
#[derive(Serialize, Deserialize, Debug)]
@@ -91,7 +33,6 @@ struct ComputeHookNotifyRequestShard {
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>,
shards: Vec<ComputeHookNotifyRequestShard>,
}
@@ -122,43 +63,42 @@ pub(crate) enum NotifyError {
}
impl ComputeHookTenant {
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
match self {
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
tenant_id,
shards: vec![ComputeHookNotifyRequestShard {
shard_number: ShardNumber(0),
node_id: *node_id,
}],
stripe_size: None,
}),
Self::Sharded(sharded_tenant)
if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize =>
{
Some(ComputeHookNotifyRequest {
tenant_id,
shards: sharded_tenant
.shards
.iter()
.map(|(shard_number, node_id)| ComputeHookNotifyRequestShard {
shard_number: *shard_number,
node_id: *node_id,
})
.collect(),
stripe_size: Some(sharded_tenant.stripe_size),
})
}
Self::Sharded(sharded_tenant) => {
// Sharded tenant doesn't yet have information for all its shards
async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
// Find the highest shard count and drop any shards that aren't
// for that shard count.
let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
let Some(shard_count) = shard_count else {
// No shards, nothing to do.
tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
return None;
};
tracing::info!(
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
sharded_tenant.shards.len(),
sharded_tenant.shard_count.count()
);
None
}
self.shards.retain(|(k, _v)| k.shard_count == shard_count);
self.shards
.sort_by_key(|(shard, _node_id)| shard.shard_number);
if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
// We have pageservers for all the shards: emit a configuration update
return Some(ComputeHookNotifyRequest {
tenant_id,
shards: self
.shards
.iter()
.map(|(shard, node_id)| ComputeHookNotifyRequestShard {
shard_number: shard.shard_number,
node_id: *node_id,
})
.collect(),
});
} else {
tracing::info!(
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
self.shards.len(),
shard_count.count()
);
}
None
}
}
@@ -199,11 +139,7 @@ impl ComputeHook {
};
let cplane =
ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
let ComputeHookNotifyRequest {
tenant_id,
shards,
stripe_size,
} = reconfigure_request;
let ComputeHookNotifyRequest { tenant_id, shards } = reconfigure_request;
let compute_pageservers = shards
.into_iter()
@@ -220,9 +156,7 @@ impl ComputeHook {
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint
.reconfigure(compute_pageservers.clone(), stripe_size)
.await?;
endpoint.reconfigure(compute_pageservers.clone()).await?;
}
}
@@ -337,26 +271,30 @@ impl ComputeHook {
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
stripe_size: ShardStripeSize,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let mut locked = self.state.lock().await;
let entry = locked
.entry(tenant_shard_id.tenant_id)
.or_insert_with(|| ComputeHookTenant { shards: Vec::new() });
use std::collections::hash_map::Entry;
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
let shard_index = ShardIndex {
shard_count: tenant_shard_id.shard_count,
shard_number: tenant_shard_id.shard_number,
};
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
let mut set = false;
for (existing_shard, existing_node) in &mut entry.shards {
if *existing_shard == shard_index {
*existing_node = node_id;
set = true;
}
}
if !set {
entry.shards.push((shard_index, node_id));
}
let reconfigure_request = entry.maybe_reconfigure(tenant_shard_id.tenant_id).await;
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
@@ -378,85 +316,3 @@ impl ComputeHook {
}
}
}
#[cfg(test)]
pub(crate) mod tests {
use pageserver_api::shard::{ShardCount, ShardNumber};
use utils::id::TenantId;
use super::*;
#[test]
fn tenant_updates() -> anyhow::Result<()> {
let tenant_id = TenantId::generate();
let mut tenant_state = ComputeHookTenant::new(
TenantShardId {
tenant_id,
shard_count: ShardCount::new(0),
shard_number: ShardNumber(0),
},
ShardStripeSize(12345),
NodeId(1),
);
// An unsharded tenant is always ready to emit a notification
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
1
);
assert!(tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size
.is_none());
// Writing the first shard of a multi-sharded situation (i.e. in a split)
// resets the tenant state and puts it in an non-notifying state (need to
// see all shards)
tenant_state.update(
TenantShardId {
tenant_id,
shard_count: ShardCount::new(2),
shard_number: ShardNumber(1),
},
ShardStripeSize(32768),
NodeId(1),
);
assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
// Writing the second shard makes it ready to notify
tenant_state.update(
TenantShardId {
tenant_id,
shard_count: ShardCount::new(2),
shard_number: ShardNumber(0),
},
ShardStripeSize(32768),
NodeId(1),
);
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
2
);
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size,
Some(ShardStripeSize(32768))
);
Ok(())
}
}

View File

@@ -1,19 +1,18 @@
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use crate::PlacementPolicy;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TimelineCreateRequest,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use std::sync::Arc;
use std::time::{Duration, Instant};
use utils::auth::{Scope, SwappableJwtAuth};
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
use utils::http::request::{must_get_query_param, parse_request_param};
use utils::auth::SwappableJwtAuth;
use utils::http::endpoint::{auth_middleware, request_span};
use utils::http::request::parse_request_param;
use utils::id::{TenantId, TimelineId};
use utils::{
@@ -26,12 +25,12 @@ use utils::{
id::NodeId,
};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
};
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
use control_plane::attachment_service::{
AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
TenantShardMigrateRequest,
};
/// State available to HTTP request handlers
#[derive(Clone)]
@@ -65,18 +64,21 @@ fn get_state(request: &Request<Body>) -> &HttpState {
/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::GenerationsApi)?;
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
json_response(
StatusCode::OK,
state
.service
.re_attach(reattach_req)
.await
.map_err(ApiError::InternalServerError)?,
)
}
/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::GenerationsApi)?;
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.validate(validate_req))
@@ -86,8 +88,6 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
let state = get_state(&req);
@@ -102,8 +102,6 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
}
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
let state = get_state(&req);
@@ -115,18 +113,8 @@ async fn handle_tenant_create(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy = PlacementPolicy::Single;
json_response(
StatusCode::CREATED,
service.tenant_create(create_req, placement_policy).await?,
)
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
}
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
@@ -180,8 +168,6 @@ async fn handle_tenant_location_config(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
json_response(
StatusCode::OK,
@@ -191,76 +177,11 @@ async fn handle_tenant_location_config(
)
}
async fn handle_tenant_config_set(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;
let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
}
async fn handle_tenant_config_get(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
}
async fn handle_tenant_time_travel_remote_storage(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
let timestamp_raw = must_get_query_param(&req, "travel_to")?;
let _timestamp = humantime::parse_rfc3339(&timestamp_raw).map_err(|_e| {
ApiError::BadRequest(anyhow::anyhow!(
"Invalid time for travel_to: {timestamp_raw:?}"
))
})?;
let done_if_after_raw = must_get_query_param(&req, "done_if_after")?;
let _done_if_after = humantime::parse_rfc3339(&done_if_after_raw).map_err(|_e| {
ApiError::BadRequest(anyhow::anyhow!(
"Invalid time for done_if_after: {done_if_after_raw:?}"
))
})?;
service
.tenant_time_travel_remote_storage(
&time_travel_req,
tenant_id,
timestamp_raw,
done_if_after_raw,
)
.await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_secondary_download(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
service.tenant_secondary_download(tenant_id).await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_delete(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
deletion_wrapper(service, move |service| async move {
service.tenant_delete(tenant_id).await
@@ -273,11 +194,9 @@ async fn handle_tenant_timeline_create(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
json_response(
StatusCode::CREATED,
StatusCode::OK,
service
.tenant_timeline_create(tenant_id, create_req)
.await?,
@@ -289,8 +208,6 @@ async fn handle_tenant_timeline_delete(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
deletion_wrapper(service, move |service| async move {
@@ -304,7 +221,6 @@ async fn handle_tenant_timeline_passthrough(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let Some(path) = req.uri().path_and_query() else {
// This should never happen, our request router only calls us if there is a path
@@ -348,15 +264,11 @@ async fn handle_tenant_locate(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
let state = get_state(&req);
state.service.node_register(register_req).await?;
@@ -364,23 +276,17 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
}
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.node_list().await?)
}
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
}
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let node_id: NodeId = parse_request_param(&req, "node_id")?;
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
if node_id != config_req.node_id {
@@ -390,18 +296,13 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
}
let state = get_state(&req);
json_response(
StatusCode::OK,
state.service.node_configure(config_req).await?,
)
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
}
async fn handle_tenant_shard_split(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
@@ -415,8 +316,6 @@ async fn handle_tenant_shard_migrate(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
json_response(
@@ -429,30 +328,22 @@ async fn handle_tenant_shard_migrate(
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
}
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
state.service.tenants_dump()
}
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
state.service.scheduler_dump()
}
async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.consistency_check().await?)
@@ -509,12 +400,6 @@ where
.await
}
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
check_permission_with(request, |claims| {
crate::auth::check_permission(claims, required_scope)
})
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
@@ -586,21 +471,9 @@ pub fn make_router(
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set)
})
.get("/v1/tenant/:tenant_id/config", |r| {
tenant_service_handler(r, handle_tenant_config_get)
})
.put("/v1/tenant/:tenant_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
})
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
})
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
tenant_service_handler(r, handle_tenant_secondary_download)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)

View File

@@ -1,7 +1,6 @@
use serde::{Deserialize, Serialize};
use utils::seqwait::MonotonicCounter;
mod auth;
mod compute_hook;
pub mod http;
pub mod metrics;
@@ -13,20 +12,14 @@ mod schema;
pub mod service;
mod tenant_state;
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Clone, Serialize, Deserialize, Debug)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Create one secondary mode locations. This is useful when onboarding
/// a tenant, or for an idle tenant that we might want to bring online quickly.
Secondary,
/// Do not attach to any pageservers. This is appropriate for tenants that
/// have been idle for a long time, where we do not mind some delay in making
/// them available in future.
/// Do not attach to any pageservers
Detached,
}

View File

@@ -9,7 +9,7 @@ use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use aws_config::{BehaviorVersion, Region};
use aws_config::{self, BehaviorVersion, Region};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
@@ -79,38 +79,13 @@ impl Secrets {
"neon-storage-controller-control-plane-jwt-token";
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
/// Load secrets from, in order of preference:
/// - CLI args if database URL is provided on the CLI
/// - Environment variables if DATABASE_URL is set.
/// - AWS Secrets Manager secrets
async fn load(args: &Cli) -> anyhow::Result<Self> {
match &args.database_url {
Some(url) => Self::load_cli(url, args),
None => match std::env::var(Self::DATABASE_URL_ENV) {
Ok(database_url) => Self::load_env(database_url),
Err(_) => Self::load_aws_sm().await,
},
None => Self::load_aws_sm().await,
}
}
fn load_env(database_url: String) -> anyhow::Result<Self> {
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
Err(_) => None,
};
Ok(Self {
database_url,
public_key,
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
})
}
async fn load_aws_sm() -> anyhow::Result<Self> {
let Ok(region) = std::env::var("AWS_REGION") else {
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");

View File

@@ -1,4 +1,4 @@
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use serde::Serialize;
use utils::id::NodeId;
@@ -10,7 +10,7 @@ use crate::persistence::NodePersistence;
///
/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
/// implementation of serialization on this type is only for debug dumps.
#[derive(Clone, Serialize)]
#[derive(Clone, Serialize, Eq, PartialEq)]
pub(crate) struct Node {
pub(crate) id: NodeId,

View File

@@ -6,12 +6,10 @@ use std::time::Duration;
use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use diesel::pg::PgConnection;
use diesel::{
Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
Selectable, SelectableHelper,
};
use pageserver_api::controller_api::NodeSchedulingPolicy;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use serde::{Deserialize, Serialize};
@@ -132,10 +130,24 @@ impl Persistence {
}
/// At startup, populate the list of nodes which our shards may be placed on
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
let nodes: Vec<NodePersistence> = self
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
let nodes: Vec<Node> = self
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
Ok(crate::schema::nodes::table
.load::<NodePersistence>(conn)?
.into_iter()
.map(|n| Node {
id: NodeId(n.node_id as u64),
// At startup we consider a node offline until proven otherwise.
availability: NodeAvailability::Offline,
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
.expect("Bad scheduling policy in DB"),
listen_http_addr: n.listen_http_addr,
listen_http_port: n.listen_http_port as u16,
listen_pg_addr: n.listen_pg_addr,
listen_pg_port: n.listen_pg_port as u16,
})
.collect::<Vec<Node>>())
})
.await?;
@@ -144,31 +156,6 @@ impl Persistence {
Ok(nodes)
}
pub(crate) async fn update_node(
&self,
input_node_id: NodeId,
input_scheduling: NodeSchedulingPolicy,
) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
let updated = self
.with_conn(move |conn| {
let updated = diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.set((scheduling_policy.eq(String::from(input_scheduling)),))
.execute(conn)?;
Ok(updated)
})
.await?;
if updated != 1 {
Err(DatabaseError::Logical(format!(
"Node {node_id:?} not found for update",
)))
} else {
Ok(())
}
}
/// At startup, load the high level state for shards, such as their config + policy. This will
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
@@ -333,15 +320,7 @@ impl Persistence {
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
};
let Some(g) = tsp.generation else {
// If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
// we only set generation_pageserver when setting generation.
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
));
};
result.insert(tenant_shard_id, Generation::new(g as u32));
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
}
Ok(result)
@@ -374,85 +353,7 @@ impl Persistence {
})
.await?;
// Generation is always non-null in the rseult: if the generation column had been NULL, then we
// should have experienced an SQL Confilict error while executing a query that tries to increment it.
debug_assert!(updated.generation.is_some());
let Some(g) = updated.generation else {
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
)
.into());
};
Ok(Generation::new(g as u32))
}
/// For use when updating a persistent property of a tenant, such as its config or placement_policy.
///
/// Do not use this for settting generation, unless in the special onboarding code path (/location_config)
/// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing
/// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
input_placement_policy: PlacementPolicy,
input_config: TenantConfig,
input_generation: Option<Generation>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
if let Some(input_generation) = input_generation {
// Update includes generation column
query
.set((
generation.eq(Some(input_generation.into().unwrap() as i32)),
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
}
Ok(())
})
.await?;
Ok(())
}
pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;
Ok(())
})
.await?;
Ok(())
Ok(Generation::new(updated.generation as u32))
}
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
@@ -463,7 +364,7 @@ impl Persistence {
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.set((
generation_pageserver.eq(Option::<i64>::None),
generation_pageserver.eq(i64::MAX),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
))
.execute(conn)?;
@@ -589,15 +490,12 @@ pub(crate) struct TenantShardPersistence {
pub(crate) shard_stripe_size: i32,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// Generation is only None when first onboarding a tenant, where it may
// be in PlacementPolicy::Secondary and therefore have no valid generation state.
pub(crate) generation: Option<i32>,
// and use the incremented number when attaching
pub(crate) generation: i32,
// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: Option<i64>,
pub(crate) generation_pageserver: i64,
#[serde(default)]
pub(crate) placement_policy: String,
@@ -608,7 +506,7 @@ pub(crate) struct TenantShardPersistence {
}
/// Parts of [`crate::node::Node`] that are stored durably
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
#[diesel(table_name = crate::schema::nodes)]
pub(crate) struct NodePersistence {
pub(crate) node_id: i64,

View File

@@ -1,6 +1,6 @@
use crate::persistence::Persistence;
use crate::service;
use pageserver_api::controller_api::NodeAvailability;
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
@@ -26,7 +26,7 @@ pub(super) struct Reconciler {
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) intent: TargetState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,
@@ -104,7 +104,6 @@ impl Reconciler {
node_id: NodeId,
config: LocationConfig,
flush_ms: Option<Duration>,
lazy: bool,
) -> anyhow::Result<()> {
let node = self
.pageservers
@@ -119,7 +118,7 @@ impl Reconciler {
let client =
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
client
.location_config(self.tenant_shard_id, config.clone(), flush_ms, lazy)
.location_config(self.tenant_shard_id, config.clone(), flush_ms)
.await?;
tracing::info!("location_config({}) complete: {:?}", node_id, config);
@@ -313,16 +312,11 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedStale,
self.generation,
Some(self.generation),
None,
);
self.location_config(
origin_ps_id,
stale_conf,
Some(Duration::from_secs(10)),
false,
)
.await?;
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
.await?;
let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?);
@@ -341,23 +335,21 @@ impl Reconciler {
}
// Increment generation before attaching to new pageserver
self.generation = Some(
self.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?,
);
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?;
let dest_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedMulti,
self.generation,
Some(self.generation),
None,
);
tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
self.location_config(dest_ps_id, dest_conf, None, false)
.await?;
self.location_config(dest_ps_id, dest_conf, None).await?;
if let Some(baseline) = baseline_lsns {
tracing::info!("🕑 Waiting for LSN to catch up...");
@@ -389,7 +381,7 @@ impl Reconciler {
None,
Some(LocationConfigSecondary { warm: true }),
);
self.location_config(origin_ps_id, origin_secondary_conf.clone(), None, false)
self.location_config(origin_ps_id, origin_secondary_conf.clone(), None)
.await?;
// TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
// partway through. In fact, all location conf API calls should be in a wrapper that sets
@@ -409,10 +401,10 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedSingle,
self.generation,
Some(self.generation),
None,
);
self.location_config(dest_ps_id, dest_final_conf.clone(), None, false)
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
.await?;
self.observed.locations.insert(
dest_ps_id,
@@ -441,67 +433,24 @@ impl Reconciler {
// If the attached pageserver is not attached, do so now.
if let Some(node_id) = self.intent.attached {
// If we are in an attached policy, then generation must have been set (null generations
// are only present when a tenant is initially loaded with a secondary policy)
debug_assert!(self.generation.is_some());
let Some(generation) = self.generation else {
return Err(ReconcileError::Other(anyhow::anyhow!(
"Attempted to attach with NULL generation"
)));
};
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let mut wanted_conf =
attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!(%node_id, "Observed configuration already correct.")
tracing::info!("Observed configuration already correct.")
}
observed => {
_ => {
// In all cases other than a matching observed configuration, we will
// reconcile this location. This includes locations with different configurations, as well
// as locations with unknown (None) observed state.
// The general case is to increment the generation. However, there are cases
// where this is not necessary:
// - if we are only updating the TenantConf part of the location
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
// and the location was already in the correct generation
let increment_generation = match observed {
None => true,
Some(ObservedStateLocation { conf: None }) => true,
Some(ObservedStateLocation {
conf: Some(observed),
}) => {
let generations_match = observed.generation == wanted_conf.generation;
use LocationConfigMode::*;
let mode_transition_requires_gen_inc =
match (observed.mode, wanted_conf.mode) {
// Usually the short-lived attachment modes (multi and stale) are only used
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
(AttachedSingle, AttachedStale) => false,
(AttachedMulti, AttachedSingle) => false,
(lhs, rhs) => lhs != rhs,
};
!generations_match || mode_transition_requires_gen_inc
}
};
if increment_generation {
let generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
self.generation = Some(generation);
wanted_conf.generation = generation.into();
}
tracing::info!(%node_id, "Observed configuration requires update.");
// Use lazy=true, because we may run many of Self concurrently, and do not want to
// overload the pageserver with logical size calculations.
self.location_config(node_id, wanted_conf, None, true)
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
wanted_conf.generation = self.generation.into();
tracing::info!("Observed configuration requires update.");
self.location_config(node_id, wanted_conf, None).await?;
self.compute_notify().await?;
}
}
@@ -553,7 +502,7 @@ impl Reconciler {
if self.cancel.is_cancelled() {
return Err(ReconcileError::Cancel);
}
self.location_config(node_id, conf, None, false).await?;
self.location_config(node_id, conf, None).await?;
}
Ok(())
@@ -565,12 +514,7 @@ impl Reconciler {
if let Some(node_id) = self.intent.attached {
let result = self
.compute_hook
.notify(
self.tenant_shard_id,
node_id,
self.shard.stripe_size,
&self.cancel,
)
.notify(self.tenant_shard_id, node_id, &self.cancel)
.await;
if let Err(e) = &result {
// It is up to the caller whether they want to drop out on this error, but they don't have to:

View File

@@ -175,34 +175,10 @@ impl Scheduler {
}
}
/// Where we have several nodes to choose from, for example when picking a secondary location
/// to promote to an attached location, this method may be used to pick the best choice based
/// on the scheduler's knowledge of utilization and availability.
///
/// If the input is empty, or all the nodes are not elegible for scheduling, return None: the
/// caller can pick a node some other way.
pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
if nodes.is_empty() {
return None;
}
let node = nodes
.iter()
.map(|node_id| {
let may_schedule = self
.nodes
.get(node_id)
.map(|n| n.may_schedule)
.unwrap_or(false);
(*node_id, may_schedule)
})
.max_by_key(|(_n, may_schedule)| *may_schedule);
// If even the preferred node has may_schedule==false, return None
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
}
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
pub(crate) fn schedule_shard(
&mut self,
hard_exclude: &[NodeId],
) -> Result<NodeId, ScheduleError> {
if self.nodes.is_empty() {
return Err(ScheduleError::NoPageservers);
}
@@ -251,44 +227,44 @@ impl Scheduler {
}
}
#[cfg(test)]
pub(crate) mod test_utils {
use crate::node::Node;
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
use std::collections::HashMap;
use utils::id::NodeId;
/// Test helper: synthesize the requested number of nodes, all in active state.
///
/// Node IDs start at one.
pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
(1..n + 1)
.map(|i| {
(
NodeId(i),
Node {
id: NodeId(i),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
listen_http_addr: format!("httphost-{i}"),
listen_http_port: 80 + i as u16,
listen_pg_addr: format!("pghost-{i}"),
listen_pg_port: 5432 + i as u16,
},
)
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;
use crate::{node::Node, tenant_state::IntentState};
use crate::tenant_state::IntentState;
#[test]
fn scheduler_basic() -> anyhow::Result<()> {
let nodes = test_utils::make_test_nodes(2);
let mut nodes = HashMap::new();
nodes.insert(
NodeId(1),
Node {
id: NodeId(1),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
listen_http_addr: String::new(),
listen_http_port: 0,
listen_pg_addr: String::new(),
listen_pg_port: 0,
},
);
nodes.insert(
NodeId(2),
Node {
id: NodeId(2),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
listen_http_addr: String::new(),
listen_http_port: 0,
listen_pg_addr: String::new(),
listen_pg_port: 0,
},
);
let mut scheduler = Scheduler::new(nodes.values());
let mut t1_intent = IntentState::new();

View File

@@ -17,8 +17,8 @@ diesel::table! {
shard_number -> Int4,
shard_count -> Int4,
shard_stripe_size -> Int4,
generation -> Nullable<Int4>,
generation_pageserver -> Nullable<Int8>,
generation -> Int4,
generation_pageserver -> Int8,
placement_policy -> Varchar,
splitting -> Int2,
config -> Text,

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,7 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use crate::{metrics, persistence::TenantShardPersistence};
use pageserver_api::controller_api::NodeAvailability;
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
@@ -53,11 +53,8 @@ pub(crate) struct TenantState {
pub(crate) sequence: Sequence,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// None represents an incompletely onboarded tenant via the [`Service::location_config`]
// API, where this tenant may only run in PlacementPolicy::Secondary.
pub(crate) generation: Option<Generation>,
// and use the incremented number when attaching
pub(crate) generation: Generation,
// High level description of how the tenant should be set up. Provided
// externally.
@@ -146,23 +143,6 @@ impl IntentState {
}
}
/// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from
/// secondary to attached while maintaining the scheduler's reference counts.
pub(crate) fn promote_attached(
&mut self,
_scheduler: &mut Scheduler,
promote_secondary: NodeId,
) {
// If we call this with a node that isn't in secondary, it would cause incorrect
// scheduler reference counting, since we assume the node is already referenced as a secondary.
debug_assert!(self.secondary.contains(&promote_secondary));
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
self.secondary.retain(|n| n != &promote_secondary);
self.attached = Some(promote_secondary);
}
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
debug_assert!(!self.secondary.contains(&new_secondary));
scheduler.node_inc_ref(new_secondary);
@@ -184,13 +164,6 @@ impl IntentState {
}
}
/// Remove the last secondary node from the list of secondaries
pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
if let Some(node_id) = self.secondary.pop() {
scheduler.node_dec_ref(node_id);
}
}
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
@@ -218,16 +191,12 @@ impl IntentState {
&self.secondary
}
/// If the node is in use as the attached location, demote it into
/// the list of secondary locations. This is used when a node goes offline,
/// and we want to use a different node for attachment, but not permanently
/// forget the location on the offline node.
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
self.attached = None;
self.secondary.push(node_id);
true
@@ -327,7 +296,7 @@ pub(crate) struct ReconcileResult {
pub(crate) result: Result<(), ReconcileError>,
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) observed: ObservedState,
/// Set [`TenantState::pending_compute_notification`] from this flag
@@ -352,7 +321,7 @@ impl TenantState {
tenant_shard_id,
policy,
intent: IntentState::default(),
generation: Some(Generation::new(0)),
generation: Generation::new(0),
shard,
observed: ObservedState::default(),
config: TenantConfig::default(),
@@ -401,9 +370,6 @@ impl TenantState {
// All remaining observed locations generate secondary intents. This includes None
// observations, as these may well have some local content on disk that is usable (this
// is an edge case that might occur if we restarted during a migration or other change)
//
// We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`]
// will take care of promoting one of these secondaries to be attached.
self.observed.locations.keys().for_each(|node_id| {
if Some(*node_id) != self.intent.attached {
self.intent.secondary.push(*node_id);
@@ -411,33 +377,6 @@ impl TenantState {
});
}
/// Part of [`Self::schedule`] that is used to choose exactly one node to act as the
/// attached pageserver for a shard.
///
/// Returns whether we modified it, and the NodeId selected.
fn schedule_attached(
&mut self,
scheduler: &mut Scheduler,
) -> Result<(bool, NodeId), ScheduleError> {
// No work to do if we already have an attached tenant
if let Some(node_id) = self.intent.attached {
return Ok((false, node_id));
}
if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) {
// Promote a secondary
tracing::debug!("Promoted secondary {} to attached", promote_secondary);
self.intent.promote_attached(scheduler, promote_secondary);
Ok((true, promote_secondary))
} else {
// Pick a fresh node: either we had no secondaries or none were schedulable
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
tracing::debug!("Selected {} as attached", node_id);
self.intent.set_attached(scheduler, Some(node_id));
Ok((true, node_id))
}
}
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
// TODO: before scheduling new nodes, check if any existing content in
// self.intent refers to pageservers that are offline, and pick other
@@ -448,49 +387,33 @@ impl TenantState {
// Build the set of pageservers already in use by this tenant, to avoid scheduling
// more work on the same pageservers we're already using.
let mut used_pageservers = self.intent.all_pageservers();
let mut modified = false;
// Add/remove nodes to fulfil policy
use PlacementPolicy::*;
match self.policy {
Single => {
// Should have exactly one attached, and zero secondaries
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.set_attached(scheduler, Some(node_id));
used_pageservers.push(node_id);
modified = true;
}
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
}
Double(secondary_count) => {
let retain_secondaries = if self.intent.attached.is_none()
&& scheduler.node_preferred(&self.intent.secondary).is_some()
{
// If we have no attached, and one of the secondaries is elegible to be promoted, retain
// one more secondary than we usually would, as one of them will become attached futher down this function.
secondary_count + 1
} else {
secondary_count
};
while self.intent.secondary.len() > retain_secondaries {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
// Should have exactly one attached, and N secondaries
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.set_attached(scheduler, Some(node_id));
used_pageservers.push(node_id);
modified = true;
}
// Should have exactly one attached, and N secondaries
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
let mut used_pageservers = vec![attached_node_id];
while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.push_secondary(scheduler, node_id);
@@ -498,28 +421,15 @@ impl TenantState {
modified = true;
}
}
Secondary => {
if let Some(node_id) = self.intent.get_attached() {
// Populate secondary by demoting the attached node
self.intent.demote_attached(*node_id);
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
let node_id = scheduler.schedule_shard(&[])?;
self.intent.push_secondary(scheduler, node_id);
modified = true;
}
while self.intent.secondary.len() > 1 {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
modified = true;
}
}
Detached => {
// Never add locations in this mode
if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
self.intent.clear(scheduler);
// Should have no attached or secondary pageservers
if self.intent.attached.is_some() {
self.intent.set_attached(scheduler, None);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
}
@@ -566,12 +476,7 @@ impl TenantState {
fn dirty(&self) -> bool {
if let Some(node_id) = self.intent.attached {
// Maybe panic: it is a severe bug if we try to attach while generation is null.
let generation = self
.generation
.expect("Attempted to enter attached state without a generation");
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
@@ -590,13 +495,6 @@ impl TenantState {
}
}
for node_id in self.observed.locations.keys() {
if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) {
// We have observed state that isn't part of our intent: need to clean it up.
return true;
}
}
// Even if there is no pageserver work to be done, if we have a pending notification to computes,
// wake up a reconciler to send it.
if self.pending_compute_notification {
@@ -649,10 +547,6 @@ impl TenantState {
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
tracing::info!(
"Reconciliation already in progress for sequence {:?}",
self.sequence,
);
return Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
@@ -672,10 +566,6 @@ impl TenantState {
return None;
};
// Advance the sequence before spawning a reconciler, so that sequence waiters
// can distinguish between before+after the reconcile completes.
self.sequence = self.sequence.next();
let reconciler_cancel = cancel.child_token();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
@@ -777,17 +667,6 @@ impl TenantState {
})
}
/// Called when a ReconcileResult has been emitted and the service is updating
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
/// the handle to indicate there is no longer a reconciliation in progress.
pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
if let Some(reconcile_handle) = &self.reconciler {
if reconcile_handle.sequence <= sequence {
self.reconciler = None;
}
}
}
// If we had any state at all referring to this node ID, drop it. Does not
// attempt to reschedule.
pub(crate) fn deref_node(&mut self, node_id: NodeId) {
@@ -808,93 +687,11 @@ impl TenantState {
shard_number: self.tenant_shard_id.shard_number.0 as i32,
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: self.shard.stripe_size.0 as i32,
generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
generation: self.generation.into().unwrap_or(0) as i32,
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
}
}
}
#[cfg(test)]
pub(crate) mod tests {
use pageserver_api::shard::{ShardCount, ShardNumber};
use utils::id::TenantId;
use crate::scheduler::test_utils::make_test_nodes;
use super::*;
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
let tenant_id = TenantId::generate();
let shard_number = ShardNumber(0);
let shard_count = ShardCount::new(1);
let tenant_shard_id = TenantShardId {
tenant_id,
shard_number,
shard_count,
};
TenantState::new(
tenant_shard_id,
ShardIdentity::new(
shard_number,
shard_count,
pageserver_api::shard::ShardStripeSize(32768),
)
.unwrap(),
policy,
)
}
/// Test the scheduling behaviors used when a tenant configured for HA is subject
/// to nodes being marked offline.
#[test]
fn tenant_ha_scheduling() -> anyhow::Result<()> {
// Start with three nodes. Our tenant will only use two. The third one is
// expected to remain unused.
let mut nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
tenant_state
.schedule(&mut scheduler)
.expect("we have enough nodes, scheduling should work");
// Expect to initially be schedule on to different nodes
assert_eq!(tenant_state.intent.secondary.len(), 1);
assert!(tenant_state.intent.attached.is_some());
let attached_node_id = tenant_state.intent.attached.unwrap();
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary
let changed = tenant_state.intent.demote_attached(attached_node_id);
assert!(changed);
assert!(tenant_state.intent.attached.is_none());
assert_eq!(tenant_state.intent.secondary.len(), 2);
// Update the scheduler state to indicate the node is offline
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
// Scheduling the node should promote the still-available secondary node to attached
tenant_state
.schedule(&mut scheduler)
.expect("active nodes are available");
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
// The original attached node should have been retained as a secondary
assert_eq!(
*tenant_state.intent.secondary.iter().last().unwrap(),
attached_node_id
);
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
}

View File

@@ -2,12 +2,8 @@ use crate::{background_process, local_env::LocalEnv};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Method;
use pageserver_api::{
controller_api::{
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
TenantShardMigrateRequest, TenantShardMigrateResponse,
},
models::{
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo,
},
shard::TenantShardId,
@@ -15,12 +11,12 @@ use pageserver_api::{
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{fs, str::FromStr};
use std::str::FromStr;
use tokio::process::Command;
use tracing::instrument;
use url::Url;
use utils::{
auth::{encode_from_key_file, Claims, Scope},
auth::{Claims, Scope},
id::{NodeId, TenantId},
};
@@ -28,7 +24,7 @@ pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: Utf8PathBuf,
private_key: Option<Vec<u8>>,
jwt_token: Option<String>,
public_key: Option<String>,
postgres_port: u16,
client: reqwest::Client,
@@ -59,6 +55,126 @@ pub struct InspectResponse {
pub attachment: Option<(u32, NodeId)>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeAvailability {
// Normal, happy state
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits in there.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
Draining,
}
impl FromStr for NodeSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
}
}
impl From<NodeSchedulingPolicy> for String {
fn from(value: NodeSchedulingPolicy) -> String {
use NodeSchedulingPolicy::*;
match value {
Active => "active",
Filling => "filling",
Pause => "pause",
Draining => "draining",
}
.to_string()
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
impl AttachmentService {
pub fn from_env(env: &LocalEnv) -> Self {
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
@@ -88,11 +204,12 @@ impl AttachmentService {
.pageservers
.first()
.expect("Config is validated to contain at least one pageserver");
let (private_key, public_key) = match ps_conf.http_auth_type {
let (jwt_token, public_key) = match ps_conf.http_auth_type {
AuthType::Trust => (None, None),
AuthType::NeonJWT => {
let private_key_path = env.get_private_key_path();
let private_key = fs::read(private_key_path).expect("failed to read private key");
let jwt_token = env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
// If pageserver auth is enabled, this implicitly enables auth for this service,
// using the same credentials.
@@ -118,7 +235,7 @@ impl AttachmentService {
} else {
std::fs::read_to_string(&public_key_path).expect("Can't read public key")
};
(Some(private_key), Some(public_key))
(Some(jwt_token), Some(public_key))
}
};
@@ -126,7 +243,7 @@ impl AttachmentService {
env: env.clone(),
path,
listen,
private_key,
jwt_token,
public_key,
postgres_port,
client: reqwest::ClientBuilder::new()
@@ -200,7 +317,7 @@ impl AttachmentService {
"localhost",
"-p",
&format!("{}", self.postgres_port),
DB_NAME,
&DB_NAME,
])
.output()
.await
@@ -280,10 +397,7 @@ impl AttachmentService {
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
if let Some(private_key) = &self.private_key {
let claims = Claims::new(None, Scope::PageServerApi);
let jwt_token =
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
if let Some(jwt_token) = &self.jwt_token {
args.push(format!("--jwt-token={jwt_token}"));
}
@@ -308,7 +422,7 @@ impl AttachmentService {
)],
background_process::InitialPidFile::Create(self.pid_file()),
|| async {
match self.ready().await {
match self.status().await {
Ok(_) => Ok(true),
Err(_) => Ok(false),
}
@@ -354,20 +468,6 @@ impl AttachmentService {
Ok(())
}
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
let category = match path.find('/') {
Some(idx) => &path[..idx],
None => path,
};
match category {
"status" | "ready" => Ok(None),
"control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))),
"v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))),
_ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)),
}
}
/// Simple HTTP request wrapper for calling into attachment service
async fn dispatch<RQ, RS>(
&self,
@@ -393,16 +493,11 @@ impl AttachmentService {
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(private_key) = &self.private_key {
println!("Getting claims for path {}", path);
if let Some(required_claims) = Self::get_claims_for_path(&path)? {
println!("Got claims {:?} for path {}", required_claims, path);
let jwt_token = encode_from_key_file(&required_claims, private_key)?;
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await?;
@@ -522,8 +617,8 @@ impl AttachmentService {
}
#[instrument(skip(self))]
pub async fn ready(&self) -> anyhow::Result<()> {
self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
pub async fn status(&self) -> anyhow::Result<()> {
self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
.await
}

View File

@@ -8,15 +8,14 @@
use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
use compute_api::spec::ComputeMode;
use control_plane::attachment_service::AttachmentService;
use control_plane::attachment_service::{
AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
};
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{InitForceMode, LocalEnv};
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::{broker, local_env};
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
};
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
@@ -653,10 +652,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
let name = import_match
.get_one::<String>("node-name")
.ok_or_else(|| anyhow!("No node name provided"))?;
let update_catalog = import_match
.get_one::<bool>("update-catalog")
.cloned()
.unwrap_or_default();
// Parse base inputs
let base_tarfile = import_match
@@ -699,7 +694,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
None,
pg_version,
ComputeMode::Primary,
!update_catalog,
)?;
println!("Done");
}
@@ -837,10 +831,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.get_one::<String>("endpoint_id")
.map(String::to_string)
.unwrap_or_else(|| format!("ep-{branch_name}"));
let update_catalog = sub_args
.get_one::<bool>("update-catalog")
.cloned()
.unwrap_or_default();
let lsn = sub_args
.get_one::<String>("lsn")
@@ -890,7 +880,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
http_port,
pg_version,
mode,
!update_catalog,
)?;
}
"start" => {
@@ -929,11 +918,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.get(endpoint_id.as_str())
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
let create_test_user = sub_args
.get_one::<bool>("create-test-user")
.cloned()
.unwrap_or_default();
cplane.check_conflicting_endpoints(
endpoint.mode,
endpoint.tenant_id,
@@ -988,7 +972,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
pageservers,
remote_ext_config,
stripe_size.0 as usize,
create_test_user,
)
.await?;
}
@@ -1024,7 +1007,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
})
.collect::<Vec<_>>()
};
endpoint.reconfigure(pageservers, None).await?;
endpoint.reconfigure(pageservers).await?;
}
"stop" => {
let endpoint_id = sub_args
@@ -1474,18 +1457,6 @@ fn cli() -> Command {
.required(false)
.default_value("1");
let update_catalog = Arg::new("update-catalog")
.value_parser(value_parser!(bool))
.long("update-catalog")
.help("If set, will set up the catalog for neon_superuser")
.required(false);
let create_test_user = Arg::new("create-test-user")
.value_parser(value_parser!(bool))
.long("create-test-user")
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
@@ -1546,7 +1517,6 @@ fn cli() -> Command {
.arg(Arg::new("end-lsn").long("end-lsn")
.help("Lsn the basebackup ends at"))
.arg(pg_version_arg.clone())
.arg(update_catalog.clone())
)
).subcommand(
Command::new("tenant")
@@ -1660,7 +1630,6 @@ fn cli() -> Command {
.required(false))
.arg(pg_version_arg.clone())
.arg(hot_standby_arg.clone())
.arg(update_catalog)
)
.subcommand(Command::new("start")
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1668,7 +1637,6 @@ fn cli() -> Command {
.arg(endpoint_pageserver_id_arg.clone())
.arg(safekeepers_arg)
.arg(remote_ext_config_args)
.arg(create_test_user)
)
.subcommand(Command::new("reconfigure")
.about("Reconfigure the endpoint")

View File

@@ -41,18 +41,13 @@ use std::net::SocketAddr;
use std::net::TcpStream;
use std::path::PathBuf;
use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::spec::Database;
use compute_api::spec::PgIdent;
use compute_api::spec::RemoteExtSpec;
use compute_api::spec::Role;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use pageserver_api::shard::ShardStripeSize;
use serde::{Deserialize, Serialize};
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
@@ -127,7 +122,6 @@ impl ComputeControlPlane {
http_port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
skip_pg_catalog_updates: bool,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
@@ -146,7 +140,7 @@ impl ComputeControlPlane {
// before and after start are the same. So, skip catalog updates,
// with this we basically test a case of waking up an idle compute, where
// we also skip catalog updates in the cloud.
skip_pg_catalog_updates,
skip_pg_catalog_updates: true,
features: vec![],
});
@@ -161,7 +155,7 @@ impl ComputeControlPlane {
http_port,
pg_port,
pg_version,
skip_pg_catalog_updates,
skip_pg_catalog_updates: true,
features: vec![],
})?,
)?;
@@ -506,7 +500,6 @@ impl Endpoint {
pageservers: Vec<(Host, u16)>,
remote_ext_config: Option<&String>,
shard_stripe_size: usize,
create_test_user: bool,
) -> Result<()> {
if self.status() == EndpointStatus::Running {
anyhow::bail!("The endpoint is already running");
@@ -558,26 +551,8 @@ impl Endpoint {
cluster_id: None, // project ID: not used
name: None, // project name: not used
state: None,
roles: if create_test_user {
vec![Role {
name: PgIdent::from_str("test").unwrap(),
encrypted_password: None,
options: None,
}]
} else {
Vec::new()
},
databases: if create_test_user {
vec![Database {
name: PgIdent::from_str("neondb").unwrap(),
owner: PgIdent::from_str("test").unwrap(),
options: None,
restrict_conn: false,
invalid: false,
}]
} else {
Vec::new()
},
roles: vec![],
databases: vec![],
settings: None,
postgresql_conf: Some(postgresql_conf),
},
@@ -591,7 +566,6 @@ impl Endpoint {
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
primary_is_running: None,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -603,16 +577,11 @@ impl Endpoint {
.open(self.endpoint_path().join("compute.log"))?;
// Launch compute_ctl
let conn_str = self.connstr("cloud_admin", "postgres");
println!("Starting postgres node at '{}'", conn_str);
if create_test_user {
let conn_str = self.connstr("test", "neondb");
println!("Also at '{}'", conn_str);
}
println!("Starting postgres node at '{}'", self.connstr());
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
cmd.args(["--http-port", &self.http_address.port().to_string()])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &conn_str])
.args(["--connstr", &self.connstr()])
.args([
"--spec-path",
self.endpoint_path().join("spec.json").to_str().unwrap(),
@@ -683,9 +652,7 @@ impl Endpoint {
}
ComputeStatus::Empty
| ComputeStatus::ConfigurationPending
| ComputeStatus::Configuration
| ComputeStatus::TerminationPending
| ComputeStatus::Terminated => {
| ComputeStatus::Configuration => {
bail!("unexpected compute status: {:?}", state.status)
}
}
@@ -736,11 +703,7 @@ impl Endpoint {
}
}
pub async fn reconfigure(
&self,
mut pageservers: Vec<(Host, u16)>,
stripe_size: Option<ShardStripeSize>,
) -> Result<()> {
pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
@@ -770,9 +733,6 @@ impl Endpoint {
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstr.is_empty());
spec.pageserver_connstring = Some(pageserver_connstr);
if stripe_size.is_some() {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
}
let client = reqwest::Client::new();
let response = client
@@ -823,13 +783,13 @@ impl Endpoint {
Ok(())
}
pub fn connstr(&self, user: &str, db_name: &str) -> String {
pub fn connstr(&self) -> String {
format!(
"postgresql://{}@{}:{}/{}",
user,
"cloud_admin",
self.pg_address.ip(),
self.pg_address.port(),
db_name
"postgres"
)
}
}

View File

@@ -412,17 +412,14 @@ impl LocalEnv {
// this function is used only for testing purposes in CLI e g generate tokens during init
pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
let private_key_path = self.get_private_key_path();
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
pub fn get_private_key_path(&self) -> PathBuf {
if self.private_key_path.is_absolute() {
let private_key_path = if self.private_key_path.is_absolute() {
self.private_key_path.to_path_buf()
} else {
self.base_data_dir.join(&self.private_key_path)
}
};
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
//

View File

@@ -17,7 +17,6 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::controller_api::NodeRegisterRequest;
use pageserver_api::models::{
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
};
@@ -31,7 +30,7 @@ use utils::{
lsn::Lsn,
};
use crate::attachment_service::AttachmentService;
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
@@ -116,7 +115,7 @@ impl PageServerNode {
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
.env
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
overrides.push(format!("control_plane_api_token='{}'", jwt_token));
}
@@ -211,25 +210,6 @@ impl PageServerNode {
update_config: bool,
register: bool,
) -> anyhow::Result<()> {
// Register the node with the storage controller before starting pageserver: pageserver must be registered to
// successfully call /re-attach and finish starting up.
if register {
let attachment_service = AttachmentService::from_env(&self.env);
let (pg_host, pg_port) =
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
attachment_service
.node_register(NodeRegisterRequest {
node_id: self.conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
@@ -268,6 +248,23 @@ impl PageServerNode {
)
.await?;
if register {
let attachment_service = AttachmentService::from_env(&self.env);
let (pg_host, pg_port) =
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
attachment_service
.node_register(NodeRegisterRequest {
node_id: self.conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
Ok(())
}
@@ -353,11 +350,6 @@ impl PageServerNode {
.remove("compaction_threshold")
.map(|x| x.parse::<usize>())
.transpose()?,
compaction_algorithm: settings
.remove("compaction_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
@@ -397,6 +389,11 @@ impl PageServerNode {
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
gc_feedback: settings
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
@@ -461,11 +458,6 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
compaction_algorithm: settings
.remove("compactin_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
@@ -507,6 +499,11 @@ impl PageServerNode {
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
gc_feedback: settings
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
@@ -537,11 +534,10 @@ impl PageServerNode {
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<Duration>,
lazy: bool,
) -> anyhow::Result<()> {
Ok(self
.http_client
.location_config(tenant_shard_id, config, flush_ms, lazy)
.location_config(tenant_shard_id, config, flush_ms)
.await?)
}
@@ -606,7 +602,7 @@ impl PageServerNode {
eprintln!("connection error: {}", e);
}
});
let client = std::pin::pin!(client);
tokio::pin!(client);
// Init base reader
let (start_lsn, base_tarfile_path) = base;

View File

@@ -70,9 +70,6 @@ Should only be used e.g. for status check/tenant creation/list.
Should only be used e.g. for status check.
Currently also used for connection from any pageserver to any safekeeper.
"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
"admin": Provides access to the control plane and admin APIs of the attachment service.
### CLI
CLI generates a key pair during call to `neon_local init` with the following commands:

View File

@@ -1,4 +1,4 @@
# Neon storage node — alternative
# Zenith storage node — alternative
## **Design considerations**

View File

@@ -1,6 +1,6 @@
# Command line interface (end-user)
Neon CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside neon distribution at least at the start.
Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start.
This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots.
@@ -8,40 +8,40 @@ The most important concept here is a snapshot, which can be created/pushed/pulle
# Possible usage scenarios
## Install neon, run a postgres
## Install zenith, run a postgres
```
> brew install pg-neon
> neon pg create # creates pgdata with default pattern pgdata$i
> neon pg list
> brew install pg-zenith
> zenith pg create # creates pgdata with default pattern pgdata$i
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 0G neon-local localhost:5432
primary1 pgdata1 0G zenith-local localhost:5432
```
## Import standalone postgres to neon
## Import standalone postgres to zenith
```
> neon snapshot import --from=basebackup://replication@localhost:5432/ oldpg
> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg
[====================------------] 60% | 20MB/s
> neon snapshot list
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
> neon pg create --snapshot oldpg
> zenith pg create --snapshot oldpg
Started postgres on localhost:5432
> neon pg list
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 5G neon-local localhost:5432
primary1 pgdata1 5G zenith-local localhost:5432
> neon snapshot destroy oldpg
> zenith snapshot destroy oldpg
Ok
```
Also, we may start snapshot import implicitly by looking at snapshot schema
```
> neon pg create --snapshot basebackup://replication@localhost:5432/
> zenith pg create --snapshot basebackup://replication@localhost:5432/
Downloading snapshot... Done.
Started postgres on localhost:5432
Destroying snapshot... Done.
@@ -52,39 +52,39 @@ Destroying snapshot... Done.
Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage).
```
> neon pg create --snapshot http://learn-postgres.com/movies_db.neon movies
> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies
```
## Create snapshot and push it to the cloud
```
> neon snapshot create pgdata1@snap1
> neon snapshot push --to ssh://stas@neon.tech pgdata1@snap1
> zenith snapshot create pgdata1@snap1
> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1
```
## Rollback database to the snapshot
One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `neon pg checkout`.
One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`.
```
> neon pg list
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 5G neon-local localhost:5432
primary1 pgdata1 5G zenith-local localhost:5432
> neon snapshot create pgdata1@snap1
> zenith snapshot create pgdata1@snap1
> neon snapshot list
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
pgdata1@snap1 6G -
pgdata1@CURRENT 6G -
> neon pg checkout pgdata1@snap1
> zenith pg checkout pgdata1@snap1
Stopping postgres on pgdata1.
Rolling back pgdata1@CURRENT to pgdata1@snap1.
Starting postgres on pgdata1.
> neon snapshot list
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
pgdata1@snap1 6G -
@@ -99,7 +99,7 @@ Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state
PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite).
```
> neon pitr create --storage s3tank --ttl 30d --name pitr_last_month
> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month
```
Resetting the database to some state in past would require creating a snapshot on some lsn / time in this pirt area.
@@ -108,29 +108,29 @@ Resetting the database to some state in past would require creating a snapshot o
## storage
Storage is either neon pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default.
Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default.
**neon storage attach** -t [native|s3] -c key=value -n name
**zenith storage attach** -t [native|s3] -c key=value -n name
Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=neon.tech/stas/mystore. Other possible term for native is 'zstore'.
Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'.
**neon storage list**
**zenith storage list**
Show currently attached storages. For example:
```
> neon storage list
> zenith storage list
NAME USED TYPE OPTIONS PATH
local 5.1G neon-local /opt/neon/store/local
local.compr 20.4G neon-local compression=on /opt/neon/store/local.compr
zcloud 60G neon-remote neon.tech/stas/mystore
local 5.1G zenith-local /opt/zenith/store/local
local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr
zcloud 60G zenith-remote zenith.tech/stas/mystore
s3tank 80G S3
```
**neon storage detach**
**zenith storage detach**
**neon storage show**
**zenith storage show**
@@ -140,29 +140,29 @@ Manages postgres data directories and can start postgres instances with proper c
Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together.
**neon pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata
**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata
Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr.
--no-start: just init datadir without creating
--snapshot snap: init from the snapshot. Snap is a name or URL (neon.tech/stas/mystore/snap1)
--snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1)
--cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database)
**neon pg destroy**
**zenith pg destroy**
**neon pg start** [--replica] pgdata
**zenith pg start** [--replica] pgdata
Start postgres with proper extensions preloaded/installed.
**neon pg checkout**
**zenith pg checkout**
Rollback data directory to some previous snapshot.
**neon pg stop** pg_id
**zenith pg stop** pg_id
**neon pg list**
**zenith pg list**
```
ROLE PGDATA USED STORAGE ENDPOINT
@@ -173,7 +173,7 @@ primary my_pg2 3.2G local.compr localhost:5435
- my_pg3 9.2G local.compr -
```
**neon pg show**
**zenith pg show**
```
my_pg:
@@ -194,7 +194,7 @@ my_pg:
```
**neon pg start-rest/graphql** pgdata
**zenith pg start-rest/graphql** pgdata
Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea.
@@ -203,35 +203,35 @@ Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that,
Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout.
**neon snapshot create** pgdata_name@snap_name
**zenith snapshot create** pgdata_name@snap_name
Creates a new snapshot in the same storage where pgdata_name exists.
**neon snapshot push** --to url pgdata_name@snap_name
**zenith snapshot push** --to url pgdata_name@snap_name
Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `neon snapshot recv` before push happens. If url has some special schema like neon:// receiving side may require auth start `neon snapshot recv` on the go.
Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth start `zenith snapshot recv` on the go.
**neon snapshot recv**
**zenith snapshot recv**
Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket.
**neon snapshot pull** --from url or path
**zenith snapshot pull** --from url or path
Connects to a remote neon/s3/file and pulls snapshot. The remote site should be neon service or files in our format.
Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format.
**neon snapshot import** --from basebackup://<...> or path
**zenith snapshot import** --from basebackup://<...> or path
Creates a new snapshot out of running postgres via basebackup protocol or basebackup files.
**neon snapshot export**
**zenith snapshot export**
Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be neon own format which is handy for us (but I think just tar of basebackup would be okay).
Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay).
**neon snapshot diff** snap1 snap2
**zenith snapshot diff** snap1 snap2
Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses.
**neon snapshot destroy**
**zenith snapshot destroy**
## pitr
@@ -239,7 +239,7 @@ Pitr represents wal stream and ttl policy for that stream
XXX: any suggestions on a better name?
**neon pitr create** name
**zenith pitr create** name
--ttl = inf | period
@@ -247,21 +247,21 @@ XXX: any suggestions on a better name?
--storage = storage_name
**neon pitr extract-snapshot** pitr_name --lsn xxx
**zenith pitr extract-snapshot** pitr_name --lsn xxx
Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export)
**neon pitr gc** pitr_name
**zenith pitr gc** pitr_name
Force garbage collection on some PITR area.
**neon pitr list**
**zenith pitr list**
**neon pitr destroy**
**zenith pitr destroy**
## console
**neon console**
**zenith console**
Opens browser targeted at web console with the more or less same functionality as described here.

View File

@@ -6,7 +6,7 @@ When do we consider the WAL record as durable, so that we can
acknowledge the commit to the client and be reasonably certain that we
will not lose the transaction?
Neon uses a group of WAL safekeeper nodes to hold the generated WAL.
Zenith uses a group of WAL safekeeper nodes to hold the generated WAL.
A WAL record is considered durable, when it has been written to a
majority of WAL safekeeper nodes. In this document, I use 5
safekeepers, because I have five fingers. A WAL record is durable,

View File

@@ -1,23 +1,23 @@
# Neon local
# Zenith local
Here I list some objectives to keep in mind when discussing neon-local design and a proposal that brings all components together. Your comments on both parts are very welcome.
Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome.
#### Why do we need it?
- For distribution - this easy to use binary will help us to build adoption among developers.
- For internal use - to test all components together.
In my understanding, we consider it to be just a mock-up version of neon-cloud.
In my understanding, we consider it to be just a mock-up version of zenith-cloud.
> Question: How much should we care about durability and security issues for a local setup?
#### Why is it better than a simple local postgres?
- Easy one-line setup. As simple as `cargo install neon && neon start`
- Easy one-line setup. As simple as `cargo install zenith && zenith start`
- Quick and cheap creation of compute nodes over the same storage.
> Question: How can we describe a use-case for this feature?
- Neon-local can work with S3 directly.
- Zenith-local can work with S3 directly.
- Push and pull images (snapshots) to remote S3 to exchange data with other users.
@@ -31,50 +31,50 @@ Ideally, just one binary that incorporates all elements we need.
#### Components:
- **neon-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way.
CLI proposal is here https://github.com/neondatabase/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src/bin/cli
- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way.
CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli
- **neon-console** - WEB UI with same functionality as CLI.
- **zenith-console** - WEB UI with same functionality as CLI.
>Note: not for the first release.
- **neon-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below.
> Idea: spawn all other components as child processes, so that we could shutdown everything by stopping neon-local.
- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below.
> Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local.
- **neon-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation).
- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation).
> Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server?
WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src
WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src
- **neon-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to neon.
- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith.
> Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)?
> Question: Do we use it together with local page store or they are interchangeable?
WIP code is ???
- **neon-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed.
- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed.
> Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system.
WIP code is here: https://github.com/neondatabase/postgres/tree/main/src/bin/safekeeper
WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper
- **neon-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database.
- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database.
WIP code is in main branch and here: https://github.com/neondatabase/postgres/commits/compute_node
WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node
#### REST API:
Service endpoint: `http://localhost:3000`
Resources:
- /storages - Where data lives: neon-pageserver or neon-s3
- /pgs - Postgres - neon-computenode
- /storages - Where data lives: zenith-pageserver or zenith-s3
- /pgs - Postgres - zenith-computenode
- /snapshots - snapshots **TODO**
>Question: Do we want to extend this API to manage neon components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all?
>Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all?
Methods and their mapping to CLI:
- /storages - neon-pageserver or neon-s3
- /storages - zenith-pageserver or zenith-s3
CLI | REST API
------------- | -------------
@@ -84,7 +84,7 @@ storage list | GET /storages
storage show -n name | GET /storages/:storage_name
- /pgs - neon-computenode
- /pgs - zenith-computenode
CLI | REST API
------------- | -------------

View File

@@ -1,45 +1,45 @@
Neon CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog".
Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog".
# CLI v2 (after chatting with Carl)
Neon introduces the notion of a repository.
Zenith introduces the notion of a repository.
```bash
neon init
neon clone neon://neon.tech/piedpiper/northwind -- clones a repo to the northwind directory
zenith init
zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory
```
Once you have a cluster catalog you can explore it
```bash
neon log -- returns a list of commits
neon status -- returns if there are changes in the catalog that can be committed
neon commit -- commits the changes and generates a new commit hash
neon branch experimental <hash> -- creates a branch called testdb based on a given commit hash
zenith log -- returns a list of commits
zenith status -- returns if there are changes in the catalog that can be committed
zenith commit -- commits the changes and generates a new commit hash
zenith branch experimental <hash> -- creates a branch called testdb based on a given commit hash
```
To make changes in the catalog you need to run compute nodes
```bash
-- here is how you a compute node
neon start /home/pipedpiper/northwind:main -- starts a compute instance
neon start neon://neon.tech/northwind:main -- starts a compute instance in the cloud
zenith start /home/pipedpiper/northwind:main -- starts a compute instance
zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud
-- you can start a compute node against any hash or branch
neon start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port)
zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port)
-- you can start a compute node against any hash or branch
neon start /home/pipedpiper/northwind:<hash> --port 8009 -- start another compute instance (on different port)
zenith start /home/pipedpiper/northwind:<hash> --port 8009 -- start another compute instance (on different port)
-- After running some DML you can run
-- neon status and see how there are two WAL streams one on top of
-- zenith status and see how there are two WAL streams one on top of
-- the main branch
neon status
zenith status
-- and another on top of the experimental branch
neon status -b experimental
zenith status -b experimental
-- you can commit each branch separately
neon commit main
zenith commit main
-- or
neon commit -c /home/pipedpiper/northwind:experimental
zenith commit -c /home/pipedpiper/northwind:experimental
```
Starting compute instances against cloud environments
@@ -47,18 +47,18 @@ Starting compute instances against cloud environments
```bash
-- you can start a compute instance against the cloud environment
-- in this case all of the changes will be streamed into the cloud
neon start https://neon:tecj/pipedpiper/northwind:main
neon start https://neon:tecj/pipedpiper/northwind:main
neon status -c https://neon:tecj/pipedpiper/northwind:main
neon commit -c https://neon:tecj/pipedpiper/northwind:main
neon branch -c https://neon:tecj/pipedpiper/northwind:<hash> experimental
zenith start https://zenith:tech/pipedpiper/northwind:main
zenith start https://zenith:tech/pipedpiper/northwind:main
zenith status -c https://zenith:tech/pipedpiper/northwind:main
zenith commit -c https://zenith:tech/pipedpiper/northwind:main
zenith branch -c https://zenith:tech/pipedpiper/northwind:<hash> experimental
```
Pushing data into the cloud
```bash
-- pull all the commits from the cloud
neon pull
zenith pull
-- push all the commits to the cloud
neon push
zenith push
```

View File

@@ -1,14 +1,14 @@
# Repository format
A Neon repository is similar to a traditional PostgreSQL backup
A Zenith repository is similar to a traditional PostgreSQL backup
archive, like a WAL-G bucket or pgbarman backup catalogue. It holds
multiple versions of a PostgreSQL database cluster.
The distinguishing feature is that you can launch a Neon Postgres
The distinguishing feature is that you can launch a Zenith Postgres
server directly against a branch in the repository, without having to
"restore" it first. Also, Neon manages the storage automatically,
"restore" it first. Also, Zenith manages the storage automatically,
there is no separation between full and incremental backups nor WAL
archive. Neon relies heavily on the WAL, and uses concepts similar
archive. Zenith relies heavily on the WAL, and uses concepts similar
to incremental backups and WAL archiving internally, but it is hidden
from the user.
@@ -19,15 +19,15 @@ efficient. Just something to get us started.
The repository directory looks like this:
.neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
.neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots/<lsn>/
.neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots/<lsn>/
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
.neon/refs/branches/mybranch
.neon/refs/tags/foo
.neon/refs/tags/bar
.zenith/refs/branches/mybranch
.zenith/refs/tags/foo
.zenith/refs/tags/bar
.neon/datadirs/<timeline uuid>
.zenith/datadirs/<timeline uuid>
### Timelines
@@ -39,7 +39,7 @@ All WAL is generated on a timeline. You can launch a read-only node
against a tag or arbitrary LSN on a timeline, but in order to write,
you need to create a timeline.
Each timeline is stored in a directory under .neon/timelines. It
Each timeline is stored in a directory under .zenith/timelines. It
consists of a WAL archive, containing all the WAL in the standard
PostgreSQL format, under the wal/ subdirectory.
@@ -66,18 +66,18 @@ contains the UUID of the timeline (and LSN, for tags).
### Datadirs
.neon/datadirs contains PostgreSQL data directories. You can launch
.zenith/datadirs contains PostgreSQL data directories. You can launch
a Postgres instance on one of them with:
```
postgres -D .neon/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
```
All the actual data is kept in the timeline directories, under
.neon/timelines. The data directories are only needed for active
.zenith/timelines. The data directories are only needed for active
PostgreQSL instances. After an instance is stopped, the data directory
can be safely removed. "neon start" will recreate it quickly from
the data in .neon/timelines, if it's missing.
can be safely removed. "zenith start" will recreate it quickly from
the data in .zenith/timelines, if it's missing.
## Version 2
@@ -103,14 +103,14 @@ more advanced. The exact format is TODO. But it should support:
### Garbage collection
When you run "neon gc", old timelines that are no longer needed are
When you run "zenith gc", old timelines that are no longer needed are
removed. That involves collecting the list of "unreachable" objects,
starting from the named branches and tags.
Also, if enough WAL has been generated on a timeline since last
snapshot, a new snapshot or delta is created.
### neon push/pull
### zenith push/pull
Compare the tags and branches on both servers, and copy missing ones.
For each branch, compare the timeline it points to in both servers. If
@@ -123,7 +123,7 @@ every time you start up an instance? Then you would detect that the
timelines have diverged. That would match with the "epoch" concept
that we have in the WAL safekeeper
### neon checkout/commit
### zenith checkout/commit
In this format, there is no concept of a "working tree", and hence no
concept of checking out or committing. All modifications are done on
@@ -134,7 +134,7 @@ You can easily fork off a temporary timeline to emulate a "working tree".
You can later remove it and have it garbage collected, or to "commit",
re-point the branch to the new timeline.
If we want to have a worktree and "neon checkout/commit" concept, we can
If we want to have a worktree and "zenith checkout/commit" concept, we can
emulate that with a temporary timeline. Create the temporary timeline at
"neon checkout", and have "neon commit" modify the branch to point to
"zenith checkout", and have "zenith commit" modify the branch to point to
the new timeline.

View File

@@ -4,27 +4,27 @@ How it works now
1. Create repository, start page server on it
```
$ neon init
$ zenith init
...
created main branch
new neon repository was created in .neon
new zenith repository was created in .zenith
$ neon pageserver start
Starting pageserver at '127.0.0.1:64000' in .neon
$ zenith pageserver start
Starting pageserver at '127.0.0.1:64000' in .zenith
Page server started
```
2. Create a branch, and start a Postgres instance on it
```
$ neon branch heikki main
$ zenith branch heikki main
branching at end of WAL: 0/15ECF68
$ neon pg create heikki
$ zenith pg create heikki
Initializing Postgres on timeline 76cf9279915be7797095241638e64644...
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/pg1 port=55432
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432
$ neon pg start pg1
$ zenith pg start pg1
Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki'
waiting for server to start.... done
server started
@@ -52,20 +52,20 @@ serverless on your laptop, so that the workflow becomes just:
1. Create repository, start page server on it (same as before)
```
$ neon init
$ zenith init
...
created main branch
new neon repository was created in .neon
new zenith repository was created in .zenith
$ neon pageserver start
Starting pageserver at '127.0.0.1:64000' in .neon
$ zenith pageserver start
Starting pageserver at '127.0.0.1:64000' in .zenith
Page server started
```
2. Create branch
```
$ neon branch heikki main
$ zenith branch heikki main
branching at end of WAL: 0/15ECF68
```

View File

@@ -7,22 +7,22 @@ Here is a proposal about implementing push/pull mechanics between pageservers. W
The origin represents connection info for some remote pageserver. Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that).
```
neon origin add <name> <connection_uri>
neon origin list
neon origin remove <name>
zenith origin add <name> <connection_uri>
zenith origin list
zenith origin remove <name>
```
Connection URI a string of form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport.
Behind the scenes, this commands may update toml file inside .neon directory.
Behind the scenes, this commands may update toml file inside .zenith directory.
## Push
### Pushing branch
```
neon push mybranch cloudserver # push to eponymous branch in cloudserver
neon push mybranch cloudserver:otherbranch # push to a different branch in cloudserver
zenith push mybranch cloudserver # push to eponymous branch in cloudserver
zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver
```
Exact mechanics would be slightly different in the following situations:

View File

@@ -2,7 +2,7 @@ While working on export/import commands, I understood that they fit really well
We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files.
Even if neon aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to neon.
Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to zenith.
So here is an attempt to design consistent CLI for different usage scenarios:
@@ -16,8 +16,8 @@ Save`storage_dest` and other parameters in config.
Push snapshots to `storage_dest` in background.
```
neon init --storage_dest=S3_PREFIX
neon start
zenith init --storage_dest=S3_PREFIX
zenith start
```
#### 2. Restart pageserver (manually or crash-recovery).
@@ -25,7 +25,7 @@ Take `storage_dest` from pageserver config, start pageserver from latest snapsho
Push snapshots to `storage_dest` in background.
```
neon start
zenith start
```
#### 3. Import.
@@ -35,22 +35,22 @@ Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time
Save`storage_dest` parameters in config.
Push snapshots to `storage_dest` in background.
```
//I.e. we want to start neon on top of existing $PGDATA and use s3 as a persistent storage.
neon init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
neon start
//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage.
zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
zenith start
```
How to pass credentials needed for `snapshot_path`?
#### 4. Export.
Manually push snapshot to `snapshot_path` which differs from `storage_dest`
Optionally set `snapshot_format`, which can be plain pgdata format or neon format.
Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
```
neon export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
```
#### Notes and questions
- safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
- Why do we need `neon init` as a separate command? Can't we init everything at first start?
- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
- We can think of better names for all options.
- Export to plain postgres format will be useless, if we are not 100% compatible on page level.
I can recall at least one such difference - PD_WAL_LOGGED flag in pages.

View File

@@ -9,7 +9,7 @@ receival and this might lag behind `term`; safekeeper switches to epoch `n` when
it has received all committed log records from all `< n` terms. This roughly
corresponds to proposed in
https://github.com/neondatabase/rfcs/pull/3/files
https://github.com/zenithdb/rfcs/pull/3/files
This makes our biggest our difference from Raft. In Raft, every log record is

View File

@@ -1,6 +1,6 @@
# Safekeeper gossip
Extracted from this [PR](https://github.com/neondatabase/rfcs/pull/13)
Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13)
## Motivation

View File

@@ -2,7 +2,7 @@
Created on 19.01.22
Initially created [here](https://github.com/neondatabase/rfcs/pull/16) by @kelvich.
Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich.
That it is an alternative to (014-safekeeper-gossip)[]
@@ -292,4 +292,4 @@ But with an etcd we are in a bit different situation:
1. We don't need persistency and strong consistency guarantees for the data we store in the etcd
2. etcd uses Grpc as a protocol, and messages are pretty simple
So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local neon installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres).
So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local zenith installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres).

View File

@@ -52,10 +52,6 @@ pub enum ComputeStatus {
// compute will exit soon or is waiting for
// control-plane to terminate it.
Failed,
// Termination requested
TerminationPending,
// Terminated Postgres
Terminated,
}
fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>

View File

@@ -79,12 +79,6 @@ pub struct ComputeSpec {
// Stripe size for pageserver sharding, in pages
#[serde(default)]
pub shard_stripe_size: Option<usize>,
// When we are starting a new replica in hot standby mode,
// we need to know if the primary is running.
// This is used to determine if replica should wait for
// RUNNING_XACTS from primary or not.
pub primary_is_running: Option<bool>,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.

View File

@@ -201,11 +201,6 @@ impl<P: Atomic> GenericCounterPairVec<P> {
pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
self.get_metric_with_label_values(vals).unwrap()
}
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals);
}
}
impl<P: Atomic> GenericCounterPair<P> {
@@ -252,15 +247,6 @@ impl<P: Atomic> GenericCounterPair<P> {
}
}
impl<P: Atomic> Clone for GenericCounterPair<P> {
fn clone(&self) -> Self {
Self {
inc: self.inc.clone(),
dec: self.dec.clone(),
}
}
}
/// Guard returned by [`GenericCounterPair::guard`]
pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);

View File

@@ -18,11 +18,9 @@ enum-map.workspace = true
strum.workspace = true
strum_macros.workspace = true
hex.workspace = true
humantime.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
chrono.workspace = true
itertools.workspace = true
workspace_hack.workspace = true

View File

@@ -1,129 +0,0 @@
use std::str::FromStr;
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
/// in [`attachment_service::http`]
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
use crate::{models::ShardParameters, shard::TenantShardId};
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeAvailability {
// Normal, happy state
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits in there.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
Draining,
}
impl FromStr for NodeSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
}
}
impl From<NodeSchedulingPolicy> for String {
fn from(value: NodeSchedulingPolicy) -> String {
use NodeSchedulingPolicy::*;
match value {
Active => "active",
Filling => "filling",
Pause => "pause",
Draining => "draining",
}
.to_string()
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}

View File

@@ -2,7 +2,6 @@ use postgres_ffi::BLCKSZ;
use std::ops::Range;
use crate::key::Key;
use itertools::Itertools;
///
/// Represents a set of Keys, in a compact form.
@@ -64,36 +63,9 @@ impl KeySpace {
KeyPartitioning { parts }
}
/// Merge another keyspace into the current one.
/// Note: the keyspaces must not ovelap (enforced via assertions)
pub fn merge(&mut self, other: &KeySpace) {
let all_ranges = self
.ranges
.iter()
.merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start);
let mut accum = KeySpaceAccum::new();
let mut prev: Option<&Range<Key>> = None;
for range in all_ranges {
if let Some(prev) = prev {
let overlap =
std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end);
assert!(
!overlap,
"Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}",
prev, range
);
}
accum.add_range(range.clone());
prev = Some(range);
}
self.ranges = accum.to_keyspace().ranges;
}
/// Remove all keys in `other` from `self`.
/// This can involve splitting or removing of existing ranges.
/// Update the keyspace such that it doesn't contain any range
/// that is overlapping with `other`. This can involve splitting or
/// removing of existing ranges.
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
let (self_start, self_end) = match (self.start(), self.end()) {
(Some(start), Some(end)) => (start, end),
@@ -248,7 +220,16 @@ impl KeySpaceAccum {
}
pub fn consume_keyspace(&mut self) -> KeySpace {
std::mem::take(self).to_keyspace()
if let Some(accum) = self.accum.take() {
self.ranges.push(accum);
}
let mut prev_accum = KeySpaceAccum::new();
std::mem::swap(self, &mut prev_accum);
KeySpace {
ranges: prev_accum.ranges,
}
}
pub fn size(&self) -> u64 {
@@ -298,16 +279,8 @@ impl KeySpaceRandomAccum {
}
KeySpace { ranges }
}
pub fn consume_keyspace(&mut self) -> KeySpace {
let mut prev_accum = KeySpaceRandomAccum::new();
std::mem::swap(self, &mut prev_accum);
prev_accum.to_keyspace()
}
}
#[inline(always)]
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;

View File

@@ -2,14 +2,13 @@
#![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp;
pub mod controller_api;
/// Public API types
pub mod control_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod reltag;
pub mod shard;
/// Public API types
pub mod upcall_api;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");

View File

@@ -1,7 +1,4 @@
pub mod partitioning;
pub mod utilization;
pub use utilization::PageserverUtilization;
use std::{
collections::HashMap,
@@ -14,6 +11,7 @@ use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use strum_macros;
use utils::{
completion,
history_buffer::HistoryBufferWithDropCounter,
@@ -182,7 +180,7 @@ pub enum TimelineState {
Broken { reason: String, backtrace: String },
}
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
pub new_timeline_id: TimelineId,
#[serde(default)]
@@ -271,8 +269,6 @@ pub struct TenantConfig {
pub compaction_target_size: Option<u64>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<CompactionAlgorithm>,
pub gc_horizon: Option<u64>,
pub gc_period: Option<String>,
pub image_creation_threshold: Option<usize>,
@@ -284,6 +280,7 @@ pub struct TenantConfig {
pub eviction_policy: Option<EvictionPolicy>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
@@ -294,7 +291,6 @@ pub struct TenantConfig {
pub enum EvictionPolicy {
NoEviction,
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
OnlyImitiate(EvictionPolicyLayerAccessThreshold),
}
impl EvictionPolicy {
@@ -302,18 +298,10 @@ impl EvictionPolicy {
match self {
EvictionPolicy::NoEviction => "NoEviction",
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum CompactionAlgorithm {
Legacy,
Tiered,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
@@ -347,14 +335,14 @@ impl ThrottleConfig {
}
/// The requests per second allowed by the given config.
pub fn steady_rps(&self) -> f64 {
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3
}
}
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub enum LocationConfigMode {
AttachedSingle,
AttachedMulti,
@@ -418,12 +406,6 @@ pub struct TenantLocationConfigRequest {
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantTimeTravelRequest {
pub shard_counts: Vec<ShardCount>,
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantShardLocation {
@@ -435,8 +417,6 @@ pub struct TenantShardLocation {
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigResponse {
pub shards: Vec<TenantShardLocation>,
// If the shards' ShardCount count is >1, stripe_size will be set.
pub stripe_size: Option<ShardStripeSize>,
}
#[derive(Serialize, Deserialize, Debug)]
@@ -1078,6 +1058,7 @@ impl PagestreamBeMessage {
#[cfg(test)]
mod tests {
use bytes::Buf;
use serde_json::json;
use super::*;

View File

@@ -1,70 +0,0 @@
use std::time::SystemTime;
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant.
///
/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth.
///
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
/// not handle full u64 values properly.
#[derive(serde::Serialize, Debug)]
pub struct PageserverUtilization {
/// Used disk space
#[serde(serialize_with = "ser_saturating_u63")]
pub disk_usage_bytes: u64,
/// Free disk space
#[serde(serialize_with = "ser_saturating_u63")]
pub free_space_bytes: u64,
/// Lower is better score for how good candidate for a next tenant would this pageserver be.
#[serde(serialize_with = "ser_saturating_u63")]
pub utilization_score: u64,
/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
#[serde(serialize_with = "ser_rfc3339_millis")]
pub captured_at: SystemTime,
}
fn ser_rfc3339_millis<S: serde::Serializer>(
ts: &SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
///
/// Instead of newtype, use this because a newtype would get require handling deserializing values
/// with the highest bit set which is properly parsed by serde formats, but would create a
/// conundrum on how to handle and again serialize such values at type level. It will be a few
/// years until we can use more than `i64::MAX` bytes on a disk.
fn ser_saturating_u63<S: serde::Serializer>(value: &u64, serializer: S) -> Result<S::Ok, S::Error> {
const MAX_FORMAT_INT64: u64 = i64::MAX as u64;
let value = (*value).min(MAX_FORMAT_INT64);
serializer.serialize_u64(value)
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use super::*;
#[test]
fn u64_max_is_serialized_as_u63_max() {
let doc = PageserverUtilization {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
utilization_score: u64::MAX,
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
};
let s = serde_json::to_string(&doc).unwrap();
let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
assert_eq!(s, expected);
}
}

View File

@@ -6,6 +6,7 @@ use crate::{
};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
use utils::id::TenantId;
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
@@ -655,7 +656,10 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
#[cfg(test)]
mod tests {
use utils::Hex;
use std::str::FromStr;
use bincode;
use utils::{id::TenantId, Hex};
use super::*;

View File

@@ -6,6 +6,7 @@
#![deny(clippy::undocumented_unsafe_blocks)]
use anyhow::Context;
use bytes::Bytes;
use futures::pin_mut;
use serde::{Deserialize, Serialize};
use std::io::ErrorKind;
use std::net::SocketAddr;
@@ -377,7 +378,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
&mut self,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
let flush_fut = std::pin::pin!(self.flush());
let flush_fut = self.flush();
pin_mut!(flush_fut);
flush_fut.poll(cx)
}

View File

@@ -80,9 +80,6 @@ pub const XLOG_XACT_ABORT: u8 = 0x20;
pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
// From standbydefs.h
pub const XLOG_RUNNING_XACTS: u8 = 0x10;
// From srlu.h
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;

View File

@@ -119,6 +119,11 @@ pub fn generate_pg_control(
// Generate new pg_control needed for bootstrap
checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
//reset some fields we don't want to preserve
//TODO Check this.
//We may need to determine the value from twophase data.
checkpoint.oldestActiveXid = 0;
//save new values in pg_control
pg_control.checkPoint = 0;
pg_control.checkPointCopy = checkpoint;

View File

@@ -44,26 +44,6 @@ impl DownloadError {
}
}
impl From<std::io::Error> for DownloadError {
fn from(value: std::io::Error) -> Self {
let needs_unwrap = value.kind() == std::io::ErrorKind::Other
&& value
.get_ref()
.and_then(|x| x.downcast_ref::<DownloadError>())
.is_some();
if needs_unwrap {
*value
.into_inner()
.expect("just checked")
.downcast::<DownloadError>()
.expect("just checked")
} else {
DownloadError::Other(value.into())
}
}
}
#[derive(Debug)]
pub enum TimeTravelError {
/// Validation or other error happened due to user input.
@@ -162,12 +142,13 @@ impl std::fmt::Display for TimeoutOrCancel {
impl std::error::Error for TimeoutOrCancel {}
impl TimeoutOrCancel {
pub fn caused(error: &anyhow::Error) -> Option<&Self> {
error.root_cause().downcast_ref()
}
/// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`].
pub fn caused_by_cancel(error: &anyhow::Error) -> bool {
error
.root_cause()
.downcast_ref::<Self>()
.is_some_and(Self::is_cancel)
Self::caused(error).is_some_and(Self::is_cancel)
}
pub fn is_cancel(&self) -> bool {

View File

@@ -623,7 +623,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
use super::*;
use bytes::Bytes;
use camino_tempfile::tempdir;
use futures_util::Stream;
use std::{collections::HashMap, io::Write};
async fn read_and_check_metadata(

View File

@@ -1040,7 +1040,7 @@ mod tests {
Some("test/prefix/"),
Some("/test/prefix/"),
];
let expected_outputs = [
let expected_outputs = vec![
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![

View File

@@ -73,8 +73,6 @@ where
if !*this.hit {
if let Poll::Ready(e) = this.cancellation.poll(cx) {
*this.hit = true;
// most likely this will be a std::io::Error wrapping a DownloadError
let e = Err(std::io::Error::from(e));
return Poll::Ready(Some(e));
}
@@ -132,8 +130,6 @@ mod tests {
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
"{inner:?}"
);
let e = DownloadError::from(e);
assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
tokio::select! {
_ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
@@ -150,7 +146,7 @@ mod tests {
let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
let mut stream = std::pin::pin!(stream);
// because the stream uses 120s timeout and we are paused, we advance to 120s right away.
// because the stream uses 120s timeout we are paused, we advance to 120s right away.
let first = stream.next();
let e = first.await.expect("there must be some").unwrap_err();
@@ -162,8 +158,6 @@ mod tests {
.is_some_and(|e| matches!(e, DownloadError::Timeout)),
"{inner:?}"
);
let e = DownloadError::from(e);
assert!(matches!(e, DownloadError::Timeout), "{e:?}");
cancel.cancel();

View File

@@ -1,6 +1,7 @@
// For details about authentication see docs/authentication.md
use arc_swap::ArcSwap;
use serde;
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
use anyhow::Result;
@@ -31,8 +32,6 @@ pub enum Scope {
// The scope used by pageservers in upcalls to storage controller and cloud control plane
#[serde(rename = "generations_api")]
GenerationsApi,
// Allows access to control plane managment API and some storage controller endpoints.
Admin,
}
/// JWT payload. See docs/authentication.md for the format
@@ -205,11 +204,12 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
// "scope": "tenant",
// "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
// "iss": "neon.controlplane",
// "exp": 1709200879,
// "iat": 1678442479
// }
// ```
//
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw";
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
// Check it can be validated with the public key
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);

View File

@@ -4,9 +4,7 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion {
_token: TaskTrackerToken,
}
pub struct Completion(TaskTrackerToken);
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
@@ -51,5 +49,5 @@ pub fn channel() -> (Completion, Barrier) {
tracker.close();
let token = tracker.token();
(Completion { _token: token }, Barrier(tracker))
(Completion(token), Barrier(tracker))
}

View File

@@ -1,7 +1,7 @@
use std::{
borrow::Cow,
fs::{self, File},
io::{self, Write},
io,
};
use camino::{Utf8Path, Utf8PathBuf};
@@ -161,48 +161,6 @@ pub async fn durable_rename(
Ok(())
}
/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`].
///
/// The file is first written to the specified `tmp_path`, and in a second
/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
/// and atomic rename guarantee that, if we crash at any point, there will never
/// be a partially written file at `final_path` (but maybe at `tmp_path`).
///
/// Callers are responsible for serializing calls of this function for a given `final_path`.
/// If they don't, there may be an error due to conflicting `tmp_path`, or there will
/// be no error and the content of `final_path` will be the "winner" caller's `content`.
/// I.e., the atomticity guarantees still hold.
pub fn overwrite(
final_path: &Utf8Path,
tmp_path: &Utf8Path,
content: &[u8],
) -> std::io::Result<()> {
let Some(final_path_parent) = final_path.parent() else {
return Err(std::io::Error::from_raw_os_error(
nix::errno::Errno::EINVAL as i32,
));
};
std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
let mut file = std::fs::OpenOptions::new()
.write(true)
// Use `create_new` so that, if we race with ourselves or something else,
// we bail out instead of causing damage.
.create_new(true)
.open(tmp_path)?;
file.write_all(content)?;
file.sync_all()?;
drop(file); // don't keep the fd open for longer than we have to
std::fs::rename(tmp_path, final_path)?;
let final_parent_dirfd = std::fs::OpenOptions::new()
.read(true)
.open(final_path_parent)?;
final_parent_dirfd.sync_all()?;
Ok(())
}
#[cfg(test)]
mod tests {

View File

@@ -45,7 +45,7 @@ impl Generation {
Self::Broken
}
pub const fn new(v: u32) -> Self {
pub fn new(v: u32) -> Self {
Self::Valid(v)
}

View File

@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tracing::{debug, info, info_span, warn, Instrument};
use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::str::FromStr;
@@ -156,10 +156,6 @@ pub struct ChannelWriter {
buffer: BytesMut,
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
/// Time spent waiting for the channel to make progress. It is not the same as time to upload a
/// buffer because we cannot know anything about that, but this should allow us to understand
/// the actual time taken without the time spent `std::thread::park`ed.
wait_time: std::time::Duration,
}
impl ChannelWriter {
@@ -172,7 +168,6 @@ impl ChannelWriter {
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
wait_time: std::time::Duration::ZERO,
}
}
@@ -185,8 +180,6 @@ impl ChannelWriter {
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
let wait_started_at = std::time::Instant::now();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
@@ -199,9 +192,6 @@ impl ChannelWriter {
// sending it to the client.
Ok(())
});
self.wait_time += wait_started_at.elapsed();
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
@@ -212,10 +202,6 @@ impl ChannelWriter {
pub fn flushed_bytes(&self) -> usize {
self.written
}
pub fn wait_time(&self) -> std::time::Duration {
self.wait_time
}
}
impl std::io::Write for ChannelWriter {
@@ -266,52 +252,22 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
let span = info_span!("blocking");
tokio::task::spawn_blocking(move || {
// there are situations where we lose scraped metrics under load, try to gather some clues
// since all nodes are queried this, keep the message count low.
let spawned_at = std::time::Instant::now();
let _span = span.entered();
let metrics = metrics::gather();
let gathered_at = std::time::Instant::now();
let res = encoder
.encode(&metrics, &mut writer)
.and_then(|_| writer.flush().map_err(|e| e.into()));
// this instant is not when we finally got the full response sent, sending is done by hyper
// in another task.
let encoded_at = std::time::Instant::now();
let spawned_in = spawned_at - started_at;
let collected_in = gathered_at - spawned_at;
// remove the wait time here in case the tcp connection was clogged
let encoded_in = encoded_at - gathered_at - writer.wait_time();
let total = encoded_at - started_at;
match res {
Ok(()) => {
tracing::info!(
bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
elapsed_ms = started_at.elapsed().as_millis(),
"responded /metrics"
);
}
Err(e) => {
// there is a chance that this error is not the BrokenPipe we generate in the writer
// for "closed connection", but it is highly unlikely.
tracing::warn!(
after_bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
"failed to write out /metrics response: {e:?}"
);
tracing::warn!("failed to write out /metrics response: {e:#}");
// semantics of this error are quite... unclear. we want to error the stream out to
// abort the response to somehow notify the client that we failed.
//

View File

@@ -415,6 +415,7 @@ mod tests {
use super::*;
use serde::ser::Serialize;
use serde_assert::{Deserializer, Serializer, Token, Tokens};
#[test]

View File

@@ -1,6 +1,6 @@
#![warn(missing_docs)]
use std::cmp::{Eq, Ordering};
use std::cmp::{Eq, Ordering, PartialOrd};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
@@ -249,6 +249,7 @@ where
mod tests {
use super::*;
use std::sync::Arc;
use std::time::Duration;
impl MonotonicCounter<i32> for i32 {
fn cnt_advance(&mut self, val: i32) {

View File

@@ -221,7 +221,7 @@ impl RcuWaitList {
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};
use std::time::Duration;
#[tokio::test]

View File

@@ -239,6 +239,7 @@ mod tests {
use std::{
convert::Infallible,
pin::{pin, Pin},
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};

View File

@@ -73,7 +73,6 @@ url.workspace = true
walkdir.workspace = true
metrics.workspace = true
pageserver_api.workspace = true
pageserver_compaction.workspace = true
postgres_connection.workspace = true
postgres_ffi.workspace = true
pq_proto.workspace = true

View File

@@ -217,20 +217,6 @@ impl Client {
}
}
pub async fn tenant_time_travel_remote_storage(
&self,
tenant_shard_id: TenantShardId,
timestamp: &str,
done_if_after: &str,
) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}",
self.mgmt_api_endpoint
);
self.request(Method::PUT, &uri, ()).await?;
Ok(())
}
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;
@@ -251,30 +237,21 @@ impl Client {
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<std::time::Duration>,
lazy: bool,
) -> Result<()> {
let req_body = TenantLocationConfigRequest {
tenant_id: tenant_shard_id,
config,
};
let mut path = reqwest::Url::parse(&format!(
let path = format!(
"{}/v1/tenant/{}/location_config",
self.mgmt_api_endpoint, tenant_shard_id
))
// Should always work: mgmt_api_endpoint is configuration, not user input.
.expect("Cannot build URL");
if lazy {
path.query_pairs_mut().append_pair("lazy", "true");
}
if let Some(flush_ms) = flush_ms {
path.query_pairs_mut()
.append_pair("flush_ms", &format!("{}", flush_ms.as_millis()));
}
self.request(Method::PUT, path, &req_body).await?;
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
} else {
path
};
self.request(Method::PUT, &path, &req_body).await?;
Ok(())
}

View File

@@ -1,54 +0,0 @@
[package]
name = "pageserver_compaction"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
const_format.workspace = true
consumption_metrics.workspace = true
crossbeam-utils.workspace = true
either.workspace = true
flate2.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
itertools.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pin-project-lite.workspace = true
rand.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-io-timeout.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true
utils.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

View File

@@ -1,51 +0,0 @@
# TODO
- If the key space can be perfectly partitioned at some key, perform planning on each
partition separately. For example, if we are compacting a level with layers like this:
```
:
+--+ +----+ : +------+
| | | | : | |
+--+ +----+ : +------+
:
+-----+ +-+ : +--------+
| | | | : | |
+-----+ +-+ : +--------+
:
```
At the dotted line, there is a natural split in the key space, such that all
layers are either on the left or the right of it. We can compact the
partitions separately. We could choose to create image layers for one
partition but not the other one, for example.
- All the layers don't have to be exactly the same size, we can choose to cut a
layer short or stretch it a little larger than the target size, if it helps
the overall system. We can help perfect partitions (see previous bullet point)
to happen more frequently, by choosing the cut points wisely. For example, try
to cut layers at boundaries of underlying image layers. And "snap to grid",
i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.
- Avoid rewriting layers when we'd just create an identical layer to an input
layer.
- Parallelism. The code is already split up into planning and execution, so that
we first split up the compaction work into "Jobs", and then execute them.
It would be straightforward to execute multiple jobs in parallel.
- Materialize extra pages in delta layers during compaction. This would reduce
read amplification. There has been the idea of partial image layers. Materializing
extra pages in the delta layers achieve the same goal, without introducing a new
concept.
## Simulator
- Expand the simulator for more workloads
- Automate a test suite that runs the simluator with different workloads and
spits out a table of results
- Model read amplification
- More sanity checking. One idea is to keep a reference count of each
MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
a MockRecord that is newer than PITR horizon is completely dropped. That would
indicate that the record was lost.

Some files were not shown because too many files have changed in this diff Show More