mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-24 08:30:37 +00:00
Compare commits
73 Commits
vlad/get-v
...
skyzh/reve
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3da782f651 | ||
|
|
f84931798b | ||
|
|
5984edaecd | ||
|
|
3eb83a0ebb | ||
|
|
4d426f6fbe | ||
|
|
d04af08567 | ||
|
|
54586d6b57 | ||
|
|
e5384ebefc | ||
|
|
60a232400b | ||
|
|
edd809747b | ||
|
|
48957e23b7 | ||
|
|
1d5e476c96 | ||
|
|
2b11466b59 | ||
|
|
b6bd75964f | ||
|
|
fcb77f3d8f | ||
|
|
c3a40a06f3 | ||
|
|
1b1320a263 | ||
|
|
e1b4d96b5b | ||
|
|
a8ec18c0f4 | ||
|
|
045bc6af8b | ||
|
|
c8ac4c054e | ||
|
|
896d51367e | ||
|
|
a691786ce2 | ||
|
|
2991d01b61 | ||
|
|
e895644555 | ||
|
|
62d77e263f | ||
|
|
b2bbc20311 | ||
|
|
5accf6e24a | ||
|
|
0881d4f9e3 | ||
|
|
975786265c | ||
|
|
c4059939e6 | ||
|
|
75baf83fce | ||
|
|
459c2af8c1 | ||
|
|
51a43b121c | ||
|
|
256058f2ab | ||
|
|
ceedc3ef73 | ||
|
|
5273c94c59 | ||
|
|
dedf66ba5b | ||
|
|
8283779ee8 | ||
|
|
b8f9e3a9eb | ||
|
|
ec3efc56a8 | ||
|
|
94f6b488ed | ||
|
|
a12e4261a3 | ||
|
|
cd449d66ea | ||
|
|
6f8f7c7de9 | ||
|
|
12487e662d | ||
|
|
5bcae3a86e | ||
|
|
47657f2df4 | ||
|
|
d669dacd71 | ||
|
|
837988b6c9 | ||
|
|
9c6145f0a9 | ||
|
|
2424d90883 | ||
|
|
cf3baf6039 | ||
|
|
9c48b5c4ab | ||
|
|
c671aeacd4 | ||
|
|
bc7a82caf2 | ||
|
|
b5246753bf | ||
|
|
c1095f4c52 | ||
|
|
1718c0b59b | ||
|
|
8107ae8377 | ||
|
|
555ee9fdd0 | ||
|
|
6921577cec | ||
|
|
20fff05699 | ||
|
|
f2767d2056 | ||
|
|
76b92e3389 | ||
|
|
03f8a42ed9 | ||
|
|
60e5a56a5a | ||
|
|
afda4420bd | ||
|
|
ce1673a8c4 | ||
|
|
532b0fa52b | ||
|
|
4de2f0f3e0 | ||
|
|
41464325c7 | ||
|
|
7257ffbf75 |
@@ -39,7 +39,7 @@ runs:
|
||||
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
|
||||
if [ "${PR_NUMBER}" != "null" ]; then
|
||||
BRANCH_OR_PR=pr-${PR_NUMBER}
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
|
||||
# Shortcut for special branches
|
||||
BRANCH_OR_PR=${GITHUB_REF_NAME}
|
||||
else
|
||||
|
||||
@@ -19,7 +19,7 @@ runs:
|
||||
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
|
||||
if [ "${PR_NUMBER}" != "null" ]; then
|
||||
BRANCH_OR_PR=pr-${PR_NUMBER}
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
|
||||
# Shortcut for special branches
|
||||
BRANCH_OR_PR=${GITHUB_REF_NAME}
|
||||
else
|
||||
|
||||
8
.github/workflows/actionlint.yml
vendored
8
.github/workflows/actionlint.yml
vendored
@@ -16,8 +16,14 @@ concurrency:
|
||||
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
jobs:
|
||||
actionlint:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name}}
|
||||
|
||||
actionlint:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
10
.github/workflows/benchmarking.yml
vendored
10
.github/workflows/benchmarking.yml
vendored
@@ -62,7 +62,7 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -214,7 +214,7 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
# Increase timeout to 8h, default timeout is 6h
|
||||
@@ -362,7 +362,7 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -461,7 +461,7 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -558,7 +558,7 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
|
||||
105
.github/workflows/build-build-tools-image.yml
vendored
Normal file
105
.github/workflows/build-build-tools-image.yml
vendored
Normal file
@@ -0,0 +1,105 @@
|
||||
name: Build build-tools image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
image-tag:
|
||||
description: "build-tools image tag"
|
||||
required: true
|
||||
type: string
|
||||
outputs:
|
||||
image-tag:
|
||||
description: "build-tools tag"
|
||||
value: ${{ inputs.image-tag }}
|
||||
image:
|
||||
description: "build-tools image"
|
||||
value: neondatabase/build-tools:${{ inputs.image-tag }}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
group: build-build-tools-image-${{ inputs.image-tag }}
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
check-image:
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
# This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions)
|
||||
build-image:
|
||||
needs: [ check-image ]
|
||||
if: needs.check-image.outputs.found == 'false'
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
|
||||
|
||||
env:
|
||||
IMAGE_TAG: ${{ inputs.image-tag }}
|
||||
|
||||
steps:
|
||||
- name: Check `input.tag` is correct
|
||||
env:
|
||||
INPUTS_IMAGE_TAG: ${{ inputs.image-tag }}
|
||||
CHECK_IMAGE_TAG : ${{ needs.check-image.outputs.image-tag }}
|
||||
run: |
|
||||
if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then
|
||||
echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'inputs.image-tag' (${CHECK_IMAGE_TAG})"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
mkdir -p /tmp/.docker-custom
|
||||
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- uses: docker/setup-buildx-action@v2
|
||||
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.build-tools
|
||||
cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
|
||||
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
run: |
|
||||
rm -rf /tmp/.docker-custom
|
||||
|
||||
merge-images:
|
||||
needs: [ build-image ]
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
IMAGE_TAG: ${{ inputs.image-tag }}
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Create multi-arch image
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
|
||||
neondatabase/build-tools:${IMAGE_TAG}-x64 \
|
||||
neondatabase/build-tools:${IMAGE_TAG}-arm64
|
||||
124
.github/workflows/build_and_push_docker_image.yml
vendored
124
.github/workflows/build_and_push_docker_image.yml
vendored
@@ -1,124 +0,0 @@
|
||||
name: Build and Push Docker Image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
dockerfile-path:
|
||||
required: true
|
||||
type: string
|
||||
image-name:
|
||||
required: true
|
||||
type: string
|
||||
outputs:
|
||||
build-tools-tag:
|
||||
description: "tag generated for build tools"
|
||||
value: ${{ jobs.tag.outputs.build-tools-tag }}
|
||||
|
||||
jobs:
|
||||
check-if-build-tools-dockerfile-changed:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
|
||||
steps:
|
||||
- name: Check if Dockerfile.buildtools has changed
|
||||
id: dockerfile
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
|
||||
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
|
||||
exit
|
||||
fi
|
||||
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
|
||||
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
|
||||
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
tag:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ check-if-build-tools-dockerfile-changed ]
|
||||
outputs:
|
||||
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
|
||||
|
||||
steps:
|
||||
- name: Get buildtools tag
|
||||
env:
|
||||
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
|
||||
IMAGE_TAG=$GITHUB_RUN_ID
|
||||
else
|
||||
IMAGE_TAG=pinned
|
||||
fi
|
||||
|
||||
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
|
||||
shell: bash
|
||||
id: buildtools-tag
|
||||
|
||||
kaniko:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: |
|
||||
/kaniko/executor \
|
||||
--reproducible \
|
||||
--snapshotMode=redo \
|
||||
--skip-unused-stages \
|
||||
--dockerfile ${{ inputs.dockerfile-path }} \
|
||||
--cache=true \
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
|
||||
|
||||
kaniko-arm:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: |
|
||||
/kaniko/executor \
|
||||
--reproducible \
|
||||
--snapshotMode=redo \
|
||||
--skip-unused-stages \
|
||||
--dockerfile ${{ inputs.dockerfile-path }} \
|
||||
--cache=true \
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
manifest:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
name: 'manifest'
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs:
|
||||
- tag
|
||||
- kaniko
|
||||
- kaniko-arm
|
||||
- check-if-build-tools-dockerfile-changed
|
||||
|
||||
steps:
|
||||
- name: Create manifest
|
||||
run: |
|
||||
docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
|
||||
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
|
||||
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
- name: Push manifest
|
||||
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
|
||||
363
.github/workflows/build_and_test.yml
vendored
363
.github/workflows/build_and_test.yml
vendored
@@ -5,6 +5,7 @@ on:
|
||||
branches:
|
||||
- main
|
||||
- release
|
||||
- release-proxy
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
@@ -27,24 +28,9 @@ env:
|
||||
jobs:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Disallow PRs from forks
|
||||
if: |
|
||||
github.event_name == 'pull_request' &&
|
||||
github.event.pull_request.head.repo.full_name != github.repository
|
||||
|
||||
run: |
|
||||
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
|
||||
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
|
||||
else
|
||||
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
|
||||
fi
|
||||
|
||||
echo >&2 "We don't run CI for PRs from forks"
|
||||
echo >&2 "${MESSAGE}"
|
||||
|
||||
exit 1
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name}}
|
||||
|
||||
cancel-previous-e2e-tests:
|
||||
needs: [ check-permissions ]
|
||||
@@ -82,6 +68,8 @@ jobs:
|
||||
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
|
||||
@@ -89,19 +77,25 @@ jobs:
|
||||
shell: bash
|
||||
id: build-tag
|
||||
|
||||
build-buildtools-image:
|
||||
check-build-tools-image:
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/build_and_push_docker_image.yml
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
build-build-tools-image:
|
||||
needs: [ check-build-tools-image ]
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
with:
|
||||
dockerfile-path: Dockerfile.buildtools
|
||||
image-name: build-tools
|
||||
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
check-codestyle-python:
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -130,10 +124,13 @@ jobs:
|
||||
run: poetry run mypy .
|
||||
|
||||
check-codestyle-rust:
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -197,10 +194,13 @@ jobs:
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
|
||||
build-neon:
|
||||
needs: [ check-permissions, tag, build-buildtools-image ]
|
||||
needs: [ check-permissions, tag, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# Raise locked memory limit for tokio-epoll-uring.
|
||||
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
|
||||
# io_uring will account the memory of the CQ and SQ as locked.
|
||||
@@ -438,10 +438,13 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
strategy:
|
||||
@@ -485,10 +488,13 @@ jobs:
|
||||
get-benchmarks-durations:
|
||||
outputs:
|
||||
json: ${{ steps.get-benchmark-durations.outputs.json }}
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
steps:
|
||||
@@ -515,10 +521,13 @@ jobs:
|
||||
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
|
||||
|
||||
benchmarks:
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
|
||||
needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
@@ -550,12 +559,15 @@ jobs:
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
|
||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -596,10 +608,13 @@ jobs:
|
||||
})
|
||||
|
||||
coverage-report:
|
||||
needs: [ check-permissions, regress-tests, build-buildtools-image ]
|
||||
needs: [ check-permissions, regress-tests, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -697,166 +712,146 @@ jobs:
|
||||
})
|
||||
|
||||
trigger-e2e-tests:
|
||||
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
|
||||
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
|
||||
needs: [ check-permissions, promote-images, tag ]
|
||||
uses: ./.github/workflows/trigger-e2e-tests.yml
|
||||
secrets: inherit
|
||||
|
||||
neon-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure ECR and Docker Hub login
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
|
||||
echo "::add-mask::${DOCKERHUB_AUTH}"
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
|
||||
cat <<-EOF > /kaniko/.docker/config.json
|
||||
{
|
||||
"auths": {
|
||||
"https://index.docker.io/v1/": {
|
||||
"auth": "${DOCKERHUB_AUTH}"
|
||||
}
|
||||
},
|
||||
"credHelpers": {
|
||||
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Kaniko build neon
|
||||
run:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
- uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache
|
||||
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
compute-tools-image:
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
|
||||
- name: Configure ECR and Docker Hub login
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
|
||||
echo "::add-mask::${DOCKERHUB_AUTH}"
|
||||
|
||||
cat <<-EOF > /kaniko/.docker/config.json
|
||||
{
|
||||
"auths": {
|
||||
"https://index.docker.io/v1/": {
|
||||
"auth": "${DOCKERHUB_AUTH}"
|
||||
}
|
||||
},
|
||||
"credHelpers": {
|
||||
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
|
||||
- name: Kaniko build compute tools
|
||||
run:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-tools
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
rm -rf .docker-custom
|
||||
|
||||
compute-node-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
# Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
|
||||
# Should be prevented by https://github.com/neondatabase/neon/issues/4281
|
||||
options: --add-host=download.osgeo.org:140.211.15.30
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure ECR and Docker Hub login
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
|
||||
echo "::add-mask::${DOCKERHUB_AUTH}"
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
# Disable parallelism for docker buildkit.
|
||||
# As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
|
||||
config-inline: |
|
||||
[worker.oci]
|
||||
max-parallelism = 1
|
||||
|
||||
cat <<-EOF > /kaniko/.docker/config.json
|
||||
{
|
||||
"auths": {
|
||||
"https://index.docker.io/v1/": {
|
||||
"auth": "${DOCKERHUB_AUTH}"
|
||||
}
|
||||
},
|
||||
"credHelpers": {
|
||||
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Kaniko build compute node with extensions
|
||||
run:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg PG_VERSION=${{ matrix.version }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-node
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
--cleanup
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
- name: Build compute-node image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
PG_VERSION=${{ matrix.version }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Build compute-tools image
|
||||
# compute-tools are Postgres independent, so build it only once
|
||||
if: ${{ matrix.version == 'v16' }}
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
target: compute-tools-image
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
vm-compute-node-image:
|
||||
needs: [ check-permissions, tag, compute-node-image ]
|
||||
@@ -900,7 +895,7 @@ jobs:
|
||||
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
test-images:
|
||||
needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
|
||||
needs: [ check-permissions, tag, neon-image, compute-node-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
|
||||
steps:
|
||||
@@ -934,7 +929,8 @@ jobs:
|
||||
fi
|
||||
|
||||
- name: Verify docker-compose example
|
||||
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
|
||||
timeout-minutes: 20
|
||||
run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
|
||||
|
||||
- name: Print logs and clean up
|
||||
if: always()
|
||||
@@ -967,9 +963,7 @@ jobs:
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
|
||||
|
||||
- name: Add latest tag to images
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
@@ -981,9 +975,7 @@ jobs:
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
- name: Push images to production ECR
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
|
||||
@@ -1007,9 +999,7 @@ jobs:
|
||||
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Push latest tags to Docker Hub
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
@@ -1099,7 +1089,7 @@ jobs:
|
||||
|
||||
deploy:
|
||||
needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
|
||||
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
@@ -1134,14 +1124,28 @@ jobs:
|
||||
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f deployStorage=false \
|
||||
-f deployStorageBroker=false \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Create git tag
|
||||
if: github.ref_name == 'release'
|
||||
if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
@@ -1154,6 +1158,7 @@ jobs:
|
||||
sha: context.sha,
|
||||
})
|
||||
|
||||
# TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
|
||||
- name: Create GitHub release
|
||||
if: github.ref_name == 'release'
|
||||
uses: actions/github-script@v7
|
||||
@@ -1205,3 +1210,11 @@ jobs:
|
||||
|
||||
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
|
||||
done
|
||||
|
||||
pin-build-tools-image:
|
||||
needs: [ build-build-tools-image, promote-images, regress-tests ]
|
||||
if: github.ref_name == 'main'
|
||||
uses: ./.github/workflows/pin-build-tools-image.yml
|
||||
with:
|
||||
from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
58
.github/workflows/check-build-tools-image.yml
vendored
Normal file
58
.github/workflows/check-build-tools-image.yml
vendored
Normal file
@@ -0,0 +1,58 @@
|
||||
name: Check build-tools image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
outputs:
|
||||
image-tag:
|
||||
description: "build-tools image tag"
|
||||
value: ${{ jobs.check-image.outputs.tag }}
|
||||
found:
|
||||
description: "Whether the image is found in the registry"
|
||||
value: ${{ jobs.check-image.outputs.found }}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
check-image:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
|
||||
found: ${{ steps.check-image.outputs.found }}
|
||||
|
||||
steps:
|
||||
- name: Get build-tools image tag for the current commit
|
||||
id: get-build-tools-tag
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
LAST_BUILD_TOOLS_SHA=$(
|
||||
gh api \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
--method GET \
|
||||
--field path=Dockerfile.build-tools \
|
||||
--field sha=${COMMIT_SHA} \
|
||||
--field per_page=1 \
|
||||
--jq ".[0].sha" \
|
||||
"/repos/${GITHUB_REPOSITORY}/commits"
|
||||
)
|
||||
echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
- name: Check if such tag found in the registry
|
||||
id: check-image
|
||||
env:
|
||||
IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
|
||||
run: |
|
||||
if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
|
||||
found=true
|
||||
else
|
||||
found=false
|
||||
fi
|
||||
|
||||
echo "found=${found}" | tee -a $GITHUB_OUTPUT
|
||||
36
.github/workflows/check-permissions.yml
vendored
Normal file
36
.github/workflows/check-permissions.yml
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
name: Check Permissions
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
github-event-name:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Disallow CI runs on PRs from forks
|
||||
if: |
|
||||
inputs.github-event-name == 'pull_request' &&
|
||||
github.event.pull_request.head.repo.full_name != github.repository
|
||||
run: |
|
||||
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
|
||||
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
|
||||
else
|
||||
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
|
||||
fi
|
||||
|
||||
# TODO: use actions/github-script to post this message as a PR comment
|
||||
echo >&2 "We don't run CI for PRs from forks"
|
||||
echo >&2 "${MESSAGE}"
|
||||
|
||||
exit 1
|
||||
32
.github/workflows/cleanup-caches-by-a-branch.yml
vendored
Normal file
32
.github/workflows/cleanup-caches-by-a-branch.yml
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
# A workflow from
|
||||
# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries
|
||||
|
||||
name: cleanup caches by a branch
|
||||
on:
|
||||
pull_request:
|
||||
types:
|
||||
- closed
|
||||
|
||||
jobs:
|
||||
cleanup:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Cleanup
|
||||
run: |
|
||||
gh extension install actions/gh-actions-cache
|
||||
|
||||
echo "Fetching list of cache key"
|
||||
cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 )
|
||||
|
||||
## Setting this to not fail the workflow while deleting cache keys.
|
||||
set +e
|
||||
echo "Deleting caches..."
|
||||
for cacheKey in $cacheKeysForPR
|
||||
do
|
||||
gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
|
||||
done
|
||||
echo "Done"
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
REPO: ${{ github.repository }}
|
||||
BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge
|
||||
38
.github/workflows/neon_extra_builds.yml
vendored
38
.github/workflows/neon_extra_builds.yml
vendored
@@ -20,7 +20,25 @@ env:
|
||||
COPT: '-Werror'
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name}}
|
||||
|
||||
check-build-tools-image:
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
build-build-tools-image:
|
||||
needs: [ check-build-tools-image ]
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
with:
|
||||
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
check-macos-build:
|
||||
needs: [ check-permissions ]
|
||||
if: |
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
@@ -116,8 +134,8 @@ jobs:
|
||||
run: ./run_clippy.sh
|
||||
|
||||
check-linux-arm-build:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
timeout-minutes: 90
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
|
||||
env:
|
||||
@@ -130,7 +148,10 @@ jobs:
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -237,12 +258,15 @@ jobs:
|
||||
cargo nextest run --package remote_storage --test test_real_azure
|
||||
|
||||
check-codestyle-rust-arm:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
timeout-minutes: 90
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -309,13 +333,17 @@ jobs:
|
||||
run: cargo deny check
|
||||
|
||||
gather-rust-build-stats:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
if: |
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
github.ref_name == 'main'
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
env:
|
||||
|
||||
72
.github/workflows/pin-build-tools-image.yml
vendored
Normal file
72
.github/workflows/pin-build-tools-image.yml
vendored
Normal file
@@ -0,0 +1,72 @@
|
||||
name: 'Pin build-tools image'
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
from-tag:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
workflow_call:
|
||||
inputs:
|
||||
from-tag:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
group: pin-build-tools-image-${{ inputs.from-tag }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: pinned
|
||||
|
||||
steps:
|
||||
- name: Check if we really need to pin the image
|
||||
id: check-manifests
|
||||
run: |
|
||||
docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
|
||||
docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json
|
||||
|
||||
if diff ${FROM_TAG}.json ${TO_TAG}.json; then
|
||||
skip=true
|
||||
else
|
||||
skip=false
|
||||
fi
|
||||
|
||||
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
|
||||
neondatabase/build-tools:${FROM_TAG}
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
|
||||
neondatabase/build-tools:${FROM_TAG}
|
||||
83
.github/workflows/release.yml
vendored
83
.github/workflows/release.yml
vendored
@@ -2,12 +2,31 @@ name: Create Release Branch
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 6 * * 1'
|
||||
# It should be kept in sync with if-condition in jobs
|
||||
- cron: '0 6 * * MON' # Storage release
|
||||
- cron: '0 6 * * THU' # Proxy release
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
create-storage-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Storage release PR'
|
||||
required: false
|
||||
create-proxy-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Proxy release PR'
|
||||
required: false
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
jobs:
|
||||
create_release_branch:
|
||||
runs-on: [ ubuntu-latest ]
|
||||
create-storage-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
@@ -18,27 +37,67 @@ jobs:
|
||||
with:
|
||||
ref: main
|
||||
|
||||
- name: Get current date
|
||||
id: date
|
||||
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
|
||||
- name: Set environment variables
|
||||
run: |
|
||||
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
|
||||
- name: Create release branch
|
||||
run: git checkout -b releases/${{ steps.date.outputs.date }}
|
||||
run: git checkout -b $RELEASE_BRANCH
|
||||
|
||||
- name: Push new branch
|
||||
run: git push origin releases/${{ steps.date.outputs.date }}
|
||||
run: git push origin $RELEASE_BRANCH
|
||||
|
||||
- name: Create pull request into release
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
cat << EOF > body.md
|
||||
## Release ${{ steps.date.outputs.date }}
|
||||
## Release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this PR using 'Create a merge commit'!**
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
gh pr create --title "Release ${{ steps.date.outputs.date }}" \
|
||||
gh pr create --title "Release ${RELEASE_DATE}" \
|
||||
--body-file "body.md" \
|
||||
--head "releases/${{ steps.date.outputs.date }}" \
|
||||
--head "${RELEASE_BRANCH}" \
|
||||
--base "release"
|
||||
|
||||
create-proxy-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: main
|
||||
|
||||
- name: Set environment variables
|
||||
run: |
|
||||
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
|
||||
- name: Create release branch
|
||||
run: git checkout -b $RELEASE_BRANCH
|
||||
|
||||
- name: Push new branch
|
||||
run: git push origin $RELEASE_BRANCH
|
||||
|
||||
- name: Create pull request into release
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
cat << EOF > body.md
|
||||
## Proxy release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
gh pr create --title "Proxy release ${RELEASE_DATE}}" \
|
||||
--body-file "body.md" \
|
||||
--head "${RELEASE_BRANCH}" \
|
||||
--base "release-proxy"
|
||||
|
||||
2
.github/workflows/trigger-e2e-tests.yml
vendored
2
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -51,6 +51,8 @@ jobs:
|
||||
echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
|
||||
|
||||
70
.github/workflows/update_build_tools_image.yml
vendored
70
.github/workflows/update_build_tools_image.yml
vendored
@@ -1,70 +0,0 @@
|
||||
name: 'Update build tools image tag'
|
||||
|
||||
# This workflow it used to update tag of build tools in ECR.
|
||||
# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
from-tag:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
to-tag:
|
||||
description: 'Destination tag'
|
||||
required: true
|
||||
type: string
|
||||
default: 'pinned'
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
|
||||
env:
|
||||
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
|
||||
steps:
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.21'
|
||||
|
||||
- name: Install crane
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
|
||||
|
||||
- name: Copy images
|
||||
run: |
|
||||
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
|
||||
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -9,6 +9,7 @@ test_output/
|
||||
neon.iml
|
||||
/.neon
|
||||
/integration_tests/.neon
|
||||
compaction-suite-results.*
|
||||
|
||||
# Coverage
|
||||
*.profraw
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||
/control_plane/ @neondatabase/compute @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
|
||||
/control_plane/attachment_service @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/safekeepers
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||
/pageserver/ @neondatabase/storage
|
||||
/pgxn/ @neondatabase/compute
|
||||
/proxy/ @neondatabase/proxy
|
||||
|
||||
@@ -74,16 +74,11 @@ We're using the following approach to make it work:
|
||||
|
||||
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
|
||||
|
||||
## How do I add the "pinned" tag to an buildtools image?
|
||||
We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
|
||||
## How do I make build-tools image "pinned"
|
||||
|
||||
You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
|
||||
or using GitHub CLI:
|
||||
It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow.
|
||||
|
||||
```bash
|
||||
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
|
||||
-f from-tag=6254913013 \
|
||||
-f to-tag=pinned \
|
||||
|
||||
# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
|
||||
```
|
||||
gh workflow -R neondatabase/neon run pin-build-tools-image.yml \
|
||||
-f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e
|
||||
```
|
||||
|
||||
50
Cargo.lock
generated
50
Cargo.lock
generated
@@ -284,6 +284,7 @@ dependencies = [
|
||||
"diesel_migrations",
|
||||
"futures",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
@@ -3497,6 +3498,7 @@ dependencies = [
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pageserver_compaction",
|
||||
"pin-project-lite",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
@@ -3551,6 +3553,7 @@ dependencies = [
|
||||
"const_format",
|
||||
"enum-map",
|
||||
"hex",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"itertools",
|
||||
"postgres_ffi",
|
||||
@@ -3586,6 +3589,53 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_compaction"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap",
|
||||
"const_format",
|
||||
"consumption_metrics",
|
||||
"criterion",
|
||||
"crossbeam-utils",
|
||||
"either",
|
||||
"fail",
|
||||
"flate2",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"itertools",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pin-project-lite",
|
||||
"rand 0.8.5",
|
||||
"smallvec",
|
||||
"svg_fmt",
|
||||
"sync_wrapper",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"utils",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking"
|
||||
version = "2.1.1"
|
||||
|
||||
@@ -5,6 +5,7 @@ members = [
|
||||
"control_plane",
|
||||
"control_plane/attachment_service",
|
||||
"pageserver",
|
||||
"pageserver/compaction",
|
||||
"pageserver/ctl",
|
||||
"pageserver/client",
|
||||
"pageserver/pagebench",
|
||||
@@ -199,6 +200,7 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
pageserver_client = { path = "./pageserver/client" }
|
||||
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
|
||||
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
|
||||
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
|
||||
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
||||
|
||||
@@ -786,6 +786,22 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_partman"
|
||||
# compile pg_partman extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-partman-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
|
||||
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -829,6 +845,7 @@ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -874,7 +891,17 @@ ENV BUILD_TAG=$BUILD_TAG
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
|
||||
RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Final compute-tools image
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM debian:bullseye-slim AS compute-tools-image
|
||||
|
||||
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
# First transient image to build compute_tools binaries
|
||||
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
|
||||
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
|
||||
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
|
||||
ARG RUSTC_WRAPPER=cachepot
|
||||
ENV AWS_REGION=eu-central-1
|
||||
ENV CACHEPOT_S3_KEY_PREFIX=cachepot
|
||||
ARG CACHEPOT_BUCKET=neon-github-dev
|
||||
#ARG AWS_ACCESS_KEY_ID
|
||||
#ARG AWS_SECRET_ACCESS_KEY
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN set -e \
|
||||
&& mold -run cargo build -p compute_tools --locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
# Final image that only has one binary
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl
|
||||
12
README.md
12
README.md
@@ -5,7 +5,7 @@
|
||||
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
|
||||
|
||||
## Quick start
|
||||
Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
|
||||
Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
|
||||
|
||||
Alternatively, compile and run the project [locally](#running-local-installation).
|
||||
|
||||
@@ -230,6 +230,10 @@ postgres=# select * from t;
|
||||
> cargo neon stop
|
||||
```
|
||||
|
||||
#### Handling build failures
|
||||
|
||||
If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.
|
||||
|
||||
## Running tests
|
||||
|
||||
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
|
||||
@@ -259,6 +263,12 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th
|
||||
> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
|
||||
> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
|
||||
|
||||
## Cleanup
|
||||
|
||||
For cleaning up the source tree from build artifacts, run `make clean` in the source directory.
|
||||
|
||||
For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned!
|
||||
|
||||
## Documentation
|
||||
|
||||
[docs](/docs) Contains a top-level overview of all available markdown documentation.
|
||||
|
||||
@@ -82,6 +82,12 @@ pub fn write_postgres_conf(
|
||||
ComputeMode::Replica => {
|
||||
// hot_standby is 'on' by default, but let's be explicit
|
||||
writeln!(file, "hot_standby=on")?;
|
||||
|
||||
// Inform the replica about the primary state
|
||||
// Default is 'false'
|
||||
if let Some(primary_is_running) = spec.primary_is_running {
|
||||
writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -655,6 +655,9 @@ pub fn handle_grants(
|
||||
// remove this code if possible. The worst thing that could happen is that
|
||||
// user won't be able to use public schema in NEW databases created in the
|
||||
// very OLD project.
|
||||
//
|
||||
// Also, alter default permissions so that relations created by extensions can be
|
||||
// used by neon_superuser without permission issues.
|
||||
let grant_query = "DO $$\n\
|
||||
BEGIN\n\
|
||||
IF EXISTS(\n\
|
||||
@@ -673,6 +676,15 @@ pub fn handle_grants(
|
||||
GRANT CREATE ON SCHEMA public TO web_access;\n\
|
||||
END IF;\n\
|
||||
END IF;\n\
|
||||
IF EXISTS(\n\
|
||||
SELECT nspname\n\
|
||||
FROM pg_catalog.pg_namespace\n\
|
||||
WHERE nspname = 'public'\n\
|
||||
)\n\
|
||||
THEN\n\
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
|
||||
END IF;\n\
|
||||
END\n\
|
||||
$$;"
|
||||
.to_string();
|
||||
@@ -777,6 +789,12 @@ BEGIN
|
||||
END
|
||||
$$;"#,
|
||||
"GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
|
||||
// Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else.
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
// Add new migrations below.
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
@@ -803,8 +821,13 @@ $$;"#,
|
||||
client.simple_query(query)?;
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
info!("Running migration:\n{}\n", migrations[current_migration]);
|
||||
client.simple_query(migrations[current_migration])?;
|
||||
let migration = &migrations[current_migration];
|
||||
if migration.is_empty() {
|
||||
info!("Skip migration id={}", current_migration);
|
||||
} else {
|
||||
info!("Running migration:\n{}\n", migration);
|
||||
client.simple_query(migration)?;
|
||||
}
|
||||
current_migration += 1;
|
||||
}
|
||||
let setval = format!(
|
||||
|
||||
@@ -18,6 +18,7 @@ clap.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hyper.workspace = true
|
||||
humantime.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
|
||||
9
control_plane/attachment_service/src/auth.rs
Normal file
9
control_plane/attachment_service/src/auth.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
use utils::auth::{AuthError, Claims, Scope};
|
||||
|
||||
pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
|
||||
if claims.scope != required_scope {
|
||||
return Err(AuthError("Scope mismatch. Permission denied".into()));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -4,15 +4,15 @@ use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use pageserver_api::models::{
|
||||
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
||||
TimelineCreateRequest,
|
||||
TenantTimeTravelRequest, TimelineCreateRequest,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use utils::auth::SwappableJwtAuth;
|
||||
use utils::http::endpoint::{auth_middleware, request_span};
|
||||
use utils::http::request::parse_request_param;
|
||||
use utils::auth::{Scope, SwappableJwtAuth};
|
||||
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
|
||||
use utils::http::request::{must_get_query_param, parse_request_param};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::{
|
||||
@@ -25,12 +25,12 @@ use utils::{
|
||||
id::NodeId,
|
||||
};
|
||||
|
||||
use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
|
||||
|
||||
use control_plane::attachment_service::{
|
||||
AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
|
||||
TenantShardMigrateRequest,
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
||||
};
|
||||
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
||||
|
||||
use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
|
||||
|
||||
/// State available to HTTP request handlers
|
||||
#[derive(Clone)]
|
||||
@@ -64,21 +64,18 @@ fn get_state(request: &Request<Body>) -> &HttpState {
|
||||
|
||||
/// Pageserver calls into this on startup, to learn which tenants it should attach
|
||||
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::GenerationsApi)?;
|
||||
|
||||
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
.re_attach(reattach_req)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?,
|
||||
)
|
||||
json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
|
||||
}
|
||||
|
||||
/// Pageserver calls into this before doing deletions, to confirm that it still
|
||||
/// holds the latest generation for the tenants with deletions enqueued
|
||||
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::GenerationsApi)?;
|
||||
|
||||
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.validate(validate_req))
|
||||
@@ -88,6 +85,8 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
|
||||
/// (in the real control plane this is unnecessary, because the same program is managing
|
||||
/// generation numbers and doing attachments).
|
||||
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
|
||||
@@ -102,6 +101,8 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
}
|
||||
|
||||
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req);
|
||||
@@ -113,6 +114,8 @@ async fn handle_tenant_create(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
@@ -171,6 +174,8 @@ async fn handle_tenant_location_config(
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
@@ -180,11 +185,47 @@ async fn handle_tenant_location_config(
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_time_travel_remote_storage(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
|
||||
|
||||
let timestamp_raw = must_get_query_param(&req, "travel_to")?;
|
||||
let _timestamp = humantime::parse_rfc3339(×tamp_raw).map_err(|_e| {
|
||||
ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Invalid time for travel_to: {timestamp_raw:?}"
|
||||
))
|
||||
})?;
|
||||
|
||||
let done_if_after_raw = must_get_query_param(&req, "done_if_after")?;
|
||||
let _done_if_after = humantime::parse_rfc3339(&done_if_after_raw).map_err(|_e| {
|
||||
ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Invalid time for done_if_after: {done_if_after_raw:?}"
|
||||
))
|
||||
})?;
|
||||
|
||||
service
|
||||
.tenant_time_travel_remote_storage(
|
||||
&time_travel_req,
|
||||
tenant_id,
|
||||
timestamp_raw,
|
||||
done_if_after_raw,
|
||||
)
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_tenant_delete(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
service.tenant_delete(tenant_id).await
|
||||
@@ -197,6 +238,8 @@ async fn handle_tenant_timeline_create(
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
@@ -211,6 +254,8 @@ async fn handle_tenant_timeline_delete(
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
@@ -224,6 +269,7 @@ async fn handle_tenant_timeline_passthrough(
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let Some(path) = req.uri().path_and_query() else {
|
||||
// This should never happen, our request router only calls us if there is a path
|
||||
@@ -267,11 +313,15 @@ async fn handle_tenant_locate(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
state.service.node_register(register_req).await?;
|
||||
@@ -279,17 +329,23 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
}
|
||||
|
||||
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.node_list().await?)
|
||||
}
|
||||
|
||||
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
|
||||
}
|
||||
|
||||
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
|
||||
if node_id != config_req.node_id {
|
||||
@@ -299,13 +355,18 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
}
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.node_configure(config_req).await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_split(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
|
||||
|
||||
@@ -319,6 +380,8 @@ async fn handle_tenant_shard_migrate(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
@@ -331,22 +394,30 @@ async fn handle_tenant_shard_migrate(
|
||||
|
||||
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
|
||||
}
|
||||
|
||||
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
state.service.tenants_dump()
|
||||
}
|
||||
|
||||
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
state.service.scheduler_dump()
|
||||
}
|
||||
|
||||
async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.consistency_check().await?)
|
||||
@@ -403,6 +474,12 @@ where
|
||||
.await
|
||||
}
|
||||
|
||||
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
|
||||
check_permission_with(request, |claims| {
|
||||
crate::auth::check_permission(claims, required_scope)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn make_router(
|
||||
service: Arc<Service>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
@@ -477,6 +554,9 @@ pub fn make_router(
|
||||
.put("/v1/tenant/:tenant_id/location_config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_location_config)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
|
||||
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
|
||||
})
|
||||
// Timeline operations
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_delete)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::seqwait::MonotonicCounter;
|
||||
|
||||
mod auth;
|
||||
mod compute_hook;
|
||||
pub mod http;
|
||||
pub mod metrics;
|
||||
|
||||
@@ -79,13 +79,38 @@ impl Secrets {
|
||||
"neon-storage-controller-control-plane-jwt-token";
|
||||
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
|
||||
|
||||
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
|
||||
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
|
||||
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
|
||||
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
|
||||
|
||||
/// Load secrets from, in order of preference:
|
||||
/// - CLI args if database URL is provided on the CLI
|
||||
/// - Environment variables if DATABASE_URL is set.
|
||||
/// - AWS Secrets Manager secrets
|
||||
async fn load(args: &Cli) -> anyhow::Result<Self> {
|
||||
match &args.database_url {
|
||||
Some(url) => Self::load_cli(url, args),
|
||||
None => Self::load_aws_sm().await,
|
||||
None => match std::env::var(Self::DATABASE_URL_ENV) {
|
||||
Ok(database_url) => Self::load_env(database_url),
|
||||
Err(_) => Self::load_aws_sm().await,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn load_env(database_url: String) -> anyhow::Result<Self> {
|
||||
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
|
||||
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
|
||||
Err(_) => None,
|
||||
};
|
||||
Ok(Self {
|
||||
database_url,
|
||||
public_key,
|
||||
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
|
||||
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn load_aws_sm() -> anyhow::Result<Self> {
|
||||
let Ok(region) = std::env::var("AWS_REGION") else {
|
||||
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use serde::Serialize;
|
||||
use utils::id::NodeId;
|
||||
|
||||
@@ -10,7 +10,7 @@ use crate::persistence::NodePersistence;
|
||||
///
|
||||
/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
|
||||
/// implementation of serialization on this type is only for debug dumps.
|
||||
#[derive(Clone, Serialize, Eq, PartialEq)]
|
||||
#[derive(Clone, Serialize)]
|
||||
pub(crate) struct Node {
|
||||
pub(crate) id: NodeId,
|
||||
|
||||
|
||||
@@ -6,10 +6,10 @@ use std::time::Duration;
|
||||
use self::split_state::SplitState;
|
||||
use camino::Utf8Path;
|
||||
use camino::Utf8PathBuf;
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use pageserver_api::controller_api::NodeSchedulingPolicy;
|
||||
use pageserver_api::models::TenantConfig;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -130,24 +130,10 @@ impl Persistence {
|
||||
}
|
||||
|
||||
/// At startup, populate the list of nodes which our shards may be placed on
|
||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
|
||||
let nodes: Vec<Node> = self
|
||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
|
||||
let nodes: Vec<NodePersistence> = self
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::nodes::table
|
||||
.load::<NodePersistence>(conn)?
|
||||
.into_iter()
|
||||
.map(|n| Node {
|
||||
id: NodeId(n.node_id as u64),
|
||||
// At startup we consider a node offline until proven otherwise.
|
||||
availability: NodeAvailability::Offline,
|
||||
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
|
||||
.expect("Bad scheduling policy in DB"),
|
||||
listen_http_addr: n.listen_http_addr,
|
||||
listen_http_port: n.listen_http_port as u16,
|
||||
listen_pg_addr: n.listen_pg_addr,
|
||||
listen_pg_port: n.listen_pg_port as u16,
|
||||
})
|
||||
.collect::<Vec<Node>>())
|
||||
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
|
||||
})
|
||||
.await?;
|
||||
|
||||
@@ -156,6 +142,31 @@ impl Persistence {
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
pub(crate) async fn update_node(
|
||||
&self,
|
||||
input_node_id: NodeId,
|
||||
input_scheduling: NodeSchedulingPolicy,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::nodes::dsl::*;
|
||||
let updated = self
|
||||
.with_conn(move |conn| {
|
||||
let updated = diesel::update(nodes)
|
||||
.filter(node_id.eq(input_node_id.0 as i64))
|
||||
.set((scheduling_policy.eq(String::from(input_scheduling)),))
|
||||
.execute(conn)?;
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
if updated != 1 {
|
||||
Err(DatabaseError::Logical(format!(
|
||||
"Node {node_id:?} not found for update",
|
||||
)))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// At startup, load the high level state for shards, such as their config + policy. This will
|
||||
/// be enriched at runtime with state discovered on pageservers.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
@@ -506,7 +517,7 @@ pub(crate) struct TenantShardPersistence {
|
||||
}
|
||||
|
||||
/// Parts of [`crate::node::Node`] that are stored durably
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
|
||||
#[diesel(table_name = crate::schema::nodes)]
|
||||
pub(crate) struct NodePersistence {
|
||||
pub(crate) node_id: i64,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::persistence::Persistence;
|
||||
use crate::service;
|
||||
use control_plane::attachment_service::NodeAvailability;
|
||||
use pageserver_api::controller_api::NodeAvailability;
|
||||
use pageserver_api::models::{
|
||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
||||
};
|
||||
@@ -438,7 +438,7 @@ impl Reconciler {
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
||||
// Nothing to do
|
||||
tracing::info!("Observed configuration already correct.")
|
||||
tracing::info!(%node_id, "Observed configuration already correct.")
|
||||
}
|
||||
_ => {
|
||||
// In all cases other than a matching observed configuration, we will
|
||||
@@ -449,7 +449,7 @@ impl Reconciler {
|
||||
.increment_generation(self.tenant_shard_id, node_id)
|
||||
.await?;
|
||||
wanted_conf.generation = self.generation.into();
|
||||
tracing::info!("Observed configuration requires update.");
|
||||
tracing::info!(%node_id, "Observed configuration requires update.");
|
||||
self.location_config(node_id, wanted_conf, None).await?;
|
||||
self.compute_notify().await?;
|
||||
}
|
||||
|
||||
@@ -175,10 +175,34 @@ impl Scheduler {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn schedule_shard(
|
||||
&mut self,
|
||||
hard_exclude: &[NodeId],
|
||||
) -> Result<NodeId, ScheduleError> {
|
||||
/// Where we have several nodes to choose from, for example when picking a secondary location
|
||||
/// to promote to an attached location, this method may be used to pick the best choice based
|
||||
/// on the scheduler's knowledge of utilization and availability.
|
||||
///
|
||||
/// If the input is empty, or all the nodes are not elegible for scheduling, return None: the
|
||||
/// caller can pick a node some other way.
|
||||
pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
|
||||
if nodes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let node = nodes
|
||||
.iter()
|
||||
.map(|node_id| {
|
||||
let may_schedule = self
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.may_schedule)
|
||||
.unwrap_or(false);
|
||||
(*node_id, may_schedule)
|
||||
})
|
||||
.max_by_key(|(_n, may_schedule)| *may_schedule);
|
||||
|
||||
// If even the preferred node has may_schedule==false, return None
|
||||
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
|
||||
}
|
||||
|
||||
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
|
||||
if self.nodes.is_empty() {
|
||||
return Err(ScheduleError::NoPageservers);
|
||||
}
|
||||
@@ -227,44 +251,45 @@ impl Scheduler {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test_utils {
|
||||
|
||||
use crate::node::Node;
|
||||
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use std::collections::HashMap;
|
||||
use utils::id::NodeId;
|
||||
/// Test helper: synthesize the requested number of nodes, all in active state.
|
||||
///
|
||||
/// Node IDs start at one.
|
||||
pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
|
||||
(1..n + 1)
|
||||
.map(|i| {
|
||||
(
|
||||
NodeId(i),
|
||||
Node {
|
||||
id: NodeId(i),
|
||||
availability: NodeAvailability::Active,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
listen_http_addr: format!("httphost-{i}"),
|
||||
listen_http_port: 80 + i as u16,
|
||||
listen_pg_addr: format!("pghost-{i}"),
|
||||
listen_pg_port: 5432 + i as u16,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{node::Node, tenant_state::IntentState};
|
||||
|
||||
use crate::tenant_state::IntentState;
|
||||
#[test]
|
||||
fn scheduler_basic() -> anyhow::Result<()> {
|
||||
let mut nodes = HashMap::new();
|
||||
nodes.insert(
|
||||
NodeId(1),
|
||||
Node {
|
||||
id: NodeId(1),
|
||||
availability: NodeAvailability::Active,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
listen_http_addr: String::new(),
|
||||
listen_http_port: 0,
|
||||
listen_pg_addr: String::new(),
|
||||
listen_pg_port: 0,
|
||||
},
|
||||
);
|
||||
|
||||
nodes.insert(
|
||||
NodeId(2),
|
||||
Node {
|
||||
id: NodeId(2),
|
||||
availability: NodeAvailability::Active,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
listen_http_addr: String::new(),
|
||||
listen_http_port: 0,
|
||||
listen_pg_addr: String::new(),
|
||||
listen_pg_port: 0,
|
||||
},
|
||||
);
|
||||
let nodes = test_utils::make_test_nodes(2);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
let mut t1_intent = IntentState::new();
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
cmp::Ordering,
|
||||
collections::{BTreeMap, HashMap, HashSet},
|
||||
str::FromStr,
|
||||
@@ -8,26 +9,28 @@ use std::{
|
||||
|
||||
use anyhow::Context;
|
||||
use control_plane::attachment_service::{
|
||||
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability,
|
||||
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse,
|
||||
TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
|
||||
};
|
||||
use diesel::result::DatabaseErrorKind;
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::controller_api::{
|
||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
|
||||
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
|
||||
TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
use pageserver_api::{
|
||||
control_api::{
|
||||
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
|
||||
ValidateResponse, ValidateResponseTenant,
|
||||
},
|
||||
models::{
|
||||
self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters,
|
||||
TenantConfig, TenantCreateRequest, TenantLocationConfigRequest,
|
||||
TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest,
|
||||
TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||
TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
|
||||
upcall_api::{
|
||||
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
|
||||
ValidateResponse, ValidateResponseTenant,
|
||||
},
|
||||
};
|
||||
use pageserver_client::mgmt_api;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -55,6 +58,11 @@ use crate::{
|
||||
PlacementPolicy, Sequence,
|
||||
};
|
||||
|
||||
// For operations that should be quick, like attaching a new tenant
|
||||
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
|
||||
// For operations that might be slow, like migrating a tenant with
|
||||
// some data in it.
|
||||
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
|
||||
/// How long [`Service::startup_reconcile`] is allowed to take before it should give
|
||||
@@ -478,8 +486,8 @@ impl Service {
|
||||
async move {
|
||||
if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
|
||||
tracing::error!(
|
||||
tenant_shard_id=%tenant_shard_id,
|
||||
node_id=%node_id,
|
||||
%tenant_shard_id,
|
||||
%node_id,
|
||||
"Failed to notify compute on startup for shard: {e}"
|
||||
);
|
||||
None
|
||||
@@ -616,7 +624,22 @@ impl Service {
|
||||
let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
tracing::info!("Loading nodes from database...");
|
||||
let nodes = persistence.list_nodes().await?;
|
||||
let nodes = persistence
|
||||
.list_nodes()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|n| Node {
|
||||
id: NodeId(n.node_id as u64),
|
||||
// At startup we consider a node offline until proven otherwise.
|
||||
availability: NodeAvailability::Offline,
|
||||
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
|
||||
.expect("Bad scheduling policy in DB"),
|
||||
listen_http_addr: n.listen_http_addr,
|
||||
listen_http_port: n.listen_http_port as u16,
|
||||
listen_pg_addr: n.listen_pg_addr,
|
||||
listen_pg_port: n.listen_pg_port as u16,
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
|
||||
tracing::info!("Loaded {} nodes from database.", nodes.len());
|
||||
|
||||
@@ -908,7 +931,16 @@ impl Service {
|
||||
pub(crate) async fn re_attach(
|
||||
&self,
|
||||
reattach_req: ReAttachRequest,
|
||||
) -> anyhow::Result<ReAttachResponse> {
|
||||
) -> Result<ReAttachResponse, ApiError> {
|
||||
// Take a re-attach as indication that the node is available: this is a precursor to proper
|
||||
// heartbeating in https://github.com/neondatabase/neon/issues/6844
|
||||
self.node_configure(NodeConfigureRequest {
|
||||
node_id: reattach_req.node_id,
|
||||
availability: Some(NodeAvailability::Active),
|
||||
scheduling: None,
|
||||
})
|
||||
.await?;
|
||||
|
||||
// Ordering: we must persist generation number updates before making them visible in the in-memory state
|
||||
let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?;
|
||||
|
||||
@@ -999,6 +1031,16 @@ impl Service {
|
||||
&self,
|
||||
create_req: TenantCreateRequest,
|
||||
) -> Result<TenantCreateResponse, ApiError> {
|
||||
let (response, waiters) = self.do_tenant_create(create_req).await?;
|
||||
|
||||
self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
pub(crate) async fn do_tenant_create(
|
||||
&self,
|
||||
create_req: TenantCreateRequest,
|
||||
) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
|
||||
// This service expects to handle sharding itself: it is an error to try and directly create
|
||||
// a particular shard here.
|
||||
let tenant_id = if !create_req.new_tenant_id.is_unsharded() {
|
||||
@@ -1148,11 +1190,12 @@ impl Service {
|
||||
(waiters, response_shards)
|
||||
};
|
||||
|
||||
self.await_waiters(waiters).await?;
|
||||
|
||||
Ok(TenantCreateResponse {
|
||||
shards: response_shards,
|
||||
})
|
||||
Ok((
|
||||
TenantCreateResponse {
|
||||
shards: response_shards,
|
||||
},
|
||||
waiters,
|
||||
))
|
||||
}
|
||||
|
||||
/// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
|
||||
@@ -1160,8 +1203,9 @@ impl Service {
|
||||
async fn await_waiters(
|
||||
&self,
|
||||
waiters: Vec<ReconcilerWaiter>,
|
||||
timeout: Duration,
|
||||
) -> Result<(), ReconcileWaitError> {
|
||||
let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap();
|
||||
let deadline = Instant::now().checked_add(timeout).unwrap();
|
||||
for waiter in waiters {
|
||||
let timeout = deadline.duration_since(Instant::now());
|
||||
waiter.wait_timeout(timeout).await?;
|
||||
@@ -1299,12 +1343,8 @@ impl Service {
|
||||
}
|
||||
};
|
||||
|
||||
// TODO: if we timeout/fail on reconcile, we should still succeed this request,
|
||||
// because otherwise a broken compute hook causes a feedback loop where
|
||||
// location_config returns 500 and gets retried forever.
|
||||
|
||||
if let Some(create_req) = maybe_create {
|
||||
let create_resp = self.tenant_create(create_req).await?;
|
||||
let waiters = if let Some(create_req) = maybe_create {
|
||||
let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
|
||||
result.shards = create_resp
|
||||
.shards
|
||||
.into_iter()
|
||||
@@ -1313,20 +1353,115 @@ impl Service {
|
||||
shard_id: s.shard_id,
|
||||
})
|
||||
.collect();
|
||||
waiters
|
||||
} else {
|
||||
// This was an update, wait for reconciliation
|
||||
if let Err(e) = self.await_waiters(waiters).await {
|
||||
// Do not treat a reconcile error as fatal: we have already applied any requested
|
||||
// Intent changes, and the reconcile can fail for external reasons like unavailable
|
||||
// compute notification API. In these cases, it is important that we do not
|
||||
// cause the cloud control plane to retry forever on this API.
|
||||
tracing::warn!(
|
||||
"Failed to reconcile after /location_config: {e}, returning success anyway"
|
||||
);
|
||||
waiters
|
||||
};
|
||||
|
||||
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
|
||||
// Do not treat a reconcile error as fatal: we have already applied any requested
|
||||
// Intent changes, and the reconcile can fail for external reasons like unavailable
|
||||
// compute notification API. In these cases, it is important that we do not
|
||||
// cause the cloud control plane to retry forever on this API.
|
||||
tracing::warn!(
|
||||
"Failed to reconcile after /location_config: {e}, returning success anyway"
|
||||
);
|
||||
}
|
||||
|
||||
// Logging the full result is useful because it lets us cross-check what the cloud control
|
||||
// plane's tenant_shards table should contain.
|
||||
tracing::info!("Complete, returning {result:?}");
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_time_travel_remote_storage(
|
||||
&self,
|
||||
time_travel_req: &TenantTimeTravelRequest,
|
||||
tenant_id: TenantId,
|
||||
timestamp: Cow<'_, str>,
|
||||
done_if_after: Cow<'_, str>,
|
||||
) -> Result<(), ApiError> {
|
||||
let node = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
// Just a sanity check to prevent misuse: the API expects that the tenant is fully
|
||||
// detached everywhere, and nothing writes to S3 storage. Here, we verify that,
|
||||
// but only at the start of the process, so it's really just to prevent operator
|
||||
// mistakes.
|
||||
for (shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
|
||||
if shard.intent.get_attached().is_some() || !shard.intent.get_secondary().is_empty()
|
||||
{
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"We want tenant to be attached in shard with tenant_shard_id={shard_id}"
|
||||
)));
|
||||
}
|
||||
let maybe_attached = shard
|
||||
.observed
|
||||
.locations
|
||||
.iter()
|
||||
.filter_map(|(node_id, observed_location)| {
|
||||
observed_location
|
||||
.conf
|
||||
.as_ref()
|
||||
.map(|loc| (node_id, observed_location, loc.mode))
|
||||
})
|
||||
.find(|(_, _, mode)| *mode != LocationConfigMode::Detached);
|
||||
if let Some((node_id, _observed_location, mode)) = maybe_attached {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}")));
|
||||
}
|
||||
}
|
||||
let scheduler = &locked.scheduler;
|
||||
// Right now we only perform the operation on a single node without parallelization
|
||||
// TODO fan out the operation to multiple nodes for better performance
|
||||
let node_id = scheduler.schedule_shard(&[])?;
|
||||
let node = locked
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while lock is active");
|
||||
node.clone()
|
||||
};
|
||||
|
||||
// The shard count is encoded in the remote storage's URL, so we need to handle all historically used shard counts
|
||||
let mut counts = time_travel_req
|
||||
.shard_counts
|
||||
.iter()
|
||||
.copied()
|
||||
.collect::<HashSet<_>>()
|
||||
.into_iter()
|
||||
.collect::<Vec<_>>();
|
||||
counts.sort_unstable();
|
||||
|
||||
for count in counts {
|
||||
let shard_ids = (0..count.count())
|
||||
.map(|i| TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(i),
|
||||
shard_count: count,
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
for tenant_shard_id in shard_ids {
|
||||
let client =
|
||||
mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
|
||||
tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);
|
||||
|
||||
client
|
||||
.tenant_time_travel_remote_storage(
|
||||
tenant_shard_id,
|
||||
×tamp,
|
||||
&done_if_after,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
|
||||
node.id
|
||||
))
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
|
||||
@@ -2209,7 +2344,11 @@ impl Service {
|
||||
.context("Scheduler checks")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let expect_nodes = locked.nodes.values().cloned().collect::<Vec<_>>();
|
||||
let expect_nodes = locked
|
||||
.nodes
|
||||
.values()
|
||||
.map(|n| n.to_persistent())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let expect_shards = locked
|
||||
.tenants
|
||||
@@ -2221,8 +2360,8 @@ impl Service {
|
||||
};
|
||||
|
||||
let mut nodes = self.persistence.list_nodes().await?;
|
||||
expect_nodes.sort_by_key(|n| n.id);
|
||||
nodes.sort_by_key(|n| n.id);
|
||||
expect_nodes.sort_by_key(|n| n.node_id);
|
||||
nodes.sort_by_key(|n| n.node_id);
|
||||
|
||||
if nodes != expect_nodes {
|
||||
tracing::error!("Consistency check failed on nodes.");
|
||||
@@ -2236,6 +2375,9 @@ impl Service {
|
||||
serde_json::to_string(&nodes)
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?
|
||||
);
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Node consistency failure"
|
||||
)));
|
||||
}
|
||||
|
||||
let mut shards = self.persistence.list_tenant_shards().await?;
|
||||
@@ -2246,14 +2388,17 @@ impl Service {
|
||||
tracing::error!("Consistency check failed on shards.");
|
||||
tracing::error!(
|
||||
"Shards in memory: {}",
|
||||
serde_json::to_string(&expect_nodes)
|
||||
serde_json::to_string(&expect_shards)
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?
|
||||
);
|
||||
tracing::error!(
|
||||
"Shards in database: {}",
|
||||
serde_json::to_string(&nodes)
|
||||
serde_json::to_string(&shards)
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?
|
||||
);
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Shard consistency failure"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -2379,7 +2524,18 @@ impl Service {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn node_configure(&self, config_req: NodeConfigureRequest) -> Result<(), ApiError> {
|
||||
pub(crate) async fn node_configure(
|
||||
&self,
|
||||
config_req: NodeConfigureRequest,
|
||||
) -> Result<(), ApiError> {
|
||||
if let Some(scheduling) = config_req.scheduling {
|
||||
// Scheduling is a persistent part of Node: we must write updates to the database before
|
||||
// applying them in memory
|
||||
self.persistence
|
||||
.update_node(config_req.node_id, scheduling)
|
||||
.await?;
|
||||
}
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let result_tx = locked.result_tx.clone();
|
||||
let compute_hook = locked.compute_hook.clone();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use crate::{metrics, persistence::TenantShardPersistence};
|
||||
use control_plane::attachment_service::NodeAvailability;
|
||||
use pageserver_api::controller_api::NodeAvailability;
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
@@ -143,6 +143,23 @@ impl IntentState {
|
||||
}
|
||||
}
|
||||
|
||||
/// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from
|
||||
/// secondary to attached while maintaining the scheduler's reference counts.
|
||||
pub(crate) fn promote_attached(
|
||||
&mut self,
|
||||
_scheduler: &mut Scheduler,
|
||||
promote_secondary: NodeId,
|
||||
) {
|
||||
// If we call this with a node that isn't in secondary, it would cause incorrect
|
||||
// scheduler reference counting, since we assume the node is already referenced as a secondary.
|
||||
debug_assert!(self.secondary.contains(&promote_secondary));
|
||||
|
||||
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
|
||||
// need to call into it here.
|
||||
self.secondary.retain(|n| n != &promote_secondary);
|
||||
self.attached = Some(promote_secondary);
|
||||
}
|
||||
|
||||
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
|
||||
debug_assert!(!self.secondary.contains(&new_secondary));
|
||||
scheduler.node_inc_ref(new_secondary);
|
||||
@@ -197,6 +214,8 @@ impl IntentState {
|
||||
/// Returns true if a change was made
|
||||
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
|
||||
if self.attached == Some(node_id) {
|
||||
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
|
||||
// need to call into it here.
|
||||
self.attached = None;
|
||||
self.secondary.push(node_id);
|
||||
true
|
||||
@@ -370,6 +389,9 @@ impl TenantState {
|
||||
// All remaining observed locations generate secondary intents. This includes None
|
||||
// observations, as these may well have some local content on disk that is usable (this
|
||||
// is an edge case that might occur if we restarted during a migration or other change)
|
||||
//
|
||||
// We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`]
|
||||
// will take care of promoting one of these secondaries to be attached.
|
||||
self.observed.locations.keys().for_each(|node_id| {
|
||||
if Some(*node_id) != self.intent.attached {
|
||||
self.intent.secondary.push(*node_id);
|
||||
@@ -377,6 +399,33 @@ impl TenantState {
|
||||
});
|
||||
}
|
||||
|
||||
/// Part of [`Self::schedule`] that is used to choose exactly one node to act as the
|
||||
/// attached pageserver for a shard.
|
||||
///
|
||||
/// Returns whether we modified it, and the NodeId selected.
|
||||
fn schedule_attached(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
) -> Result<(bool, NodeId), ScheduleError> {
|
||||
// No work to do if we already have an attached tenant
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
return Ok((false, node_id));
|
||||
}
|
||||
|
||||
if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) {
|
||||
// Promote a secondary
|
||||
tracing::debug!("Promoted secondary {} to attached", promote_secondary);
|
||||
self.intent.promote_attached(scheduler, promote_secondary);
|
||||
Ok((true, promote_secondary))
|
||||
} else {
|
||||
// Pick a fresh node: either we had no secondaries or none were schedulable
|
||||
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
|
||||
tracing::debug!("Selected {} as attached", node_id);
|
||||
self.intent.set_attached(scheduler, Some(node_id));
|
||||
Ok((true, node_id))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
|
||||
// TODO: before scheduling new nodes, check if any existing content in
|
||||
// self.intent refers to pageservers that are offline, and pick other
|
||||
@@ -387,19 +436,15 @@ impl TenantState {
|
||||
|
||||
// Build the set of pageservers already in use by this tenant, to avoid scheduling
|
||||
// more work on the same pageservers we're already using.
|
||||
let mut used_pageservers = self.intent.all_pageservers();
|
||||
let mut modified = false;
|
||||
|
||||
use PlacementPolicy::*;
|
||||
match self.policy {
|
||||
Single => {
|
||||
// Should have exactly one attached, and zero secondaries
|
||||
if self.intent.attached.is_none() {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.set_attached(scheduler, Some(node_id));
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
@@ -407,13 +452,10 @@ impl TenantState {
|
||||
}
|
||||
Double(secondary_count) => {
|
||||
// Should have exactly one attached, and N secondaries
|
||||
if self.intent.attached.is_none() {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.set_attached(scheduler, Some(node_id));
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
let mut used_pageservers = vec![attached_node_id];
|
||||
while self.intent.secondary.len() < secondary_count {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
@@ -495,6 +537,13 @@ impl TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
for node_id in self.observed.locations.keys() {
|
||||
if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) {
|
||||
// We have observed state that isn't part of our intent: need to clean it up.
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Even if there is no pageserver work to be done, if we have a pending notification to computes,
|
||||
// wake up a reconciler to send it.
|
||||
if self.pending_compute_notification {
|
||||
@@ -688,10 +737,95 @@ impl TenantState {
|
||||
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
|
||||
shard_stripe_size: self.shard.stripe_size.0 as i32,
|
||||
generation: self.generation.into().unwrap_or(0) as i32,
|
||||
generation_pageserver: i64::MAX,
|
||||
generation_pageserver: self
|
||||
.intent
|
||||
.get_attached()
|
||||
.map(|n| n.0 as i64)
|
||||
.unwrap_or(i64::MAX),
|
||||
|
||||
placement_policy: serde_json::to_string(&self.policy).unwrap(),
|
||||
config: serde_json::to_string(&self.config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::scheduler::test_utils::make_test_nodes;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
|
||||
let tenant_id = TenantId::generate();
|
||||
let shard_number = ShardNumber(0);
|
||||
let shard_count = ShardCount::new(1);
|
||||
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
};
|
||||
TenantState::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::new(
|
||||
shard_number,
|
||||
shard_count,
|
||||
pageserver_api::shard::ShardStripeSize(32768),
|
||||
)
|
||||
.unwrap(),
|
||||
policy,
|
||||
)
|
||||
}
|
||||
|
||||
/// Test the scheduling behaviors used when a tenant configured for HA is subject
|
||||
/// to nodes being marked offline.
|
||||
#[test]
|
||||
fn tenant_ha_scheduling() -> anyhow::Result<()> {
|
||||
// Start with three nodes. Our tenant will only use two. The third one is
|
||||
// expected to remain unused.
|
||||
let mut nodes = make_test_nodes(3);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
||||
tenant_state
|
||||
.schedule(&mut scheduler)
|
||||
.expect("we have enough nodes, scheduling should work");
|
||||
|
||||
// Expect to initially be schedule on to different nodes
|
||||
assert_eq!(tenant_state.intent.secondary.len(), 1);
|
||||
assert!(tenant_state.intent.attached.is_some());
|
||||
|
||||
let attached_node_id = tenant_state.intent.attached.unwrap();
|
||||
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
|
||||
assert_ne!(attached_node_id, secondary_node_id);
|
||||
|
||||
// Notifying the attached node is offline should demote it to a secondary
|
||||
let changed = tenant_state.intent.notify_offline(attached_node_id);
|
||||
assert!(changed);
|
||||
|
||||
// Update the scheduler state to indicate the node is offline
|
||||
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
|
||||
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
|
||||
|
||||
// Scheduling the node should promote the still-available secondary node to attached
|
||||
tenant_state
|
||||
.schedule(&mut scheduler)
|
||||
.expect("active nodes are available");
|
||||
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
|
||||
|
||||
// The original attached node should have been retained as a secondary
|
||||
assert_eq!(
|
||||
*tenant_state.intent.secondary.iter().last().unwrap(),
|
||||
attached_node_id
|
||||
);
|
||||
|
||||
tenant_state.intent.clear(&mut scheduler);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,8 +2,12 @@ use crate::{background_process, local_env::LocalEnv};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hyper::Method;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
},
|
||||
models::{
|
||||
ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
@@ -11,12 +15,12 @@ use pageserver_api::{
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::str::FromStr;
|
||||
use std::{fs, str::FromStr};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
auth::{encode_from_key_file, Claims, Scope},
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
|
||||
@@ -24,7 +28,7 @@ pub struct AttachmentService {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: Utf8PathBuf,
|
||||
jwt_token: Option<String>,
|
||||
private_key: Option<Vec<u8>>,
|
||||
public_key: Option<String>,
|
||||
postgres_port: u16,
|
||||
client: reqwest::Client,
|
||||
@@ -55,126 +59,6 @@ pub struct InspectResponse {
|
||||
pub attachment: Option<(u32, NodeId)>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
pub generation: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponse {
|
||||
pub shards: Vec<TenantCreateResponseShard>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeRegisterRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeConfigureRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub availability: Option<NodeAvailability>,
|
||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantLocateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantLocateResponse {
|
||||
pub shards: Vec<TenantLocateResponseShard>,
|
||||
pub shard_params: ShardParameters,
|
||||
}
|
||||
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active,
|
||||
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
||||
// secondary locations on this node still exist. Newly added nodes are in this
|
||||
// state until we successfully contact them.
|
||||
Offline,
|
||||
}
|
||||
|
||||
impl FromStr for NodeAvailability {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"offline" => Ok(Self::Offline),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
|
||||
/// type needs to be defined with diesel traits in there.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
Filling,
|
||||
Pause,
|
||||
Draining,
|
||||
}
|
||||
|
||||
impl FromStr for NodeSchedulingPolicy {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"filling" => Ok(Self::Filling),
|
||||
"pause" => Ok(Self::Pause),
|
||||
"draining" => Ok(Self::Draining),
|
||||
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NodeSchedulingPolicy> for String {
|
||||
fn from(value: NodeSchedulingPolicy) -> String {
|
||||
use NodeSchedulingPolicy::*;
|
||||
match value {
|
||||
Active => "active",
|
||||
Filling => "filling",
|
||||
Pause => "pause",
|
||||
Draining => "draining",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateResponse {}
|
||||
|
||||
impl AttachmentService {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
||||
@@ -204,12 +88,11 @@ impl AttachmentService {
|
||||
.pageservers
|
||||
.first()
|
||||
.expect("Config is validated to contain at least one pageserver");
|
||||
let (jwt_token, public_key) = match ps_conf.http_auth_type {
|
||||
let (private_key, public_key) = match ps_conf.http_auth_type {
|
||||
AuthType::Trust => (None, None),
|
||||
AuthType::NeonJWT => {
|
||||
let jwt_token = env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
|
||||
.unwrap();
|
||||
let private_key_path = env.get_private_key_path();
|
||||
let private_key = fs::read(private_key_path).expect("failed to read private key");
|
||||
|
||||
// If pageserver auth is enabled, this implicitly enables auth for this service,
|
||||
// using the same credentials.
|
||||
@@ -235,7 +118,7 @@ impl AttachmentService {
|
||||
} else {
|
||||
std::fs::read_to_string(&public_key_path).expect("Can't read public key")
|
||||
};
|
||||
(Some(jwt_token), Some(public_key))
|
||||
(Some(private_key), Some(public_key))
|
||||
}
|
||||
};
|
||||
|
||||
@@ -243,7 +126,7 @@ impl AttachmentService {
|
||||
env: env.clone(),
|
||||
path,
|
||||
listen,
|
||||
jwt_token,
|
||||
private_key,
|
||||
public_key,
|
||||
postgres_port,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
@@ -397,7 +280,10 @@ impl AttachmentService {
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
if let Some(private_key) = &self.private_key {
|
||||
let claims = Claims::new(None, Scope::PageServerApi);
|
||||
let jwt_token =
|
||||
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
|
||||
args.push(format!("--jwt-token={jwt_token}"));
|
||||
}
|
||||
|
||||
@@ -422,7 +308,7 @@ impl AttachmentService {
|
||||
)],
|
||||
background_process::InitialPidFile::Create(self.pid_file()),
|
||||
|| async {
|
||||
match self.status().await {
|
||||
match self.ready().await {
|
||||
Ok(_) => Ok(true),
|
||||
Err(_) => Ok(false),
|
||||
}
|
||||
@@ -468,6 +354,20 @@ impl AttachmentService {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
|
||||
let category = match path.find('/') {
|
||||
Some(idx) => &path[..idx],
|
||||
None => path,
|
||||
};
|
||||
|
||||
match category {
|
||||
"status" | "ready" => Ok(None),
|
||||
"control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))),
|
||||
"v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))),
|
||||
_ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple HTTP request wrapper for calling into attachment service
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
@@ -493,11 +393,16 @@ impl AttachmentService {
|
||||
if let Some(body) = body {
|
||||
builder = builder.json(&body)
|
||||
}
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
builder = builder.header(
|
||||
reqwest::header::AUTHORIZATION,
|
||||
format!("Bearer {jwt_token}"),
|
||||
);
|
||||
if let Some(private_key) = &self.private_key {
|
||||
println!("Getting claims for path {}", path);
|
||||
if let Some(required_claims) = Self::get_claims_for_path(&path)? {
|
||||
println!("Got claims {:?} for path {}", required_claims, path);
|
||||
let jwt_token = encode_from_key_file(&required_claims, private_key)?;
|
||||
builder = builder.header(
|
||||
reqwest::header::AUTHORIZATION,
|
||||
format!("Bearer {jwt_token}"),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let response = builder.send().await?;
|
||||
@@ -617,8 +522,8 @@ impl AttachmentService {
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn status(&self) -> anyhow::Result<()> {
|
||||
self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
|
||||
pub async fn ready(&self) -> anyhow::Result<()> {
|
||||
self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
|
||||
.await
|
||||
}
|
||||
|
||||
|
||||
@@ -8,14 +8,15 @@
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::attachment_service::{
|
||||
AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
|
||||
};
|
||||
use control_plane::attachment_service::AttachmentService;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::local_env::{InitForceMode, LocalEnv};
|
||||
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::controller_api::{
|
||||
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
|
||||
@@ -590,6 +590,7 @@ impl Endpoint {
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
primary_is_running: None,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
|
||||
@@ -412,14 +412,17 @@ impl LocalEnv {
|
||||
|
||||
// this function is used only for testing purposes in CLI e g generate tokens during init
|
||||
pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
|
||||
let private_key_path = if self.private_key_path.is_absolute() {
|
||||
let private_key_path = self.get_private_key_path();
|
||||
let key_data = fs::read(private_key_path)?;
|
||||
encode_from_key_file(claims, &key_data)
|
||||
}
|
||||
|
||||
pub fn get_private_key_path(&self) -> PathBuf {
|
||||
if self.private_key_path.is_absolute() {
|
||||
self.private_key_path.to_path_buf()
|
||||
} else {
|
||||
self.base_data_dir.join(&self.private_key_path)
|
||||
};
|
||||
|
||||
let key_data = fs::read(private_key_path)?;
|
||||
encode_from_key_file(claims, &key_data)
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
@@ -17,6 +17,7 @@ use std::time::Duration;
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::controller_api::NodeRegisterRequest;
|
||||
use pageserver_api::models::{
|
||||
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||
};
|
||||
@@ -30,7 +31,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
|
||||
use crate::attachment_service::AttachmentService;
|
||||
use crate::local_env::PageServerConf;
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
|
||||
@@ -115,7 +116,7 @@ impl PageServerNode {
|
||||
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
|
||||
let jwt_token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
|
||||
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
|
||||
.unwrap();
|
||||
overrides.push(format!("control_plane_api_token='{}'", jwt_token));
|
||||
}
|
||||
@@ -210,6 +211,25 @@ impl PageServerNode {
|
||||
update_config: bool,
|
||||
register: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
// Register the node with the storage controller before starting pageserver: pageserver must be registered to
|
||||
// successfully call /re-attach and finish starting up.
|
||||
if register {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
attachment_service
|
||||
.node_register(NodeRegisterRequest {
|
||||
node_id: self.conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
@@ -248,23 +268,6 @@ impl PageServerNode {
|
||||
)
|
||||
.await?;
|
||||
|
||||
if register {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
attachment_service
|
||||
.node_register(NodeRegisterRequest {
|
||||
node_id: self.conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -350,6 +353,11 @@ impl PageServerNode {
|
||||
.remove("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
compaction_algorithm: settings
|
||||
.remove("compaction_algorithm")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_algorithm' json")?,
|
||||
gc_horizon: settings
|
||||
.remove("gc_horizon")
|
||||
.map(|x| x.parse::<u64>())
|
||||
@@ -389,11 +397,6 @@ impl PageServerNode {
|
||||
evictions_low_residence_duration_metric_threshold: settings
|
||||
.remove("evictions_low_residence_duration_metric_threshold")
|
||||
.map(|x| x.to_string()),
|
||||
gc_feedback: settings
|
||||
.remove("gc_feedback")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
|
||||
lazy_slru_download: settings
|
||||
.remove("lazy_slru_download")
|
||||
@@ -458,6 +461,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_threshold' as an integer")?,
|
||||
compaction_algorithm: settings
|
||||
.remove("compactin_algorithm")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_algorithm' json")?,
|
||||
gc_horizon: settings
|
||||
.remove("gc_horizon")
|
||||
.map(|x| x.parse::<u64>())
|
||||
@@ -499,11 +507,6 @@ impl PageServerNode {
|
||||
evictions_low_residence_duration_metric_threshold: settings
|
||||
.remove("evictions_low_residence_duration_metric_threshold")
|
||||
.map(|x| x.to_string()),
|
||||
gc_feedback: settings
|
||||
.remove("gc_feedback")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
|
||||
lazy_slru_download: settings
|
||||
.remove("lazy_slru_download")
|
||||
|
||||
@@ -70,6 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
|
||||
Should only be used e.g. for status check.
|
||||
Currently also used for connection from any pageserver to any safekeeper.
|
||||
|
||||
"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
|
||||
|
||||
"admin": Provides access to the control plane and admin APIs of the attachment service.
|
||||
|
||||
### CLI
|
||||
CLI generates a key pair during call to `neon_local init` with the following commands:
|
||||
|
||||
@@ -79,6 +79,12 @@ pub struct ComputeSpec {
|
||||
// Stripe size for pageserver sharding, in pages
|
||||
#[serde(default)]
|
||||
pub shard_stripe_size: Option<usize>,
|
||||
|
||||
// When we are starting a new replica in hot standby mode,
|
||||
// we need to know if the primary is running.
|
||||
// This is used to determine if replica should wait for
|
||||
// RUNNING_XACTS from primary or not.
|
||||
pub primary_is_running: Option<bool>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
|
||||
@@ -18,6 +18,7 @@ enum-map.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
thiserror.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
chrono.workspace = true
|
||||
|
||||
129
libs/pageserver_api/src/controller_api.rs
Normal file
129
libs/pageserver_api/src/controller_api.rs
Normal file
@@ -0,0 +1,129 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
/// Request/response types for the storage controller
|
||||
/// API (`/control/v1` prefix). Implemented by the server
|
||||
/// in [`attachment_service::http`]
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{models::ShardParameters, shard::TenantShardId};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
pub generation: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponse {
|
||||
pub shards: Vec<TenantCreateResponseShard>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeRegisterRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeConfigureRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub availability: Option<NodeAvailability>,
|
||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantLocateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantLocateResponse {
|
||||
pub shards: Vec<TenantLocateResponseShard>,
|
||||
pub shard_params: ShardParameters,
|
||||
}
|
||||
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active,
|
||||
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
||||
// secondary locations on this node still exist. Newly added nodes are in this
|
||||
// state until we successfully contact them.
|
||||
Offline,
|
||||
}
|
||||
|
||||
impl FromStr for NodeAvailability {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"offline" => Ok(Self::Offline),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
|
||||
/// type needs to be defined with diesel traits in there.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
Filling,
|
||||
Pause,
|
||||
Draining,
|
||||
}
|
||||
|
||||
impl FromStr for NodeSchedulingPolicy {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"filling" => Ok(Self::Filling),
|
||||
"pause" => Ok(Self::Pause),
|
||||
"draining" => Ok(Self::Draining),
|
||||
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NodeSchedulingPolicy> for String {
|
||||
fn from(value: NodeSchedulingPolicy) -> String {
|
||||
use NodeSchedulingPolicy::*;
|
||||
match value {
|
||||
Active => "active",
|
||||
Filling => "filling",
|
||||
Pause => "pause",
|
||||
Draining => "draining",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateResponse {}
|
||||
@@ -307,6 +307,7 @@ impl KeySpaceRandomAccum {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
|
||||
let start = key_range.start;
|
||||
let end = key_range.end;
|
||||
|
||||
@@ -2,13 +2,14 @@
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use const_format::formatcp;
|
||||
|
||||
/// Public API types
|
||||
pub mod control_api;
|
||||
pub mod controller_api;
|
||||
pub mod key;
|
||||
pub mod keyspace;
|
||||
pub mod models;
|
||||
pub mod reltag;
|
||||
pub mod shard;
|
||||
/// Public API types
|
||||
pub mod upcall_api;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
pub mod partitioning;
|
||||
pub mod utilization;
|
||||
|
||||
pub use utilization::PageserverUtilization;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
@@ -269,6 +272,8 @@ pub struct TenantConfig {
|
||||
pub compaction_target_size: Option<u64>,
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
// defer parsing compaction_algorithm, like eviction_policy
|
||||
pub compaction_algorithm: Option<CompactionAlgorithm>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
@@ -280,7 +285,6 @@ pub struct TenantConfig {
|
||||
pub eviction_policy: Option<EvictionPolicy>,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
||||
pub gc_feedback: Option<bool>,
|
||||
pub heatmap_period: Option<String>,
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
@@ -291,6 +295,7 @@ pub struct TenantConfig {
|
||||
pub enum EvictionPolicy {
|
||||
NoEviction,
|
||||
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
|
||||
OnlyImitiate(EvictionPolicyLayerAccessThreshold),
|
||||
}
|
||||
|
||||
impl EvictionPolicy {
|
||||
@@ -298,10 +303,18 @@ impl EvictionPolicy {
|
||||
match self {
|
||||
EvictionPolicy::NoEviction => "NoEviction",
|
||||
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
|
||||
EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum CompactionAlgorithm {
|
||||
Legacy,
|
||||
Tiered,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct EvictionPolicyLayerAccessThreshold {
|
||||
#[serde(with = "humantime_serde")]
|
||||
@@ -335,14 +348,14 @@ impl ThrottleConfig {
|
||||
}
|
||||
/// The requests per second allowed by the given config.
|
||||
pub fn steady_rps(&self) -> f64 {
|
||||
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3
|
||||
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
|
||||
}
|
||||
}
|
||||
|
||||
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
|
||||
/// lists out all possible states (and the virtual "Detached" state)
|
||||
/// in a flat form rather than using rust-style enums.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum LocationConfigMode {
|
||||
AttachedSingle,
|
||||
AttachedMulti,
|
||||
@@ -406,6 +419,12 @@ pub struct TenantLocationConfigRequest {
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantTimeTravelRequest {
|
||||
pub shard_counts: Vec<ShardCount>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantShardLocation {
|
||||
|
||||
70
libs/pageserver_api/src/models/utilization.rs
Normal file
70
libs/pageserver_api/src/models/utilization.rs
Normal file
@@ -0,0 +1,70 @@
|
||||
use std::time::SystemTime;
|
||||
|
||||
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
|
||||
/// the next tenant.
|
||||
///
|
||||
/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth.
|
||||
///
|
||||
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
|
||||
/// not handle full u64 values properly.
|
||||
#[derive(serde::Serialize, Debug)]
|
||||
pub struct PageserverUtilization {
|
||||
/// Used disk space
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
pub disk_usage_bytes: u64,
|
||||
/// Free disk space
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
pub free_space_bytes: u64,
|
||||
/// Lower is better score for how good candidate for a next tenant would this pageserver be.
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
pub utilization_score: u64,
|
||||
/// When was this snapshot captured, pageserver local time.
|
||||
///
|
||||
/// Use millis to give confidence that the value is regenerated often enough.
|
||||
#[serde(serialize_with = "ser_rfc3339_millis")]
|
||||
pub captured_at: SystemTime,
|
||||
}
|
||||
|
||||
fn ser_rfc3339_millis<S: serde::Serializer>(
|
||||
ts: &SystemTime,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error> {
|
||||
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
||||
}
|
||||
|
||||
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
||||
///
|
||||
/// Instead of newtype, use this because a newtype would get require handling deserializing values
|
||||
/// with the highest bit set which is properly parsed by serde formats, but would create a
|
||||
/// conundrum on how to handle and again serialize such values at type level. It will be a few
|
||||
/// years until we can use more than `i64::MAX` bytes on a disk.
|
||||
fn ser_saturating_u63<S: serde::Serializer>(value: &u64, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
const MAX_FORMAT_INT64: u64 = i64::MAX as u64;
|
||||
|
||||
let value = (*value).min(MAX_FORMAT_INT64);
|
||||
|
||||
serializer.serialize_u64(value)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::time::Duration;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn u64_max_is_serialized_as_u63_max() {
|
||||
let doc = PageserverUtilization {
|
||||
disk_usage_bytes: u64::MAX,
|
||||
free_space_bytes: 0,
|
||||
utilization_score: u64::MAX,
|
||||
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
|
||||
};
|
||||
|
||||
let s = serde_json::to_string(&doc).unwrap();
|
||||
|
||||
let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
|
||||
|
||||
assert_eq!(s, expected);
|
||||
}
|
||||
}
|
||||
@@ -80,6 +80,9 @@ pub const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
|
||||
pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
|
||||
|
||||
// From standbydefs.h
|
||||
pub const XLOG_RUNNING_XACTS: u8 = 0x10;
|
||||
|
||||
// From srlu.h
|
||||
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
|
||||
pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;
|
||||
|
||||
@@ -119,11 +119,6 @@ pub fn generate_pg_control(
|
||||
// Generate new pg_control needed for bootstrap
|
||||
checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
|
||||
|
||||
//reset some fields we don't want to preserve
|
||||
//TODO Check this.
|
||||
//We may need to determine the value from twophase data.
|
||||
checkpoint.oldestActiveXid = 0;
|
||||
|
||||
//save new values in pg_control
|
||||
pg_control.checkPoint = 0;
|
||||
pg_control.checkPointCopy = checkpoint;
|
||||
|
||||
@@ -44,6 +44,26 @@ impl DownloadError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for DownloadError {
|
||||
fn from(value: std::io::Error) -> Self {
|
||||
let needs_unwrap = value.kind() == std::io::ErrorKind::Other
|
||||
&& value
|
||||
.get_ref()
|
||||
.and_then(|x| x.downcast_ref::<DownloadError>())
|
||||
.is_some();
|
||||
|
||||
if needs_unwrap {
|
||||
*value
|
||||
.into_inner()
|
||||
.expect("just checked")
|
||||
.downcast::<DownloadError>()
|
||||
.expect("just checked")
|
||||
} else {
|
||||
DownloadError::Other(value.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum TimeTravelError {
|
||||
/// Validation or other error happened due to user input.
|
||||
@@ -142,13 +162,12 @@ impl std::fmt::Display for TimeoutOrCancel {
|
||||
impl std::error::Error for TimeoutOrCancel {}
|
||||
|
||||
impl TimeoutOrCancel {
|
||||
pub fn caused(error: &anyhow::Error) -> Option<&Self> {
|
||||
error.root_cause().downcast_ref()
|
||||
}
|
||||
|
||||
/// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`].
|
||||
pub fn caused_by_cancel(error: &anyhow::Error) -> bool {
|
||||
Self::caused(error).is_some_and(Self::is_cancel)
|
||||
error
|
||||
.root_cause()
|
||||
.downcast_ref::<Self>()
|
||||
.is_some_and(Self::is_cancel)
|
||||
}
|
||||
|
||||
pub fn is_cancel(&self) -> bool {
|
||||
|
||||
@@ -73,6 +73,8 @@ where
|
||||
if !*this.hit {
|
||||
if let Poll::Ready(e) = this.cancellation.poll(cx) {
|
||||
*this.hit = true;
|
||||
|
||||
// most likely this will be a std::io::Error wrapping a DownloadError
|
||||
let e = Err(std::io::Error::from(e));
|
||||
return Poll::Ready(Some(e));
|
||||
}
|
||||
@@ -130,6 +132,8 @@ mod tests {
|
||||
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
||||
"{inner:?}"
|
||||
);
|
||||
let e = DownloadError::from(e);
|
||||
assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
|
||||
|
||||
tokio::select! {
|
||||
_ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
|
||||
@@ -146,7 +150,7 @@ mod tests {
|
||||
let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
|
||||
// because the stream uses 120s timeout we are paused, we advance to 120s right away.
|
||||
// because the stream uses 120s timeout and we are paused, we advance to 120s right away.
|
||||
let first = stream.next();
|
||||
|
||||
let e = first.await.expect("there must be some").unwrap_err();
|
||||
@@ -158,6 +162,8 @@ mod tests {
|
||||
.is_some_and(|e| matches!(e, DownloadError::Timeout)),
|
||||
"{inner:?}"
|
||||
);
|
||||
let e = DownloadError::from(e);
|
||||
assert!(matches!(e, DownloadError::Timeout), "{e:?}");
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
|
||||
@@ -32,6 +32,8 @@ pub enum Scope {
|
||||
// The scope used by pageservers in upcalls to storage controller and cloud control plane
|
||||
#[serde(rename = "generations_api")]
|
||||
GenerationsApi,
|
||||
// Allows access to control plane managment API and some storage controller endpoints.
|
||||
Admin,
|
||||
}
|
||||
|
||||
/// JWT payload. See docs/authentication.md for the format
|
||||
@@ -204,12 +206,11 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
// "scope": "tenant",
|
||||
// "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
|
||||
// "iss": "neon.controlplane",
|
||||
// "exp": 1709200879,
|
||||
// "iat": 1678442479
|
||||
// }
|
||||
// ```
|
||||
//
|
||||
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
|
||||
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw";
|
||||
|
||||
// Check it can be validated with the public key
|
||||
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
fs::{self, File},
|
||||
io,
|
||||
io::{self, Write},
|
||||
};
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
@@ -161,6 +161,48 @@ pub async fn durable_rename(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`].
|
||||
///
|
||||
/// The file is first written to the specified `tmp_path`, and in a second
|
||||
/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
|
||||
/// and atomic rename guarantee that, if we crash at any point, there will never
|
||||
/// be a partially written file at `final_path` (but maybe at `tmp_path`).
|
||||
///
|
||||
/// Callers are responsible for serializing calls of this function for a given `final_path`.
|
||||
/// If they don't, there may be an error due to conflicting `tmp_path`, or there will
|
||||
/// be no error and the content of `final_path` will be the "winner" caller's `content`.
|
||||
/// I.e., the atomticity guarantees still hold.
|
||||
pub fn overwrite(
|
||||
final_path: &Utf8Path,
|
||||
tmp_path: &Utf8Path,
|
||||
content: &[u8],
|
||||
) -> std::io::Result<()> {
|
||||
let Some(final_path_parent) = final_path.parent() else {
|
||||
return Err(std::io::Error::from_raw_os_error(
|
||||
nix::errno::Errno::EINVAL as i32,
|
||||
));
|
||||
};
|
||||
std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
|
||||
let mut file = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
// Use `create_new` so that, if we race with ourselves or something else,
|
||||
// we bail out instead of causing damage.
|
||||
.create_new(true)
|
||||
.open(tmp_path)?;
|
||||
file.write_all(content)?;
|
||||
file.sync_all()?;
|
||||
drop(file); // don't keep the fd open for longer than we have to
|
||||
|
||||
std::fs::rename(tmp_path, final_path)?;
|
||||
|
||||
let final_parent_dirfd = std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(final_path_parent)?;
|
||||
|
||||
final_parent_dirfd.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
@@ -73,6 +73,7 @@ url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_compaction.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
pq_proto.workspace = true
|
||||
|
||||
@@ -217,6 +217,20 @@ impl Client {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn tenant_time_travel_remote_storage(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timestamp: &str,
|
||||
done_if_after: &str,
|
||||
) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
self.request(Method::PUT, &uri, ()).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
|
||||
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
|
||||
self.request(Method::PUT, &uri, req).await?;
|
||||
|
||||
54
pageserver/compaction/Cargo.toml
Normal file
54
pageserver/compaction/Cargo.toml
Normal file
@@ -0,0 +1,54 @@
|
||||
[package]
|
||||
name = "pageserver_compaction"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
const_format.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
crossbeam-utils.workspace = true
|
||||
either.workspace = true
|
||||
flate2.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
itertools.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
rand.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
svg_fmt.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
||||
51
pageserver/compaction/TODO.md
Normal file
51
pageserver/compaction/TODO.md
Normal file
@@ -0,0 +1,51 @@
|
||||
# TODO
|
||||
|
||||
- If the key space can be perfectly partitioned at some key, perform planning on each
|
||||
partition separately. For example, if we are compacting a level with layers like this:
|
||||
|
||||
```
|
||||
:
|
||||
+--+ +----+ : +------+
|
||||
| | | | : | |
|
||||
+--+ +----+ : +------+
|
||||
:
|
||||
+-----+ +-+ : +--------+
|
||||
| | | | : | |
|
||||
+-----+ +-+ : +--------+
|
||||
:
|
||||
```
|
||||
|
||||
At the dotted line, there is a natural split in the key space, such that all
|
||||
layers are either on the left or the right of it. We can compact the
|
||||
partitions separately. We could choose to create image layers for one
|
||||
partition but not the other one, for example.
|
||||
|
||||
- All the layers don't have to be exactly the same size, we can choose to cut a
|
||||
layer short or stretch it a little larger than the target size, if it helps
|
||||
the overall system. We can help perfect partitions (see previous bullet point)
|
||||
to happen more frequently, by choosing the cut points wisely. For example, try
|
||||
to cut layers at boundaries of underlying image layers. And "snap to grid",
|
||||
i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.
|
||||
|
||||
- Avoid rewriting layers when we'd just create an identical layer to an input
|
||||
layer.
|
||||
|
||||
- Parallelism. The code is already split up into planning and execution, so that
|
||||
we first split up the compaction work into "Jobs", and then execute them.
|
||||
It would be straightforward to execute multiple jobs in parallel.
|
||||
|
||||
- Materialize extra pages in delta layers during compaction. This would reduce
|
||||
read amplification. There has been the idea of partial image layers. Materializing
|
||||
extra pages in the delta layers achieve the same goal, without introducing a new
|
||||
concept.
|
||||
|
||||
## Simulator
|
||||
|
||||
- Expand the simulator for more workloads
|
||||
- Automate a test suite that runs the simluator with different workloads and
|
||||
spits out a table of results
|
||||
- Model read amplification
|
||||
- More sanity checking. One idea is to keep a reference count of each
|
||||
MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
|
||||
a MockRecord that is newer than PITR horizon is completely dropped. That would
|
||||
indicate that the record was lost.
|
||||
214
pageserver/compaction/src/bin/compaction-simulator.rs
Normal file
214
pageserver/compaction/src/bin/compaction-simulator.rs
Normal file
@@ -0,0 +1,214 @@
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_compaction::simulator::MockTimeline;
|
||||
use rand::Rng;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use utils::project_git_version;
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(
|
||||
version = GIT_VERSION,
|
||||
about = "Neon Pageserver compaction simulator",
|
||||
long_about = "A developer tool to visualize and test compaction"
|
||||
)]
|
||||
#[command(propagate_version = true)]
|
||||
struct CliOpts {
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
RunSuite,
|
||||
Simulate(SimulateCmd),
|
||||
}
|
||||
|
||||
#[derive(Clone, clap::ValueEnum)]
|
||||
enum Distribution {
|
||||
Uniform,
|
||||
HotCold,
|
||||
}
|
||||
|
||||
/// Read and update pageserver metadata file
|
||||
#[derive(Parser)]
|
||||
struct SimulateCmd {
|
||||
distribution: Distribution,
|
||||
|
||||
/// Number of records to digest
|
||||
num_records: u64,
|
||||
/// Record length
|
||||
record_len: u64,
|
||||
|
||||
// Logical database size in MB
|
||||
logical_size: u64,
|
||||
}
|
||||
|
||||
async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
|
||||
let mut executor = MockTimeline::new();
|
||||
|
||||
// Convert the logical size in MB into a key range.
|
||||
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
|
||||
//let key_range = u64::MIN..u64::MAX;
|
||||
println!(
|
||||
"starting simulation with key range {:016X}-{:016X}",
|
||||
key_range.start, key_range.end
|
||||
);
|
||||
|
||||
// helper function to print progress indicator
|
||||
let print_progress = |i| -> anyhow::Result<()> {
|
||||
if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 {
|
||||
print!(
|
||||
"\ringested {} / {} records, {} MiB / {} MiB...",
|
||||
i + 1,
|
||||
cmd.num_records,
|
||||
(i + 1) * cmd.record_len / (1_000_000),
|
||||
cmd.num_records * cmd.record_len / (1_000_000),
|
||||
);
|
||||
std::io::stdout().flush()?;
|
||||
}
|
||||
Ok(())
|
||||
};
|
||||
|
||||
match cmd.distribution {
|
||||
Distribution::Uniform => {
|
||||
for i in 0..cmd.num_records {
|
||||
executor.ingest_uniform(1, cmd.record_len, &key_range)?;
|
||||
executor.compact_if_needed().await?;
|
||||
|
||||
print_progress(i)?;
|
||||
}
|
||||
}
|
||||
Distribution::HotCold => {
|
||||
let splitpoint = key_range.start + (key_range.end - key_range.start) / 10;
|
||||
let hot_key_range = 0..splitpoint;
|
||||
let cold_key_range = splitpoint..key_range.end;
|
||||
|
||||
for i in 0..cmd.num_records {
|
||||
let chosen_range = if rand::thread_rng().gen_bool(0.9) {
|
||||
&hot_key_range
|
||||
} else {
|
||||
&cold_key_range
|
||||
};
|
||||
executor.ingest_uniform(1, cmd.record_len, chosen_range)?;
|
||||
executor.compact_if_needed().await?;
|
||||
|
||||
print_progress(i)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
println!("done!");
|
||||
executor.flush_l0();
|
||||
executor.compact_if_needed().await?;
|
||||
let stats = executor.stats()?;
|
||||
|
||||
// Print the stats to stdout, and also to a file
|
||||
print!("{stats}");
|
||||
std::fs::write(results_path.join("stats.txt"), stats)?;
|
||||
|
||||
let animation_path = results_path.join("compaction-animation.html");
|
||||
executor.draw_history(std::fs::File::create(&animation_path)?)?;
|
||||
println!(
|
||||
"animation: file://{}",
|
||||
animation_path.canonicalize()?.display()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> {
|
||||
std::fs::create_dir(results_path)?;
|
||||
|
||||
set_log_file(File::create(results_path.join("log"))?);
|
||||
let result = simulate(workload, results_path).await;
|
||||
set_log_stdout();
|
||||
result
|
||||
}
|
||||
|
||||
async fn run_suite() -> anyhow::Result<()> {
|
||||
let top_results_path = PathBuf::from(format!(
|
||||
"compaction-suite-results.{}",
|
||||
std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs()
|
||||
));
|
||||
std::fs::create_dir(&top_results_path)?;
|
||||
|
||||
let workload = SimulateCmd {
|
||||
distribution: Distribution::Uniform,
|
||||
// Generate 20 GB of WAL
|
||||
record_len: 1_000,
|
||||
num_records: 20_000_000,
|
||||
// Logical size 5 GB
|
||||
logical_size: 5_000,
|
||||
};
|
||||
|
||||
run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?;
|
||||
|
||||
println!(
|
||||
"All tests finished. Results in {}",
|
||||
top_results_path.display()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::Stdout;
|
||||
use std::sync::Mutex;
|
||||
use tracing_subscriber::fmt::writer::EitherWriter;
|
||||
use tracing_subscriber::fmt::MakeWriter;
|
||||
|
||||
static LOG_FILE: OnceLock<Mutex<EitherWriter<File, Stdout>>> = OnceLock::new();
|
||||
fn get_log_output() -> &'static Mutex<EitherWriter<File, Stdout>> {
|
||||
LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout())))
|
||||
}
|
||||
|
||||
fn set_log_file(f: File) {
|
||||
*get_log_output().lock().unwrap() = EitherWriter::A(f);
|
||||
}
|
||||
|
||||
fn set_log_stdout() {
|
||||
*get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout());
|
||||
}
|
||||
|
||||
fn init_logging() -> anyhow::Result<()> {
|
||||
// We fall back to printing all spans at info-level or above if
|
||||
// the RUST_LOG environment variable is not set.
|
||||
let rust_log_env_filter = || {
|
||||
tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
|
||||
};
|
||||
|
||||
// NB: the order of the with() calls does not matter.
|
||||
// See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
|
||||
use tracing_subscriber::prelude::*;
|
||||
tracing_subscriber::registry()
|
||||
.with({
|
||||
let log_layer = tracing_subscriber::fmt::layer()
|
||||
.with_target(false)
|
||||
.with_ansi(false)
|
||||
.with_writer(|| get_log_output().make_writer());
|
||||
log_layer.with_filter(rust_log_env_filter())
|
||||
})
|
||||
.init();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = CliOpts::parse();
|
||||
|
||||
init_logging()?;
|
||||
|
||||
match cli.command {
|
||||
Commands::Simulate(cmd) => {
|
||||
simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?;
|
||||
}
|
||||
Commands::RunSuite => {
|
||||
run_suite().await?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
866
pageserver/compaction/src/compact_tiered.rs
Normal file
866
pageserver/compaction/src/compact_tiered.rs
Normal file
@@ -0,0 +1,866 @@
|
||||
//! # Tiered compaction algorithm.
|
||||
//!
|
||||
//! Read all the input delta files, and write a new set of delta files that
|
||||
//! include all the input WAL records. See retile_deltas().
|
||||
//!
|
||||
//! In a "normal" LSM tree, you get to remove any values that are overwritten by
|
||||
//! later values, but in our system, we keep all the history. So the reshuffling
|
||||
//! doesn't remove any garbage, it just reshuffles the records to reduce read
|
||||
//! amplification, i.e. the number of files that you need to access to find the
|
||||
//! WAL records for a given key.
|
||||
//!
|
||||
//! If the new delta files would be very "narrow", i.e. each file would cover
|
||||
//! only a narrow key range, then we create a new set of image files
|
||||
//! instead. The current threshold is that if the estimated total size of the
|
||||
//! image layers is smaller than the size of the deltas, then we create image
|
||||
//! layers. That amounts to 2x storage amplification, and it means that the
|
||||
//! distance of image layers in LSN dimension is roughly equal to the logical
|
||||
//! database size. For example, if the logical database size is 10 GB, we would
|
||||
//! generate new image layers every 10 GB of WAL.
|
||||
use futures::StreamExt;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
|
||||
use crate::interface::*;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::identify_levels::identify_level;
|
||||
|
||||
/// Main entry point to compaction.
|
||||
///
|
||||
/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on
|
||||
/// everything below that point, that needs compaction. The cutoff LSN must
|
||||
/// partition the layers so that there are no layers that span across that
|
||||
/// LSN. To start compaction at the top of the tree, pass the end LSN of the
|
||||
/// written last L0 layer.
|
||||
pub async fn compact_tiered<E: CompactionJobExecutor>(
|
||||
executor: &mut E,
|
||||
end_lsn: Lsn,
|
||||
target_file_size: u64,
|
||||
fanout: u64,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(fanout >= 2);
|
||||
// Start at L0
|
||||
let mut current_level_no = 0;
|
||||
let mut current_level_target_height = target_file_size;
|
||||
loop {
|
||||
// end LSN +1 to include possible image layers exactly at 'end_lsn'.
|
||||
let all_layers = executor
|
||||
.get_layers(
|
||||
&(E::Key::MIN..E::Key::MAX),
|
||||
&(Lsn(u64::MIN)..end_lsn + 1),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
info!(
|
||||
"Compacting L{}, total # of layers: {}",
|
||||
current_level_no,
|
||||
all_layers.len()
|
||||
);
|
||||
|
||||
// Identify the range of LSNs that belong to this level. We assume that
|
||||
// each file in this level span an LSN range up to 1.75x target file
|
||||
// size. That should give us enough slop that if we created a slightly
|
||||
// oversized L0 layer, e.g. because flushing the in-memory layer was
|
||||
// delayed for some reason, we don't consider the oversized layer to
|
||||
// belong to L1. But not too much slop, that we don't accidentally
|
||||
// "skip" levels.
|
||||
let max_height = (current_level_target_height as f64 * 1.75) as u64;
|
||||
let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
|
||||
break;
|
||||
};
|
||||
|
||||
// Calculate the height of this level. If the # of tiers exceeds the
|
||||
// fanout parameter, it's time to compact it.
|
||||
let depth = level.depth();
|
||||
info!(
|
||||
"Level {} identified as LSN range {}-{}: depth {}",
|
||||
current_level_no, level.lsn_range.start, level.lsn_range.end, depth
|
||||
);
|
||||
for l in &level.layers {
|
||||
debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
|
||||
}
|
||||
if depth < fanout {
|
||||
debug!(
|
||||
level = current_level_no,
|
||||
depth = depth,
|
||||
fanout,
|
||||
"too few deltas to compact"
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
compact_level(
|
||||
&level.lsn_range,
|
||||
&level.layers,
|
||||
executor,
|
||||
target_file_size,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
if target_file_size == u64::MAX {
|
||||
break;
|
||||
}
|
||||
current_level_no += 1;
|
||||
current_level_target_height = current_level_target_height.saturating_mul(fanout);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn compact_level<E: CompactionJobExecutor>(
|
||||
lsn_range: &Range<Lsn>,
|
||||
layers: &[E::Layer],
|
||||
executor: &mut E,
|
||||
target_file_size: u64,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<bool> {
|
||||
let mut layer_fragments = Vec::new();
|
||||
for l in layers {
|
||||
layer_fragments.push(LayerFragment::new(l.clone()));
|
||||
}
|
||||
|
||||
let mut state = LevelCompactionState {
|
||||
target_file_size,
|
||||
_lsn_range: lsn_range.clone(),
|
||||
layers: layer_fragments,
|
||||
jobs: Vec::new(),
|
||||
job_queue: Vec::new(),
|
||||
next_level: false,
|
||||
executor,
|
||||
};
|
||||
|
||||
let first_job = CompactionJob {
|
||||
key_range: E::Key::MIN..E::Key::MAX,
|
||||
lsn_range: lsn_range.clone(),
|
||||
strategy: CompactionStrategy::Divide,
|
||||
input_layers: state
|
||||
.layers
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|i| LayerId(i.0))
|
||||
.collect(),
|
||||
completed: false,
|
||||
};
|
||||
|
||||
state.jobs.push(first_job);
|
||||
state.job_queue.push(JobId(0));
|
||||
state.execute(ctx).await?;
|
||||
|
||||
info!(
|
||||
"compaction completed! Need to process next level: {}",
|
||||
state.next_level
|
||||
);
|
||||
|
||||
Ok(state.next_level)
|
||||
}
|
||||
|
||||
/// Blackboard that keeps track of the state of all the jobs and work remaining
|
||||
struct LevelCompactionState<'a, E>
|
||||
where
|
||||
E: CompactionJobExecutor,
|
||||
{
|
||||
// parameters
|
||||
target_file_size: u64,
|
||||
|
||||
_lsn_range: Range<Lsn>,
|
||||
layers: Vec<LayerFragment<E>>,
|
||||
|
||||
// job queue
|
||||
jobs: Vec<CompactionJob<E>>,
|
||||
job_queue: Vec<JobId>,
|
||||
|
||||
/// If false, no need to compact levels below this
|
||||
next_level: bool,
|
||||
|
||||
/// Interface to the outside world
|
||||
executor: &'a mut E,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
|
||||
struct LayerId(usize);
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
|
||||
struct JobId(usize);
|
||||
|
||||
struct PendingJobSet {
|
||||
pending: HashSet<JobId>,
|
||||
completed: HashSet<JobId>,
|
||||
}
|
||||
|
||||
impl PendingJobSet {
|
||||
fn new() -> Self {
|
||||
PendingJobSet {
|
||||
pending: HashSet::new(),
|
||||
completed: HashSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn complete_job(&mut self, job_id: JobId) {
|
||||
self.pending.remove(&job_id);
|
||||
self.completed.insert(job_id);
|
||||
}
|
||||
|
||||
fn all_completed(&self) -> bool {
|
||||
self.pending.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
// When we decide to rewrite a set of layers, LayerFragment is used to keep
|
||||
// track which new layers supersede an old layer. When all the stakeholder jobs
|
||||
// have completed, this layer can be deleted.
|
||||
struct LayerFragment<E>
|
||||
where
|
||||
E: CompactionJobExecutor,
|
||||
{
|
||||
layer: E::Layer,
|
||||
|
||||
// If we will write new layers to replace this one, this keeps track of the
|
||||
// jobs that need to complete before this layer can be deleted. As the jobs
|
||||
// complete, they are moved from 'pending' to 'completed' set. Once the
|
||||
// 'pending' set becomes empty, the layer can be deleted.
|
||||
//
|
||||
// If None, this layer is not rewritten and must not be deleted.
|
||||
deletable_after: Option<PendingJobSet>,
|
||||
|
||||
deleted: bool,
|
||||
}
|
||||
|
||||
impl<E> LayerFragment<E>
|
||||
where
|
||||
E: CompactionJobExecutor,
|
||||
{
|
||||
fn new(layer: E::Layer) -> Self {
|
||||
LayerFragment {
|
||||
layer,
|
||||
deletable_after: None,
|
||||
deleted: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq)]
|
||||
enum CompactionStrategy {
|
||||
Divide,
|
||||
CreateDelta,
|
||||
CreateImage,
|
||||
}
|
||||
|
||||
#[allow(dead_code)] // Todo
|
||||
struct CompactionJob<E: CompactionJobExecutor> {
|
||||
key_range: Range<E::Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
|
||||
strategy: CompactionStrategy,
|
||||
|
||||
input_layers: Vec<LayerId>,
|
||||
|
||||
completed: bool,
|
||||
}
|
||||
|
||||
impl<'a, E> LevelCompactionState<'a, E>
|
||||
where
|
||||
E: CompactionJobExecutor,
|
||||
{
|
||||
/// Main loop of the executor.
|
||||
///
|
||||
/// In each iteration, we take the next job from the queue, and execute it.
|
||||
/// The execution might add new jobs to the queue. Keep going until the
|
||||
/// queue is empty.
|
||||
///
|
||||
/// Initially, the job queue consists of one Divide job over the whole
|
||||
/// level. On first call, it is divided into smaller jobs.
|
||||
async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
|
||||
// TODO: this would be pretty straightforward to parallelize with FuturesUnordered
|
||||
while let Some(next_job_id) = self.job_queue.pop() {
|
||||
info!("executing job {}", next_job_id.0);
|
||||
self.execute_job(next_job_id, ctx).await?;
|
||||
}
|
||||
|
||||
// all done!
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
|
||||
let job = &self.jobs[job_id.0];
|
||||
match job.strategy {
|
||||
CompactionStrategy::Divide => {
|
||||
self.divide_job(job_id, ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
CompactionStrategy::CreateDelta => {
|
||||
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
|
||||
let mut layer_ids: Vec<LayerId> = Vec::new();
|
||||
for layer_id in &job.input_layers {
|
||||
let layer = &self.layers[layer_id.0].layer;
|
||||
if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
|
||||
deltas.push(dl.clone());
|
||||
layer_ids.push(*layer_id);
|
||||
}
|
||||
}
|
||||
|
||||
self.executor
|
||||
.create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
|
||||
.await?;
|
||||
self.jobs[job_id.0].completed = true;
|
||||
|
||||
// did we complete any fragments?
|
||||
for layer_id in layer_ids {
|
||||
let l = &mut self.layers[layer_id.0];
|
||||
if let Some(deletable_after) = l.deletable_after.as_mut() {
|
||||
deletable_after.complete_job(job_id);
|
||||
if deletable_after.all_completed() {
|
||||
self.executor.delete_layer(&l.layer, ctx).await?;
|
||||
l.deleted = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.next_level = true;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
CompactionStrategy::CreateImage => {
|
||||
self.executor
|
||||
.create_image(job.lsn_range.end, &job.key_range, ctx)
|
||||
.await?;
|
||||
self.jobs[job_id.0].completed = true;
|
||||
|
||||
// TODO: we could check if any layers < PITR horizon became deletable
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn push_job(&mut self, job: CompactionJob<E>) -> JobId {
|
||||
let job_id = JobId(self.jobs.len());
|
||||
self.jobs.push(job);
|
||||
self.job_queue.push(job_id);
|
||||
job_id
|
||||
}
|
||||
|
||||
/// Take a partition of the key space, and decide how to compact it.
|
||||
///
|
||||
/// TODO: Currently, this is called exactly once for the level, and we
|
||||
/// decide whether to create new image layers to cover the whole level, or
|
||||
/// write a new set of delta. In the future, this should try to partition
|
||||
/// the key space, and make the decision separately for each partition.
|
||||
async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
|
||||
let job = &self.jobs[job_id.0];
|
||||
assert!(job.strategy == CompactionStrategy::Divide);
|
||||
|
||||
// Check for dummy cases
|
||||
if job.input_layers.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let job = &self.jobs[job_id.0];
|
||||
assert!(job.strategy == CompactionStrategy::Divide);
|
||||
|
||||
// Would it be better to create images for this partition?
|
||||
// Decide based on the average density of the level
|
||||
let keyspace_size = keyspace_total_size(
|
||||
&self
|
||||
.executor
|
||||
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
|
||||
.await?,
|
||||
) * 8192;
|
||||
|
||||
let wal_size = job
|
||||
.input_layers
|
||||
.iter()
|
||||
.filter(|layer_id| self.layers[layer_id.0].layer.is_delta())
|
||||
.map(|layer_id| self.layers[layer_id.0].layer.file_size())
|
||||
.sum::<u64>();
|
||||
if keyspace_size < wal_size {
|
||||
// seems worth it
|
||||
info!(
|
||||
"covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}",
|
||||
keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size
|
||||
);
|
||||
self.cover_with_images(job_id, ctx).await
|
||||
} else {
|
||||
// do deltas
|
||||
info!(
|
||||
"coverage not worth it, keyspace_size {}, wal_size {}",
|
||||
keyspace_size, wal_size
|
||||
);
|
||||
self.retile_deltas(job_id, ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
// LSN
|
||||
// ^
|
||||
// |
|
||||
// | ###|###|#####
|
||||
// | +--+-----+--+ +--+-----+--+
|
||||
// | | | | | | | | |
|
||||
// | +--+--+--+--+ +--+--+--+--+
|
||||
// | | | | | | |
|
||||
// | +---+-+-+---+ ==> +---+-+-+---+
|
||||
// | | | | | | | | |
|
||||
// | +---+-+-++--+ +---+-+-++--+
|
||||
// | | | | | | | | |
|
||||
// | +-----+--+--+ +-----+--+--+
|
||||
// |
|
||||
// +--------------> key
|
||||
//
|
||||
async fn cover_with_images(
|
||||
&mut self,
|
||||
job_id: JobId,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let job = &self.jobs[job_id.0];
|
||||
assert!(job.strategy == CompactionStrategy::Divide);
|
||||
|
||||
// XXX: do we still need the "holes" stuff?
|
||||
|
||||
let mut new_jobs = Vec::new();
|
||||
|
||||
// Slide a window through the keyspace
|
||||
let keyspace = self
|
||||
.executor
|
||||
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
|
||||
.await?;
|
||||
|
||||
let mut window = KeyspaceWindow::new(
|
||||
E::Key::MIN..E::Key::MAX,
|
||||
keyspace,
|
||||
self.target_file_size / 8192,
|
||||
);
|
||||
while let Some(key_range) = window.choose_next_image() {
|
||||
new_jobs.push(CompactionJob::<E> {
|
||||
key_range,
|
||||
lsn_range: job.lsn_range.clone(),
|
||||
strategy: CompactionStrategy::CreateImage,
|
||||
input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer?
|
||||
completed: false,
|
||||
});
|
||||
}
|
||||
|
||||
for j in new_jobs.into_iter().rev() {
|
||||
let _job_id = self.push_job(j);
|
||||
|
||||
// TODO: image layers don't let us delete anything. unless < PITR horizon
|
||||
//let j = &self.jobs[job_id.0];
|
||||
// for layer_id in j.input_layers.iter() {
|
||||
// self.layers[layer_id.0].pending_stakeholders.insert(job_id);
|
||||
//}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Merge the contents of all the input delta layers into a new set
|
||||
// of delta layers, based on the current partitioning.
|
||||
//
|
||||
// We split the new delta layers on the key dimension. We iterate through
|
||||
// the key space, and for each key, check if including the next key to the
|
||||
// current output layer we're building would cause the layer to become too
|
||||
// large. If so, dump the current output layer and start new one. It's
|
||||
// possible that there is a single key with so many page versions that
|
||||
// storing all of them in a single layer file would be too large. In that
|
||||
// case, we also split on the LSN dimension.
|
||||
//
|
||||
// LSN
|
||||
// ^
|
||||
// |
|
||||
// | +-----------+ +--+--+--+--+
|
||||
// | | | | | | | |
|
||||
// | +-----------+ | | | | |
|
||||
// | | | | | | | |
|
||||
// | +-----------+ ==> | | | | |
|
||||
// | | | | | | | |
|
||||
// | +-----------+ | | | | |
|
||||
// | | | | | | | |
|
||||
// | +-----------+ +--+--+--+--+
|
||||
// |
|
||||
// +--------------> key
|
||||
//
|
||||
//
|
||||
// If one key (X) has a lot of page versions:
|
||||
//
|
||||
// LSN
|
||||
// ^
|
||||
// | (X)
|
||||
// | +-----------+ +--+--+--+--+
|
||||
// | | | | | | | |
|
||||
// | +-----------+ | | +--+ |
|
||||
// | | | | | | | |
|
||||
// | +-----------+ ==> | | | | |
|
||||
// | | | | | +--+ |
|
||||
// | +-----------+ | | | | |
|
||||
// | | | | | | | |
|
||||
// | +-----------+ +--+--+--+--+
|
||||
// |
|
||||
// +--------------> key
|
||||
//
|
||||
// TODO: this actually divides the layers into fixed-size chunks, not
|
||||
// based on the partitioning.
|
||||
//
|
||||
// TODO: we should also opportunistically materialize and
|
||||
// garbage collect what we can.
|
||||
async fn retile_deltas(
|
||||
&mut self,
|
||||
job_id: JobId,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let job = &self.jobs[job_id.0];
|
||||
assert!(job.strategy == CompactionStrategy::Divide);
|
||||
|
||||
// Sweep the key space left to right, running an estimate of how much
|
||||
// disk size and keyspace we have accumulated
|
||||
//
|
||||
// Once the disk size reaches the target threshold, stop and think.
|
||||
// If we have accumulated only a narrow band of keyspace, create an
|
||||
// image layer. Otherwise write a delta layer.
|
||||
|
||||
// FIXME: deal with the case of lots of values for same key
|
||||
|
||||
// FIXME: we are ignoring images here. Did we already divide the work
|
||||
// so that we won't encounter them here?
|
||||
|
||||
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
|
||||
for layer_id in &job.input_layers {
|
||||
let l = &self.layers[layer_id.0];
|
||||
if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
|
||||
deltas.push(dl.clone());
|
||||
}
|
||||
}
|
||||
// Open stream
|
||||
let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
|
||||
let mut new_jobs = Vec::new();
|
||||
|
||||
// Slide a window through the keyspace
|
||||
let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
|
||||
let mut all_in_window: bool = false;
|
||||
let mut window = Window::new();
|
||||
loop {
|
||||
if all_in_window && window.elems.is_empty() {
|
||||
// All done!
|
||||
break;
|
||||
}
|
||||
if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
|
||||
{
|
||||
let batch_layers: Vec<LayerId> = job
|
||||
.input_layers
|
||||
.iter()
|
||||
.filter(|layer_id| {
|
||||
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
assert!(!batch_layers.is_empty());
|
||||
new_jobs.push(CompactionJob {
|
||||
key_range,
|
||||
lsn_range: job.lsn_range.clone(),
|
||||
strategy: CompactionStrategy::CreateDelta,
|
||||
input_layers: batch_layers,
|
||||
completed: false,
|
||||
});
|
||||
} else {
|
||||
assert!(!all_in_window);
|
||||
if let Some(next_key) = key_accum.next().await.transpose()? {
|
||||
window.feed(next_key.key, next_key.size);
|
||||
} else {
|
||||
all_in_window = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// All the input files are rewritten. Set up the tracking for when they can
|
||||
// be deleted.
|
||||
for layer_id in job.input_layers.iter() {
|
||||
let l = &mut self.layers[layer_id.0];
|
||||
assert!(l.deletable_after.is_none());
|
||||
l.deletable_after = Some(PendingJobSet::new());
|
||||
}
|
||||
for j in new_jobs.into_iter().rev() {
|
||||
let job_id = self.push_job(j);
|
||||
let j = &self.jobs[job_id.0];
|
||||
for layer_id in j.input_layers.iter() {
|
||||
self.layers[layer_id.0]
|
||||
.deletable_after
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.pending
|
||||
.insert(job_id);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// Sliding window through keyspace and values
|
||||
// This is used by over_with_images to decide on good split points
|
||||
struct KeyspaceWindow<K> {
|
||||
head: KeyspaceWindowHead<K>,
|
||||
|
||||
start_pos: KeyspaceWindowPos<K>,
|
||||
}
|
||||
struct KeyspaceWindowHead<K> {
|
||||
// overall key range to cover
|
||||
key_range: Range<K>,
|
||||
|
||||
keyspace: Vec<Range<K>>,
|
||||
target_keysize: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct KeyspaceWindowPos<K> {
|
||||
end_key: K,
|
||||
|
||||
keyspace_idx: usize,
|
||||
|
||||
accum_keysize: u64,
|
||||
}
|
||||
impl<K: CompactionKey> KeyspaceWindowPos<K> {
|
||||
fn reached_end(&self, w: &KeyspaceWindowHead<K>) -> bool {
|
||||
self.keyspace_idx == w.keyspace.len()
|
||||
}
|
||||
|
||||
// Advance the cursor until it reaches 'target_keysize'.
|
||||
fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
|
||||
while self.accum_keysize < max_size && !self.reached_end(w) {
|
||||
let curr_range = &w.keyspace[self.keyspace_idx];
|
||||
if self.end_key < curr_range.start {
|
||||
// skip over any unused space
|
||||
self.end_key = curr_range.start;
|
||||
}
|
||||
|
||||
// We're now within 'curr_range'. Can we advance past it completely?
|
||||
let distance = K::key_range_size(&(self.end_key..curr_range.end));
|
||||
if (self.accum_keysize + distance as u64) < max_size {
|
||||
// oh yeah, it fits
|
||||
self.end_key = curr_range.end;
|
||||
self.keyspace_idx += 1;
|
||||
self.accum_keysize += distance as u64;
|
||||
} else {
|
||||
// advance within the range
|
||||
let skip_key = self.end_key.skip_some();
|
||||
let distance = K::key_range_size(&(self.end_key..skip_key));
|
||||
if (self.accum_keysize + distance as u64) < max_size {
|
||||
self.end_key = skip_key;
|
||||
self.accum_keysize += distance as u64;
|
||||
} else {
|
||||
self.end_key = self.end_key.next();
|
||||
self.accum_keysize += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K> KeyspaceWindow<K>
|
||||
where
|
||||
K: CompactionKey,
|
||||
{
|
||||
fn new(key_range: Range<K>, keyspace: CompactionKeySpace<K>, target_keysize: u64) -> Self {
|
||||
assert!(keyspace.first().unwrap().start >= key_range.start);
|
||||
|
||||
let start_key = key_range.start;
|
||||
let start_pos = KeyspaceWindowPos::<K> {
|
||||
end_key: start_key,
|
||||
keyspace_idx: 0,
|
||||
accum_keysize: 0,
|
||||
};
|
||||
Self {
|
||||
head: KeyspaceWindowHead::<K> {
|
||||
key_range,
|
||||
keyspace,
|
||||
target_keysize,
|
||||
},
|
||||
start_pos,
|
||||
}
|
||||
}
|
||||
|
||||
fn choose_next_image(&mut self) -> Option<Range<K>> {
|
||||
if self.start_pos.keyspace_idx == self.head.keyspace.len() {
|
||||
// we've reached the end
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut next_pos = self.start_pos.clone();
|
||||
next_pos.advance_until_size(
|
||||
&self.head,
|
||||
self.start_pos.accum_keysize + self.head.target_keysize,
|
||||
);
|
||||
|
||||
// See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
|
||||
// 1.25x target size
|
||||
let mut end_pos = next_pos.clone();
|
||||
end_pos.advance_until_size(
|
||||
&self.head,
|
||||
self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
|
||||
);
|
||||
if end_pos.reached_end(&self.head) {
|
||||
// gobble up any unused keyspace between the last used key and end of the range
|
||||
assert!(end_pos.end_key <= self.head.key_range.end);
|
||||
end_pos.end_key = self.head.key_range.end;
|
||||
next_pos = end_pos;
|
||||
}
|
||||
|
||||
let start_key = self.start_pos.end_key;
|
||||
self.start_pos = next_pos;
|
||||
Some(start_key..self.start_pos.end_key)
|
||||
}
|
||||
}
|
||||
|
||||
// Sliding window through keyspace and values
|
||||
//
|
||||
// This is used to decide what layer to write next, from the beginning of the window.
|
||||
//
|
||||
// Candidates:
|
||||
//
|
||||
// 1. Create an image layer, snapping to previous images
|
||||
// 2. Create a delta layer, snapping to previous images
|
||||
// 3. Create an image layer, snapping to
|
||||
//
|
||||
//
|
||||
|
||||
// Take previous partitioning, based on the image layers below.
|
||||
//
|
||||
// Candidate is at the front:
|
||||
//
|
||||
// Consider stretching an image layer to next divider? If it's close enough,
|
||||
// that's the image candidate
|
||||
//
|
||||
// If it's too far, consider splitting at a reasonable point
|
||||
//
|
||||
// Is the image candidate smaller than the equivalent delta? If so,
|
||||
// split off the image. Otherwise, split off one delta.
|
||||
// Try to snap off the delta at a reasonable point
|
||||
|
||||
struct WindowElement<K> {
|
||||
start_key: K, // inclusive
|
||||
last_key: K, // inclusive
|
||||
accum_size: u64,
|
||||
}
|
||||
struct Window<K> {
|
||||
elems: VecDeque<WindowElement<K>>,
|
||||
|
||||
// last key that was split off, inclusive
|
||||
splitoff_key: Option<K>,
|
||||
splitoff_size: u64,
|
||||
}
|
||||
|
||||
impl<K> Window<K>
|
||||
where
|
||||
K: CompactionKey,
|
||||
{
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
elems: VecDeque::new(),
|
||||
splitoff_key: None,
|
||||
splitoff_size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn feed(&mut self, key: K, size: u64) {
|
||||
let last_size;
|
||||
if let Some(last) = self.elems.back_mut() {
|
||||
assert!(last.last_key <= key);
|
||||
if key == last.last_key {
|
||||
last.accum_size += size;
|
||||
return;
|
||||
}
|
||||
last_size = last.accum_size;
|
||||
} else {
|
||||
last_size = 0;
|
||||
}
|
||||
// This is a new key.
|
||||
let elem = WindowElement {
|
||||
start_key: key,
|
||||
last_key: key,
|
||||
accum_size: last_size + size,
|
||||
};
|
||||
self.elems.push_back(elem);
|
||||
}
|
||||
|
||||
fn remain_size(&self) -> u64 {
|
||||
self.elems.back().unwrap().accum_size - self.splitoff_size
|
||||
}
|
||||
|
||||
fn peek_size(&self) -> u64 {
|
||||
self.elems.front().unwrap().accum_size - self.splitoff_size
|
||||
}
|
||||
|
||||
fn commit_upto(&mut self, mut upto: usize) {
|
||||
while upto > 1 {
|
||||
let popped = self.elems.pop_front().unwrap();
|
||||
self.elems.front_mut().unwrap().start_key = popped.start_key;
|
||||
upto -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn find_size_split(&self, target_size: u64) -> usize {
|
||||
self.elems
|
||||
.partition_point(|elem| elem.accum_size - self.splitoff_size < target_size)
|
||||
}
|
||||
|
||||
fn pop(&mut self) {
|
||||
let first = self.elems.pop_front().unwrap();
|
||||
self.splitoff_size = first.accum_size;
|
||||
|
||||
self.splitoff_key = Some(first.last_key);
|
||||
}
|
||||
|
||||
// the difference between delta and image is that an image covers
|
||||
// any unused keyspace before and after, while a delta tries to
|
||||
// minimize that. TODO: difference not implemented
|
||||
fn pop_delta(&mut self) -> Range<K> {
|
||||
let first = self.elems.front().unwrap();
|
||||
let key_range = first.start_key..first.last_key.next();
|
||||
|
||||
self.pop();
|
||||
key_range
|
||||
}
|
||||
|
||||
// Prerequisite: we have enough input in the window
|
||||
//
|
||||
// On return None, the caller should feed more data and call again
|
||||
fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option<Range<K>> {
|
||||
if has_more && self.elems.is_empty() {
|
||||
// Starting up
|
||||
return None;
|
||||
}
|
||||
|
||||
// If we still have an undersized candidate, just keep going
|
||||
while self.peek_size() < target_size {
|
||||
if self.elems.len() > 1 {
|
||||
self.commit_upto(2);
|
||||
} else if has_more {
|
||||
return None;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure we have enough input in the window to make a good decision
|
||||
if has_more && self.remain_size() < target_size * 5 / 4 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// The candidate on the front is now large enough, for a delta.
|
||||
// And we have enough data in the window to decide.
|
||||
|
||||
// If we're willing to stretch it up to 1.25 target size, could we
|
||||
// gobble up the rest of the work? This avoids creating very small
|
||||
// "tail" layers at the end of the keyspace
|
||||
if !has_more && self.remain_size() < target_size * 5 / 3 {
|
||||
self.commit_upto(self.elems.len());
|
||||
} else {
|
||||
let delta_split_at = self.find_size_split(target_size);
|
||||
self.commit_upto(delta_split_at);
|
||||
|
||||
// If it's still not large enough, request the caller to fill the window
|
||||
if self.elems.len() == 1 && has_more {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
Some(self.pop_delta())
|
||||
}
|
||||
}
|
||||
243
pageserver/compaction/src/helpers.rs
Normal file
243
pageserver/compaction/src/helpers.rs
Normal file
@@ -0,0 +1,243 @@
|
||||
//! This file contains generic utility functions over the interface types,
|
||||
//! which could be handy for any compaction implementation.
|
||||
use crate::interface::*;
|
||||
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{Stream, StreamExt};
|
||||
use itertools::Itertools;
|
||||
use pin_project_lite::pin_project;
|
||||
use std::cmp::Ord;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::VecDeque;
|
||||
use std::future::Future;
|
||||
use std::ops::{DerefMut, Range};
|
||||
use std::pin::Pin;
|
||||
use std::task::{ready, Poll};
|
||||
|
||||
pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
|
||||
where
|
||||
K: CompactionKey,
|
||||
{
|
||||
keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
|
||||
}
|
||||
|
||||
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
|
||||
!(a.end <= b.start || b.end <= a.start)
|
||||
}
|
||||
|
||||
pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
|
||||
let x = std::mem::take(a);
|
||||
let mut all_ranges_iter = [x.into_iter(), b.into_iter()]
|
||||
.into_iter()
|
||||
.kmerge_by(|a, b| a.start < b.start);
|
||||
let mut ranges = Vec::new();
|
||||
if let Some(first) = all_ranges_iter.next() {
|
||||
let (mut start, mut end) = (first.start, first.end);
|
||||
|
||||
for r in all_ranges_iter {
|
||||
assert!(r.start >= start);
|
||||
if r.start > end {
|
||||
ranges.push(start..end);
|
||||
start = r.start;
|
||||
end = r.end;
|
||||
} else if r.end > end {
|
||||
end = r.end;
|
||||
}
|
||||
}
|
||||
ranges.push(start..end);
|
||||
}
|
||||
*a = ranges
|
||||
}
|
||||
|
||||
pub fn intersect_keyspace<K: Ord + Clone + Copy>(
|
||||
a: &CompactionKeySpace<K>,
|
||||
r: &Range<K>,
|
||||
) -> CompactionKeySpace<K> {
|
||||
let mut ranges: Vec<Range<K>> = Vec::new();
|
||||
|
||||
for x in a.iter() {
|
||||
if x.end <= r.start {
|
||||
continue;
|
||||
}
|
||||
if x.start >= r.end {
|
||||
break;
|
||||
}
|
||||
ranges.push(x.clone())
|
||||
}
|
||||
|
||||
// trim the ends
|
||||
if let Some(first) = ranges.first_mut() {
|
||||
first.start = std::cmp::max(first.start, r.start);
|
||||
}
|
||||
if let Some(last) = ranges.last_mut() {
|
||||
last.end = std::cmp::min(last.end, r.end);
|
||||
}
|
||||
ranges
|
||||
}
|
||||
|
||||
/// Create a stream that iterates through all DeltaEntrys among all input
|
||||
/// layers, in key-lsn order.
|
||||
///
|
||||
/// This is public because the create_delta() implementation likely wants to use this too
|
||||
/// TODO: move to a more shared place
|
||||
pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
|
||||
layers: &'a [E::DeltaLayer],
|
||||
ctx: &'a E::RequestContext,
|
||||
) -> MergeDeltaKeys<'a, E> {
|
||||
// Use a binary heap to merge the layers. Each input layer is initially
|
||||
// represented by a LazyLoadLayer::Unloaded element, which uses the start of
|
||||
// the layer's key range as the key. The first time a layer reaches the top
|
||||
// of the heap, all the keys of the layer are loaded into a sorted vector.
|
||||
//
|
||||
// This helps to keep the memory usage reasonable: we only need to hold in
|
||||
// memory the DeltaEntrys of the layers that overlap with the "current" key.
|
||||
let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
|
||||
for l in layers {
|
||||
heap.push(LazyLoadLayer::Unloaded(l));
|
||||
}
|
||||
MergeDeltaKeys {
|
||||
heap,
|
||||
ctx,
|
||||
load_future: None,
|
||||
}
|
||||
}
|
||||
|
||||
enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
|
||||
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
|
||||
Unloaded(&'a E::DeltaLayer),
|
||||
}
|
||||
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
|
||||
fn key(&self) -> E::Key {
|
||||
match self {
|
||||
Self::Loaded(entries) => entries.front().unwrap().key(),
|
||||
Self::Unloaded(dl) => dl.key_range().start,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
// reverse order so that we get a min-heap
|
||||
other.key().cmp(&self.key())
|
||||
}
|
||||
}
|
||||
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.key().eq(&other.key())
|
||||
}
|
||||
}
|
||||
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
|
||||
|
||||
type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
|
||||
|
||||
// Stream returned by `merge_delta_keys`
|
||||
pin_project! {
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
|
||||
heap: BinaryHeap<LazyLoadLayer<'a, E>>,
|
||||
|
||||
#[pin]
|
||||
load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
|
||||
|
||||
ctx: &'a E::RequestContext,
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, E> Stream for MergeDeltaKeys<'a, E>
|
||||
where
|
||||
E: CompactionJobExecutor + 'a,
|
||||
{
|
||||
type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
|
||||
|
||||
fn poll_next(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
|
||||
let mut this = self.project();
|
||||
loop {
|
||||
if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
|
||||
// We are waiting for loading the keys to finish
|
||||
match ready!(load_future.as_mut().poll(cx)) {
|
||||
Ok(entries) => {
|
||||
this.load_future.set(None);
|
||||
*this.heap.peek_mut().unwrap() =
|
||||
LazyLoadLayer::Loaded(VecDeque::from(entries));
|
||||
}
|
||||
Err(e) => {
|
||||
return Poll::Ready(Some(Err(e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If the topmost layer in the heap hasn't been loaded yet, start
|
||||
// loading it. Otherwise return the next entry from it and update
|
||||
// the layer's position in the heap (this decreaseKey operation is
|
||||
// performed implicitly when `top` is dropped).
|
||||
if let Some(mut top) = this.heap.peek_mut() {
|
||||
match top.deref_mut() {
|
||||
LazyLoadLayer::Unloaded(ref mut l) => {
|
||||
let fut = l.load_keys(this.ctx);
|
||||
this.load_future.set(Some(fut));
|
||||
continue;
|
||||
}
|
||||
LazyLoadLayer::Loaded(ref mut entries) => {
|
||||
let result = entries.pop_front().unwrap();
|
||||
if entries.is_empty() {
|
||||
std::collections::binary_heap::PeekMut::pop(top);
|
||||
}
|
||||
return Poll::Ready(Some(Ok(result)));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Poll::Ready(None);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Accumulate values at key boundaries
|
||||
pub struct KeySize<K> {
|
||||
pub key: K,
|
||||
pub num_values: u64,
|
||||
pub size: u64,
|
||||
}
|
||||
|
||||
pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
|
||||
where
|
||||
K: Eq,
|
||||
I: Stream<Item = Result<D, E>>,
|
||||
D: CompactionDeltaEntry<'a, K>,
|
||||
{
|
||||
async_stream::try_stream! {
|
||||
// Initialize the state from the first value
|
||||
let mut input = std::pin::pin!(input);
|
||||
|
||||
if let Some(first) = input.next().await {
|
||||
let first = first?;
|
||||
let mut accum: KeySize<K> = KeySize {
|
||||
key: first.key(),
|
||||
num_values: 1,
|
||||
size: first.size(),
|
||||
};
|
||||
while let Some(this) = input.next().await {
|
||||
let this = this?;
|
||||
if this.key() == accum.key {
|
||||
accum.size += this.size();
|
||||
accum.num_values += 1;
|
||||
} else {
|
||||
yield accum;
|
||||
accum = KeySize {
|
||||
key: this.key(),
|
||||
num_values: 1,
|
||||
size: this.size(),
|
||||
};
|
||||
}
|
||||
}
|
||||
yield accum;
|
||||
}
|
||||
}
|
||||
}
|
||||
376
pageserver/compaction/src/identify_levels.rs
Normal file
376
pageserver/compaction/src/identify_levels.rs
Normal file
@@ -0,0 +1,376 @@
|
||||
//! An LSM tree consists of multiple levels, each exponential larger than the
|
||||
//! previous level. And each level consists of be multiple "tiers". With tiered
|
||||
//! compaction, a level is compacted when it has accumulated more than N tiers,
|
||||
//! forming one tier on the next level.
|
||||
//!
|
||||
//! In the pageserver, we don't explicitly track the levels and tiers. Instead,
|
||||
//! we identify them by looking at the shapes of the layers. It's an easy task
|
||||
//! for a human, but it's not straightforward to come up with the exact
|
||||
//! rules. Especially if there are cases like interrupted, half-finished
|
||||
//! compactions, or highly skewed data distributions that have let us "skip"
|
||||
//! some levels. It's not critical to classify all cases correctly; at worst we
|
||||
//! delay some compaction work, and suffer from more read amplification, or we
|
||||
//! perform some unnecessary compaction work.
|
||||
//!
|
||||
//! `identify_level` performs that shape-matching.
|
||||
//!
|
||||
//! It returns a Level struct, which has `depth()` function to count the number
|
||||
//! of "tiers" in the level. The tier count is the max depth of stacked layers
|
||||
//! within the level. That's a good measure, because the point of compacting is
|
||||
//! to reduce read amplification, and the depth is what determines that.
|
||||
//!
|
||||
//! One interesting effect of this is that if we generate very small delta
|
||||
//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
|
||||
//! because they reach the target size, the L0 compaction will combine them to
|
||||
//! one larger file. But if the combined file is still smaller than the target
|
||||
//! file size, the file will still be considered to be part of L0 at the next
|
||||
//! iteration.
|
||||
|
||||
use anyhow::bail;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::Range;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::interface::*;
|
||||
|
||||
use tracing::{info, trace};
|
||||
|
||||
pub struct Level<L> {
|
||||
pub lsn_range: Range<Lsn>,
|
||||
pub layers: Vec<L>,
|
||||
}
|
||||
|
||||
/// Identify an LSN > `end_lsn` that partitions the LSN space, so that there are
|
||||
/// no layers that cross the boundary LSN.
|
||||
///
|
||||
/// A further restriction is that all layers in the returned partition cover at
|
||||
/// most 'lsn_max_size' LSN bytes.
|
||||
pub async fn identify_level<K, L>(
|
||||
all_layers: Vec<L>,
|
||||
end_lsn: Lsn,
|
||||
lsn_max_size: u64,
|
||||
) -> anyhow::Result<Option<Level<L>>>
|
||||
where
|
||||
K: CompactionKey,
|
||||
L: CompactionLayer<K> + Clone,
|
||||
{
|
||||
// filter out layers that are above the `end_lsn`, they are completely irrelevant.
|
||||
let mut layers = Vec::new();
|
||||
for l in all_layers {
|
||||
if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
|
||||
// shouldn't happen. Indicates that the caller passed a bogus
|
||||
// end_lsn.
|
||||
bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
|
||||
}
|
||||
// include image layers sitting exacty at `end_lsn`.
|
||||
let is_image = !l.is_delta();
|
||||
if (is_image && l.lsn_range().start > end_lsn)
|
||||
|| (!is_image && l.lsn_range().start >= end_lsn)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
layers.push(l);
|
||||
}
|
||||
// All the remaining layers either belong to this level, or are below it.
|
||||
info!(
|
||||
"identify level at {}, size {}, num layers below: {}",
|
||||
end_lsn,
|
||||
lsn_max_size,
|
||||
layers.len()
|
||||
);
|
||||
if layers.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Walk the ranges in LSN order.
|
||||
//
|
||||
// ----- end_lsn
|
||||
// |
|
||||
// |
|
||||
// v
|
||||
//
|
||||
layers.sort_by_key(|l| l.lsn_range().end);
|
||||
let mut candidate_start_lsn = end_lsn;
|
||||
let mut candidate_layers: Vec<L> = Vec::new();
|
||||
let mut current_best_start_lsn = end_lsn;
|
||||
let mut current_best_layers: Vec<L> = Vec::new();
|
||||
let mut iter = layers.into_iter();
|
||||
loop {
|
||||
let Some(l) = iter.next_back() else {
|
||||
// Reached end. Accept the last candidate
|
||||
current_best_start_lsn = candidate_start_lsn;
|
||||
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
|
||||
break;
|
||||
};
|
||||
trace!(
|
||||
"inspecting {} for candidate {}, current best {}",
|
||||
l.short_id(),
|
||||
candidate_start_lsn,
|
||||
current_best_start_lsn
|
||||
);
|
||||
|
||||
let r = l.lsn_range();
|
||||
|
||||
// Image layers don't restrict our choice of cutoff LSN
|
||||
if l.is_delta() {
|
||||
// Is this candidate workable? In other words, are there any
|
||||
// delta layers that span across this LSN
|
||||
//
|
||||
// Valid: Not valid:
|
||||
// + +
|
||||
// | | +
|
||||
// + <- candidate + | <- candidate
|
||||
// + +
|
||||
// |
|
||||
// +
|
||||
if r.end <= candidate_start_lsn {
|
||||
// Hooray, there are no crossing LSNs. And we have visited
|
||||
// through all the layers within candidate..end_lsn. The
|
||||
// current candidate can be accepted.
|
||||
current_best_start_lsn = r.end;
|
||||
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
|
||||
candidate_start_lsn = r.start;
|
||||
}
|
||||
|
||||
// Is it small enough to be considered part of this level?
|
||||
if r.end.0 - r.start.0 > lsn_max_size {
|
||||
// Too large, this layer belongs to next level. Stop.
|
||||
trace!(
|
||||
"too large {}, size {} vs {}",
|
||||
l.short_id(),
|
||||
r.end.0 - r.start.0,
|
||||
lsn_max_size
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
// If this crosses the candidate lsn, push it down.
|
||||
if r.start < candidate_start_lsn {
|
||||
trace!(
|
||||
"layer {} prevents from stopping at {}",
|
||||
l.short_id(),
|
||||
candidate_start_lsn
|
||||
);
|
||||
candidate_start_lsn = r.start;
|
||||
}
|
||||
}
|
||||
|
||||
// Include this layer in our candidate
|
||||
candidate_layers.push(l);
|
||||
}
|
||||
|
||||
Ok(if current_best_start_lsn == end_lsn {
|
||||
// empty level
|
||||
None
|
||||
} else {
|
||||
Some(Level {
|
||||
lsn_range: current_best_start_lsn..end_lsn,
|
||||
layers: current_best_layers,
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// helper struct used in depth()
|
||||
struct Event<K> {
|
||||
key: K,
|
||||
layer_idx: usize,
|
||||
start: bool,
|
||||
}
|
||||
|
||||
impl<L> Level<L> {
|
||||
/// Count the number of deltas stacked on each other.
|
||||
pub fn depth<K>(&self) -> u64
|
||||
where
|
||||
K: CompactionKey,
|
||||
L: CompactionLayer<K>,
|
||||
{
|
||||
let mut events: Vec<Event<K>> = Vec::new();
|
||||
for (idx, l) in self.layers.iter().enumerate() {
|
||||
events.push(Event {
|
||||
key: l.key_range().start,
|
||||
layer_idx: idx,
|
||||
start: true,
|
||||
});
|
||||
events.push(Event {
|
||||
key: l.key_range().end,
|
||||
layer_idx: idx,
|
||||
start: false,
|
||||
});
|
||||
}
|
||||
events.sort_by_key(|e| (e.key, e.start));
|
||||
|
||||
// Sweep the key space left to right. Stop at each distinct key, and
|
||||
// count the number of deltas on top of the highest image at that key.
|
||||
//
|
||||
// This is a little enefficient, as we walk through the active_set on
|
||||
// every key. We could increment/decrement a counter on each step
|
||||
// instead, but that'd require a bit more complex bookkeeping.
|
||||
let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
|
||||
let mut max_depth = 0;
|
||||
let mut events_iter = events.iter().peekable();
|
||||
while let Some(e) = events_iter.next() {
|
||||
let l = &self.layers[e.layer_idx];
|
||||
let is_image = !l.is_delta();
|
||||
|
||||
// update the active set
|
||||
if e.start {
|
||||
active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
|
||||
} else {
|
||||
active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
|
||||
}
|
||||
|
||||
// recalculate depth if this was the last event at this point
|
||||
let more_events_at_this_key = events_iter
|
||||
.peek()
|
||||
.map_or(false, |next_e| next_e.key == e.key);
|
||||
if !more_events_at_this_key {
|
||||
let mut active_depth = 0;
|
||||
for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
|
||||
if *is_image {
|
||||
break;
|
||||
}
|
||||
active_depth += 1;
|
||||
}
|
||||
if active_depth > max_depth {
|
||||
max_depth = active_depth;
|
||||
}
|
||||
}
|
||||
}
|
||||
max_depth
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
|
||||
MockLayer::Delta(Arc::new(MockDeltaLayer {
|
||||
key_range,
|
||||
lsn_range,
|
||||
// identify_level() doesn't pay attention to the rest of the fields
|
||||
file_size: 0,
|
||||
deleted: Mutex::new(false),
|
||||
records: vec![],
|
||||
}))
|
||||
}
|
||||
|
||||
fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
|
||||
MockLayer::Image(Arc::new(MockImageLayer {
|
||||
key_range,
|
||||
lsn_range: lsn..(lsn + 1),
|
||||
// identify_level() doesn't pay attention to the rest of the fields
|
||||
file_size: 0,
|
||||
deleted: Mutex::new(false),
|
||||
}))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_identify_level() -> anyhow::Result<()> {
|
||||
let layers = vec![
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
|
||||
];
|
||||
|
||||
// All layers fit in the max file size
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.depth(), 6);
|
||||
|
||||
// Same LSN with smaller max file size. The second layer from the top is larger
|
||||
// and belongs to next level.
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.depth(), 1);
|
||||
|
||||
// Call with a smaller LSN
|
||||
let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.depth(), 2);
|
||||
|
||||
// Call with an LSN that doesn't partition the space
|
||||
let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
|
||||
assert!(result.is_err());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
|
||||
// The files LSN ranges overlap, so even though there are more files that
|
||||
// fit under the file size, they are not included in the level because they
|
||||
// overlap so that we'd need to include the oldest file, too, which is
|
||||
// larger
|
||||
let layers = vec![
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
|
||||
];
|
||||
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.depth(), 1);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
|
||||
// The key ranges don't overlap, so depth is only 1.
|
||||
let layers = vec![
|
||||
delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
|
||||
delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
|
||||
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
|
||||
];
|
||||
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.layers.len(), 3);
|
||||
assert_eq!(level.depth(), 1);
|
||||
|
||||
// Staggered. The 1st and 3rd layer don't overlap with each other.
|
||||
let layers = vec![
|
||||
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
|
||||
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
|
||||
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
|
||||
];
|
||||
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.layers.len(), 3);
|
||||
assert_eq!(level.depth(), 2);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_depth_images() -> anyhow::Result<()> {
|
||||
let layers: Vec<MockLayer> = vec![
|
||||
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
|
||||
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
|
||||
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
|
||||
// This covers the same key range as the 2nd delta layer. The depth
|
||||
// in that key range is therefore 0.
|
||||
image(1500..2500, Lsn(0x9000)),
|
||||
];
|
||||
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.layers.len(), 4);
|
||||
assert_eq!(level.depth(), 1);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
167
pageserver/compaction/src/interface.rs
Normal file
167
pageserver/compaction/src/interface.rs
Normal file
@@ -0,0 +1,167 @@
|
||||
//! This is what the compaction implementation needs to know about
|
||||
//! layers, keyspace etc.
|
||||
//!
|
||||
//! All the heavy lifting is done by the create_image and create_delta
|
||||
//! functions that the implementor provides.
|
||||
use async_trait::async_trait;
|
||||
use pageserver_api::{key::Key, keyspace::key_range_size};
|
||||
use std::ops::Range;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Public interface. This is the main thing that the implementor needs to provide
|
||||
#[async_trait]
|
||||
pub trait CompactionJobExecutor {
|
||||
// Type system.
|
||||
//
|
||||
// We assume that there are two kinds of layers, deltas and images. The
|
||||
// compaction doesn't distinguish whether they are stored locally or
|
||||
// remotely.
|
||||
//
|
||||
// The keyspace is defined by CompactionKey trait.
|
||||
//
|
||||
type Key: CompactionKey;
|
||||
|
||||
type Layer: CompactionLayer<Self::Key> + Clone;
|
||||
type DeltaLayer: CompactionDeltaLayer<Self> + Clone;
|
||||
type ImageLayer: CompactionImageLayer<Self> + Clone;
|
||||
|
||||
// This is passed through to all the interface functions. The compaction
|
||||
// implementation doesn't do anything with it, but it might be useful for
|
||||
// the interface implementation.
|
||||
type RequestContext: CompactionRequestContext;
|
||||
|
||||
// ----
|
||||
// Functions that the planner uses to support its decisions
|
||||
// ----
|
||||
|
||||
/// Return all layers that overlap the given bounding box.
|
||||
async fn get_layers(
|
||||
&mut self,
|
||||
key_range: &Range<Self::Key>,
|
||||
lsn_range: &Range<Lsn>,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<Vec<Self::Layer>>;
|
||||
|
||||
async fn get_keyspace(
|
||||
&mut self,
|
||||
key_range: &Range<Self::Key>,
|
||||
lsn: Lsn,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
|
||||
|
||||
/// NB: This is a pretty expensive operation. In the real pageserver
|
||||
/// implementation, it downloads the layer, and keeps it resident
|
||||
/// until the DeltaLayer is dropped.
|
||||
async fn downcast_delta_layer(
|
||||
&self,
|
||||
layer: &Self::Layer,
|
||||
) -> anyhow::Result<Option<Self::DeltaLayer>>;
|
||||
|
||||
// ----
|
||||
// Functions to execute the plan
|
||||
// ----
|
||||
|
||||
/// Create a new image layer, materializing all the values in the key range,
|
||||
/// at given 'lsn'.
|
||||
async fn create_image(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Self::Key>,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Create a new delta layer, containing all the values from 'input_layers'
|
||||
/// in the given key and LSN range.
|
||||
async fn create_delta(
|
||||
&mut self,
|
||||
lsn_range: &Range<Lsn>,
|
||||
key_range: &Range<Self::Key>,
|
||||
input_layers: &[Self::DeltaLayer],
|
||||
ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Delete a layer. The compaction implementation will call this only after
|
||||
/// all the create_image() or create_delta() calls that deletion of this
|
||||
/// layer depends on have finished. But if the implementor has extra lazy
|
||||
/// background tasks, like uploading the index json file to remote storage,
|
||||
/// it is the implementation's responsibility to track those.
|
||||
async fn delete_layer(
|
||||
&mut self,
|
||||
layer: &Self::Layer,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
|
||||
const MIN: Self;
|
||||
const MAX: Self;
|
||||
|
||||
/// Calculate distance between key_range.start and key_range.end.
|
||||
///
|
||||
/// This returns u32, for compatibility with Repository::key. If the
|
||||
/// distance is larger, return u32::MAX.
|
||||
fn key_range_size(key_range: &Range<Self>) -> u32;
|
||||
|
||||
// return "self + 1"
|
||||
fn next(&self) -> Self;
|
||||
|
||||
// return "self + <some decent amount to skip>". The amount to skip
|
||||
// is left to the implementation.
|
||||
// FIXME: why not just "add(u32)" ? This is hard to use
|
||||
fn skip_some(&self) -> Self;
|
||||
}
|
||||
|
||||
impl CompactionKey for Key {
|
||||
const MIN: Self = Self::MIN;
|
||||
const MAX: Self = Self::MAX;
|
||||
|
||||
fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
|
||||
key_range_size(r)
|
||||
}
|
||||
fn next(&self) -> Key {
|
||||
(self as &Key).next()
|
||||
}
|
||||
fn skip_some(&self) -> Key {
|
||||
self.add(128)
|
||||
}
|
||||
}
|
||||
|
||||
/// Contiguous ranges of keys that belong to the key space. In key order, and
|
||||
/// with no overlap.
|
||||
pub type CompactionKeySpace<K> = Vec<Range<K>>;
|
||||
|
||||
/// Functions needed from all layers.
|
||||
pub trait CompactionLayer<K: CompactionKey + ?Sized> {
|
||||
fn key_range(&self) -> &Range<K>;
|
||||
fn lsn_range(&self) -> &Range<Lsn>;
|
||||
|
||||
fn file_size(&self) -> u64;
|
||||
|
||||
/// For debugging, short human-readable representation of the layer. E.g. filename.
|
||||
fn short_id(&self) -> String;
|
||||
|
||||
fn is_delta(&self) -> bool;
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
|
||||
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
|
||||
where
|
||||
Self: 'a;
|
||||
|
||||
/// Return all keys in this delta layer.
|
||||
async fn load_keys<'a>(
|
||||
&self,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
|
||||
}
|
||||
|
||||
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
|
||||
|
||||
pub trait CompactionDeltaEntry<'a, K> {
|
||||
fn key(&self) -> K;
|
||||
fn lsn(&self) -> Lsn;
|
||||
fn size(&self) -> u64;
|
||||
}
|
||||
|
||||
pub trait CompactionRequestContext {}
|
||||
12
pageserver/compaction/src/lib.rs
Normal file
12
pageserver/compaction/src/lib.rs
Normal file
@@ -0,0 +1,12 @@
|
||||
// The main module implementing the compaction algorithm
|
||||
pub mod compact_tiered;
|
||||
pub(crate) mod identify_levels;
|
||||
|
||||
// Traits that the caller of the compaction needs to implement
|
||||
pub mod interface;
|
||||
|
||||
// Utility functions, useful for the implementation
|
||||
pub mod helpers;
|
||||
|
||||
// A simulator with mock implementations of 'interface'
|
||||
pub mod simulator;
|
||||
613
pageserver/compaction/src/simulator.rs
Normal file
613
pageserver/compaction/src/simulator.rs
Normal file
@@ -0,0 +1,613 @@
|
||||
mod draw;
|
||||
|
||||
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use futures::StreamExt;
|
||||
use rand::Rng;
|
||||
use tracing::info;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use std::fmt::Write;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::helpers::{merge_delta_keys, overlaps_with};
|
||||
|
||||
use crate::interface;
|
||||
use crate::interface::CompactionLayer;
|
||||
|
||||
//
|
||||
// Implementation for the CompactionExecutor interface
|
||||
//
|
||||
pub struct MockTimeline {
|
||||
// Parameters for the compaction algorithm
|
||||
pub target_file_size: u64,
|
||||
tiers_per_level: u64,
|
||||
|
||||
num_l0_flushes: u64,
|
||||
last_compact_at_flush: u64,
|
||||
last_flush_lsn: Lsn,
|
||||
|
||||
// In-memory layer
|
||||
records: Vec<MockRecord>,
|
||||
total_len: u64,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
|
||||
// Current keyspace at `end_lsn`. This is updated on every ingested record.
|
||||
keyspace: KeySpace,
|
||||
|
||||
// historic keyspaces
|
||||
old_keyspaces: Vec<(Lsn, KeySpace)>,
|
||||
|
||||
// "on-disk" layers
|
||||
pub live_layers: Vec<MockLayer>,
|
||||
|
||||
num_deleted_layers: u64,
|
||||
|
||||
// Statistics
|
||||
wal_ingested: u64,
|
||||
bytes_written: u64,
|
||||
bytes_deleted: u64,
|
||||
layers_created: u64,
|
||||
layers_deleted: u64,
|
||||
|
||||
// All the events - creation and deletion of files - are collected
|
||||
// in 'history'. It is used to draw the SVG animation at the end.
|
||||
time: u64,
|
||||
history: Vec<draw::LayerTraceEvent>,
|
||||
}
|
||||
|
||||
type KeySpace = interface::CompactionKeySpace<Key>;
|
||||
|
||||
pub struct MockRequestContext {}
|
||||
impl interface::CompactionRequestContext for MockRequestContext {}
|
||||
|
||||
pub type Key = u64;
|
||||
|
||||
impl interface::CompactionKey for Key {
|
||||
const MIN: Self = u64::MIN;
|
||||
const MAX: Self = u64::MAX;
|
||||
|
||||
fn key_range_size(key_range: &Range<Self>) -> u32 {
|
||||
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
|
||||
}
|
||||
|
||||
fn next(&self) -> Self {
|
||||
self + 1
|
||||
}
|
||||
fn skip_some(&self) -> Self {
|
||||
// round up to next xx
|
||||
self + 100
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct MockRecord {
|
||||
lsn: Lsn,
|
||||
key: Key,
|
||||
len: u64,
|
||||
}
|
||||
|
||||
impl interface::CompactionDeltaEntry<'_, Key> for MockRecord {
|
||||
fn key(&self) -> Key {
|
||||
self.key
|
||||
}
|
||||
fn lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
fn size(&self) -> u64 {
|
||||
self.len
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MockDeltaLayer {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
|
||||
pub file_size: u64,
|
||||
|
||||
pub deleted: Mutex<bool>,
|
||||
|
||||
pub records: Vec<MockRecord>,
|
||||
}
|
||||
|
||||
impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
&self.key_range
|
||||
}
|
||||
fn lsn_range(&self) -> &Range<Lsn> {
|
||||
&self.lsn_range
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
format!(
|
||||
"{:016X}-{:016X}__{:08X}-{:08X}",
|
||||
self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0
|
||||
)
|
||||
}
|
||||
|
||||
fn is_delta(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
|
||||
type DeltaEntry<'a> = MockRecord;
|
||||
|
||||
async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result<Vec<MockRecord>> {
|
||||
Ok(self.records.clone())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MockImageLayer {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
|
||||
pub file_size: u64,
|
||||
|
||||
pub deleted: Mutex<bool>,
|
||||
}
|
||||
|
||||
impl interface::CompactionImageLayer<MockTimeline> for Arc<MockImageLayer> {}
|
||||
|
||||
impl interface::CompactionLayer<Key> for Arc<MockImageLayer> {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
&self.key_range
|
||||
}
|
||||
fn lsn_range(&self) -> &Range<Lsn> {
|
||||
&self.lsn_range
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
format!(
|
||||
"{:016X}-{:016X}__{:08X}",
|
||||
self.key_range.start, self.key_range.end, self.lsn_range.start.0,
|
||||
)
|
||||
}
|
||||
|
||||
fn is_delta(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl MockTimeline {
|
||||
pub fn new() -> Self {
|
||||
MockTimeline {
|
||||
target_file_size: 256 * 1024 * 1024,
|
||||
tiers_per_level: 4,
|
||||
|
||||
num_l0_flushes: 0,
|
||||
last_compact_at_flush: 0,
|
||||
last_flush_lsn: Lsn(0),
|
||||
|
||||
records: Vec::new(),
|
||||
total_len: 0,
|
||||
start_lsn: Lsn(1000),
|
||||
end_lsn: Lsn(1000),
|
||||
keyspace: KeySpace::new(),
|
||||
|
||||
old_keyspaces: vec![],
|
||||
|
||||
live_layers: vec![],
|
||||
|
||||
num_deleted_layers: 0,
|
||||
|
||||
wal_ingested: 0,
|
||||
bytes_written: 0,
|
||||
bytes_deleted: 0,
|
||||
layers_created: 0,
|
||||
layers_deleted: 0,
|
||||
|
||||
time: 0,
|
||||
history: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn compact(&mut self) -> anyhow::Result<()> {
|
||||
let ctx = MockRequestContext {};
|
||||
|
||||
crate::compact_tiered::compact_tiered(
|
||||
self,
|
||||
self.last_flush_lsn,
|
||||
self.target_file_size,
|
||||
self.tiers_per_level,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Ingest one record to the timeline
|
||||
pub fn ingest_record(&mut self, key: Key, len: u64) {
|
||||
self.records.push(MockRecord {
|
||||
lsn: self.end_lsn,
|
||||
key,
|
||||
len,
|
||||
});
|
||||
self.total_len += len;
|
||||
self.end_lsn += len;
|
||||
|
||||
if self.total_len > self.target_file_size {
|
||||
self.flush_l0();
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> {
|
||||
if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level {
|
||||
self.compact().await?;
|
||||
self.last_compact_at_flush = self.num_l0_flushes;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush_l0(&mut self) {
|
||||
if self.records.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut records = std::mem::take(&mut self.records);
|
||||
records.sort_by_key(|rec| rec.key);
|
||||
|
||||
let lsn_range = self.start_lsn..self.end_lsn;
|
||||
let new_layer = Arc::new(MockDeltaLayer {
|
||||
key_range: Key::MIN..Key::MAX,
|
||||
lsn_range: lsn_range.clone(),
|
||||
file_size: self.total_len,
|
||||
records,
|
||||
deleted: Mutex::new(false),
|
||||
});
|
||||
info!("flushed L0 layer {}", new_layer.short_id());
|
||||
self.live_layers.push(MockLayer::from(&new_layer));
|
||||
|
||||
// reset L0
|
||||
self.start_lsn = self.end_lsn;
|
||||
self.total_len = 0;
|
||||
self.records = Vec::new();
|
||||
|
||||
self.layers_created += 1;
|
||||
self.bytes_written += new_layer.file_size;
|
||||
|
||||
self.time += 1;
|
||||
self.history.push(LayerTraceEvent {
|
||||
time_rel: self.time,
|
||||
op: LayerTraceOp::Flush,
|
||||
file: LayerTraceFile {
|
||||
filename: new_layer.short_id(),
|
||||
key_range: new_layer.key_range.clone(),
|
||||
lsn_range: new_layer.lsn_range.clone(),
|
||||
},
|
||||
});
|
||||
|
||||
self.num_l0_flushes += 1;
|
||||
self.last_flush_lsn = self.end_lsn;
|
||||
}
|
||||
|
||||
// Ingest `num_records' records to the timeline, with random keys
|
||||
// uniformly distributed in `key_range`
|
||||
pub fn ingest_uniform(
|
||||
&mut self,
|
||||
num_records: u64,
|
||||
len: u64,
|
||||
key_range: &Range<Key>,
|
||||
) -> anyhow::Result<()> {
|
||||
crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]);
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 0..num_records {
|
||||
self.ingest_record(rng.gen_range(key_range.clone()), len);
|
||||
self.wal_ingested += len;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stats(&self) -> anyhow::Result<String> {
|
||||
let mut s = String::new();
|
||||
|
||||
writeln!(s, "STATISTICS:")?;
|
||||
writeln!(
|
||||
s,
|
||||
"WAL ingested: {:>10} MB",
|
||||
self.wal_ingested / (1024 * 1024)
|
||||
)?;
|
||||
writeln!(
|
||||
s,
|
||||
"size created: {:>10} MB",
|
||||
self.bytes_written / (1024 * 1024)
|
||||
)?;
|
||||
writeln!(
|
||||
s,
|
||||
"size deleted: {:>10} MB",
|
||||
self.bytes_deleted / (1024 * 1024)
|
||||
)?;
|
||||
writeln!(s, "files created: {:>10}", self.layers_created)?;
|
||||
writeln!(s, "files deleted: {:>10}", self.layers_deleted)?;
|
||||
writeln!(
|
||||
s,
|
||||
"write amp: {:>10.2}",
|
||||
self.bytes_written as f64 / self.wal_ingested as f64
|
||||
)?;
|
||||
writeln!(
|
||||
s,
|
||||
"storage amp: {:>10.2}",
|
||||
(self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64
|
||||
)?;
|
||||
|
||||
Ok(s)
|
||||
}
|
||||
|
||||
pub fn draw_history<W: std::io::Write>(&self, output: W) -> anyhow::Result<()> {
|
||||
draw::draw_history(&self.history, output)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MockTimeline {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum MockLayer {
|
||||
Delta(Arc<MockDeltaLayer>),
|
||||
Image(Arc<MockImageLayer>),
|
||||
}
|
||||
|
||||
impl interface::CompactionLayer<Key> for MockLayer {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
match self {
|
||||
MockLayer::Delta(this) => this.key_range(),
|
||||
MockLayer::Image(this) => this.key_range(),
|
||||
}
|
||||
}
|
||||
fn lsn_range(&self) -> &Range<Lsn> {
|
||||
match self {
|
||||
MockLayer::Delta(this) => this.lsn_range(),
|
||||
MockLayer::Image(this) => this.lsn_range(),
|
||||
}
|
||||
}
|
||||
fn file_size(&self) -> u64 {
|
||||
match self {
|
||||
MockLayer::Delta(this) => this.file_size(),
|
||||
MockLayer::Image(this) => this.file_size(),
|
||||
}
|
||||
}
|
||||
fn short_id(&self) -> String {
|
||||
match self {
|
||||
MockLayer::Delta(this) => this.short_id(),
|
||||
MockLayer::Image(this) => this.short_id(),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_delta(&self) -> bool {
|
||||
match self {
|
||||
MockLayer::Delta(_) => true,
|
||||
MockLayer::Image(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MockLayer {
|
||||
fn is_deleted(&self) -> bool {
|
||||
let guard = match self {
|
||||
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
|
||||
MockLayer::Image(this) => this.deleted.lock().unwrap(),
|
||||
};
|
||||
*guard
|
||||
}
|
||||
fn mark_deleted(&self) {
|
||||
let mut deleted_guard = match self {
|
||||
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
|
||||
MockLayer::Image(this) => this.deleted.lock().unwrap(),
|
||||
};
|
||||
assert!(!*deleted_guard, "layer already deleted");
|
||||
*deleted_guard = true;
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&Arc<MockDeltaLayer>> for MockLayer {
|
||||
fn from(l: &Arc<MockDeltaLayer>) -> Self {
|
||||
MockLayer::Delta(l.clone())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&Arc<MockImageLayer>> for MockLayer {
|
||||
fn from(l: &Arc<MockImageLayer>) -> Self {
|
||||
MockLayer::Image(l.clone())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl interface::CompactionJobExecutor for MockTimeline {
|
||||
type Key = Key;
|
||||
type Layer = MockLayer;
|
||||
type DeltaLayer = Arc<MockDeltaLayer>;
|
||||
type ImageLayer = Arc<MockImageLayer>;
|
||||
type RequestContext = MockRequestContext;
|
||||
|
||||
async fn get_layers(
|
||||
&mut self,
|
||||
key_range: &Range<Self::Key>,
|
||||
lsn_range: &Range<Lsn>,
|
||||
_ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<Vec<Self::Layer>> {
|
||||
// Clear any deleted layers from our vec
|
||||
self.live_layers.retain(|l| !l.is_deleted());
|
||||
|
||||
let layers: Vec<MockLayer> = self
|
||||
.live_layers
|
||||
.iter()
|
||||
.filter(|l| {
|
||||
overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range)
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
Ok(layers)
|
||||
}
|
||||
|
||||
async fn get_keyspace(
|
||||
&mut self,
|
||||
key_range: &Range<Self::Key>,
|
||||
_lsn: Lsn,
|
||||
_ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<interface::CompactionKeySpace<Key>> {
|
||||
// find it in the levels
|
||||
if self.old_keyspaces.is_empty() {
|
||||
Ok(crate::helpers::intersect_keyspace(
|
||||
&self.keyspace,
|
||||
key_range,
|
||||
))
|
||||
} else {
|
||||
// not implemented
|
||||
|
||||
// The mock implementation only allows requesting the
|
||||
// keyspace at the level's end LSN. That's all that the
|
||||
// current implementation needs.
|
||||
panic!("keyspace not available for requested lsn");
|
||||
}
|
||||
}
|
||||
|
||||
async fn downcast_delta_layer(
|
||||
&self,
|
||||
layer: &MockLayer,
|
||||
) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
|
||||
Ok(match layer {
|
||||
MockLayer::Delta(l) => Some(l.clone()),
|
||||
MockLayer::Image(_) => None,
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_image(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Key>,
|
||||
ctx: &MockRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let keyspace = self.get_keyspace(key_range, lsn, ctx).await?;
|
||||
|
||||
let mut accum_size: u64 = 0;
|
||||
for r in keyspace {
|
||||
accum_size += r.end - r.start;
|
||||
}
|
||||
|
||||
let new_layer = Arc::new(MockImageLayer {
|
||||
key_range: key_range.clone(),
|
||||
lsn_range: lsn..lsn,
|
||||
file_size: accum_size * 8192,
|
||||
deleted: Mutex::new(false),
|
||||
});
|
||||
info!(
|
||||
"created image layer, size {}: {}",
|
||||
new_layer.file_size,
|
||||
new_layer.short_id()
|
||||
);
|
||||
self.live_layers.push(MockLayer::Image(new_layer.clone()));
|
||||
|
||||
// update stats
|
||||
self.bytes_written += new_layer.file_size;
|
||||
self.layers_created += 1;
|
||||
|
||||
self.time += 1;
|
||||
self.history.push(LayerTraceEvent {
|
||||
time_rel: self.time,
|
||||
op: LayerTraceOp::CreateImage,
|
||||
file: LayerTraceFile {
|
||||
filename: new_layer.short_id(),
|
||||
key_range: new_layer.key_range.clone(),
|
||||
lsn_range: new_layer.lsn_range.clone(),
|
||||
},
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_delta(
|
||||
&mut self,
|
||||
lsn_range: &Range<Lsn>,
|
||||
key_range: &Range<Key>,
|
||||
input_layers: &[Arc<MockDeltaLayer>],
|
||||
ctx: &MockRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut key_value_stream =
|
||||
std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
|
||||
let mut records: Vec<MockRecord> = Vec::new();
|
||||
let mut total_len = 2;
|
||||
while let Some(delta_entry) = key_value_stream.next().await {
|
||||
let delta_entry: MockRecord = delta_entry?;
|
||||
if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
|
||||
total_len += delta_entry.len;
|
||||
records.push(delta_entry);
|
||||
}
|
||||
}
|
||||
let total_records = records.len();
|
||||
let new_layer = Arc::new(MockDeltaLayer {
|
||||
key_range: key_range.clone(),
|
||||
lsn_range: lsn_range.clone(),
|
||||
file_size: total_len,
|
||||
records,
|
||||
deleted: Mutex::new(false),
|
||||
});
|
||||
info!(
|
||||
"created delta layer, recs {}, size {}: {}",
|
||||
total_records,
|
||||
total_len,
|
||||
new_layer.short_id()
|
||||
);
|
||||
self.live_layers.push(MockLayer::Delta(new_layer.clone()));
|
||||
|
||||
// update stats
|
||||
self.bytes_written += total_len;
|
||||
self.layers_created += 1;
|
||||
|
||||
self.time += 1;
|
||||
self.history.push(LayerTraceEvent {
|
||||
time_rel: self.time,
|
||||
op: LayerTraceOp::CreateDelta,
|
||||
file: LayerTraceFile {
|
||||
filename: new_layer.short_id(),
|
||||
key_range: new_layer.key_range.clone(),
|
||||
lsn_range: new_layer.lsn_range.clone(),
|
||||
},
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_layer(
|
||||
&mut self,
|
||||
layer: &Self::Layer,
|
||||
_ctx: &MockRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let layer = std::pin::pin!(layer);
|
||||
info!("deleting layer: {}", layer.short_id());
|
||||
self.num_deleted_layers += 1;
|
||||
self.bytes_deleted += layer.file_size();
|
||||
layer.mark_deleted();
|
||||
|
||||
self.time += 1;
|
||||
self.history.push(LayerTraceEvent {
|
||||
time_rel: self.time,
|
||||
op: LayerTraceOp::Delete,
|
||||
file: LayerTraceFile {
|
||||
filename: layer.short_id(),
|
||||
key_range: layer.key_range().clone(),
|
||||
lsn_range: layer.lsn_range().clone(),
|
||||
},
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
411
pageserver/compaction/src/simulator/draw.rs
Normal file
411
pageserver/compaction/src/simulator/draw.rs
Normal file
@@ -0,0 +1,411 @@
|
||||
use super::Key;
|
||||
use anyhow::Result;
|
||||
use std::cmp::Ordering;
|
||||
use std::{
|
||||
collections::{BTreeMap, BTreeSet, HashSet},
|
||||
fmt::Write,
|
||||
ops::Range,
|
||||
};
|
||||
use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
// Map values to their compressed coordinate - the index the value
|
||||
// would have in a sorted and deduplicated list of all values.
|
||||
struct CoordinateMap<T: Ord + Copy> {
|
||||
map: BTreeMap<T, usize>,
|
||||
stretch: f32,
|
||||
}
|
||||
|
||||
impl<T: Ord + Copy> CoordinateMap<T> {
|
||||
fn new(coords: Vec<T>, stretch: f32) -> Self {
|
||||
let set: BTreeSet<T> = coords.into_iter().collect();
|
||||
|
||||
let mut map: BTreeMap<T, usize> = BTreeMap::new();
|
||||
for (i, e) in set.iter().enumerate() {
|
||||
map.insert(*e, i);
|
||||
}
|
||||
|
||||
Self { map, stretch }
|
||||
}
|
||||
|
||||
// This assumes that the map contains an exact point for this.
|
||||
// Use map_inexact for values inbetween
|
||||
fn map(&self, val: T) -> f32 {
|
||||
*self.map.get(&val).unwrap() as f32 * self.stretch
|
||||
}
|
||||
|
||||
// the value is still assumed to be within the min/max bounds
|
||||
// (this is currently unused)
|
||||
fn _map_inexact(&self, val: T) -> f32 {
|
||||
let prev = *self.map.range(..=val).next().unwrap().1;
|
||||
let next = *self.map.range(val..).next().unwrap().1;
|
||||
|
||||
// interpolate
|
||||
(prev as f32 + (next - prev) as f32) * self.stretch
|
||||
}
|
||||
|
||||
fn max(&self) -> f32 {
|
||||
self.map.len() as f32 * self.stretch
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Hash, Eq)]
|
||||
pub enum LayerTraceOp {
|
||||
Flush,
|
||||
CreateDelta,
|
||||
CreateImage,
|
||||
Delete,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for LayerTraceOp {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
let op_str = match self {
|
||||
LayerTraceOp::Flush => "flush",
|
||||
LayerTraceOp::CreateDelta => "create_delta",
|
||||
LayerTraceOp::CreateImage => "create_image",
|
||||
LayerTraceOp::Delete => "delete",
|
||||
};
|
||||
f.write_str(op_str)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Hash, Eq, Clone)]
|
||||
pub struct LayerTraceFile {
|
||||
pub filename: String,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl LayerTraceFile {
|
||||
fn is_image(&self) -> bool {
|
||||
self.lsn_range.end == self.lsn_range.start
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LayerTraceEvent {
|
||||
pub time_rel: u64,
|
||||
pub op: LayerTraceOp,
|
||||
pub file: LayerTraceFile,
|
||||
}
|
||||
|
||||
pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output: W) -> Result<()> {
|
||||
let mut files: Vec<LayerTraceFile> = Vec::new();
|
||||
|
||||
for event in history {
|
||||
files.push(event.file.clone());
|
||||
}
|
||||
let last_time_rel = history.last().unwrap().time_rel;
|
||||
|
||||
// Collect all coordinates
|
||||
let mut keys: Vec<Key> = vec![];
|
||||
let mut lsns: Vec<Lsn> = vec![];
|
||||
for f in files.iter() {
|
||||
keys.push(f.key_range.start);
|
||||
keys.push(f.key_range.end);
|
||||
lsns.push(f.lsn_range.start);
|
||||
lsns.push(f.lsn_range.end);
|
||||
}
|
||||
|
||||
// Analyze
|
||||
let key_map = CoordinateMap::new(keys, 2.0);
|
||||
// Stretch out vertically for better visibility
|
||||
let lsn_map = CoordinateMap::new(lsns, 3.0);
|
||||
|
||||
let mut svg = String::new();
|
||||
|
||||
// Draw
|
||||
writeln!(
|
||||
svg,
|
||||
"{}",
|
||||
BeginSvg {
|
||||
w: key_map.max(),
|
||||
h: lsn_map.max(),
|
||||
}
|
||||
)?;
|
||||
let lsn_max = lsn_map.max();
|
||||
|
||||
// Sort the files by LSN, but so that image layers go after all delta layers
|
||||
// The SVG is painted in the order the elements appear, and we want to draw
|
||||
// image layers on top of the delta layers if they overlap
|
||||
//
|
||||
// (This could also be implemented via z coordinates: image layers get one z
|
||||
// coord, delta layers get another z coord.)
|
||||
let mut files_sorted: Vec<LayerTraceFile> = files.into_iter().collect();
|
||||
files_sorted.sort_by(|a, b| {
|
||||
if a.is_image() && !b.is_image() {
|
||||
Ordering::Greater
|
||||
} else if !a.is_image() && b.is_image() {
|
||||
Ordering::Less
|
||||
} else {
|
||||
a.lsn_range.end.cmp(&b.lsn_range.end)
|
||||
}
|
||||
});
|
||||
|
||||
writeln!(svg, "<!-- layers -->")?;
|
||||
let mut files_seen = HashSet::new();
|
||||
for f in files_sorted {
|
||||
if files_seen.contains(&f) {
|
||||
continue;
|
||||
}
|
||||
let key_start = key_map.map(f.key_range.start);
|
||||
let key_end = key_map.map(f.key_range.end);
|
||||
let key_diff = key_end - key_start;
|
||||
|
||||
if key_start >= key_end {
|
||||
panic!("Invalid key range {}-{}", key_start, key_end);
|
||||
}
|
||||
|
||||
let lsn_start = lsn_map.map(f.lsn_range.start);
|
||||
let lsn_end = lsn_map.map(f.lsn_range.end);
|
||||
|
||||
// Fill in and thicken rectangle if it's an
|
||||
// image layer so that we can see it.
|
||||
let mut style = Style::default();
|
||||
style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
|
||||
style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
|
||||
|
||||
let y_start = lsn_max - lsn_start;
|
||||
let y_end = lsn_max - lsn_end;
|
||||
|
||||
let x_margin = 0.25;
|
||||
let y_margin = 0.5;
|
||||
|
||||
match f.lsn_range.start.cmp(&f.lsn_range.end) {
|
||||
Ordering::Less => {
|
||||
write!(
|
||||
svg,
|
||||
r#" <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
|
||||
f.filename,
|
||||
key_start + x_margin,
|
||||
y_end + y_margin,
|
||||
key_diff - x_margin * 2.0,
|
||||
y_start - y_end - y_margin * 2.0,
|
||||
1.0, // border_radius,
|
||||
style,
|
||||
)?;
|
||||
write!(svg, "<title>{}</title>", f.filename)?;
|
||||
writeln!(svg, "</rect>")?;
|
||||
}
|
||||
Ordering::Equal => {
|
||||
//lsn_diff = 0.3;
|
||||
//lsn_offset = -lsn_diff / 2.0;
|
||||
//margin = 0.05;
|
||||
style.fill = Fill::Color(rgb(0x80, 0, 0x80));
|
||||
style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
|
||||
write!(
|
||||
svg,
|
||||
r#" <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
|
||||
f.filename,
|
||||
key_start + x_margin,
|
||||
y_end,
|
||||
key_end - x_margin,
|
||||
y_end,
|
||||
style,
|
||||
)?;
|
||||
write!(
|
||||
svg,
|
||||
"<title>{}<br>{} - {}</title>",
|
||||
f.filename, lsn_end, y_end
|
||||
)?;
|
||||
writeln!(svg, "</line>")?;
|
||||
}
|
||||
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
|
||||
}
|
||||
files_seen.insert(f);
|
||||
}
|
||||
|
||||
let mut record_style = Style::default();
|
||||
record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
|
||||
record_style.stroke = Stroke::None;
|
||||
|
||||
writeln!(svg, "{}", EndSvg)?;
|
||||
|
||||
let mut layer_events_str = String::new();
|
||||
let mut first = true;
|
||||
for e in history {
|
||||
if !first {
|
||||
writeln!(layer_events_str, ",")?;
|
||||
}
|
||||
write!(
|
||||
layer_events_str,
|
||||
r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
|
||||
e.time_rel, e.file.filename, e.op
|
||||
)?;
|
||||
first = false;
|
||||
}
|
||||
writeln!(layer_events_str)?;
|
||||
|
||||
writeln!(
|
||||
output,
|
||||
r#"<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
/* Keep the slider pinned at top */
|
||||
.topbar {{
|
||||
display: block;
|
||||
overflow: hidden;
|
||||
background-color: lightgrey;
|
||||
position: fixed;
|
||||
top: 0;
|
||||
width: 100%;
|
||||
/* width: 500px; */
|
||||
}}
|
||||
.slidercontainer {{
|
||||
float: left;
|
||||
width: 50%;
|
||||
margin-right: 200px;
|
||||
}}
|
||||
.slider {{
|
||||
float: left;
|
||||
width: 100%;
|
||||
}}
|
||||
.legend {{
|
||||
width: 200px;
|
||||
float: right;
|
||||
}}
|
||||
|
||||
/* Main content */
|
||||
.main {{
|
||||
margin-top: 50px; /* Add a top margin to avoid content overlay */
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body onload="init()">
|
||||
<script type="text/javascript">
|
||||
|
||||
var layer_events = [{layer_events_str}]
|
||||
|
||||
let ticker;
|
||||
|
||||
function init() {{
|
||||
for (let i = 0; i < layer_events.length; i++) {{
|
||||
var layer = document.getElementById("layer_" + layer_events[i].filename);
|
||||
layer.style.visibility = "hidden";
|
||||
}}
|
||||
last_layer_event = -1;
|
||||
moveSlider(last_slider_pos)
|
||||
}}
|
||||
|
||||
function startAnimation() {{
|
||||
ticker = setInterval(animateStep, 100);
|
||||
}}
|
||||
function stopAnimation() {{
|
||||
clearInterval(ticker);
|
||||
}}
|
||||
|
||||
function animateStep() {{
|
||||
if (last_layer_event < layer_events.length - 1) {{
|
||||
var slider = document.getElementById("time-slider");
|
||||
let prevPos = slider.value
|
||||
let nextEvent = last_layer_event + 1
|
||||
while (nextEvent <= layer_events.length - 1) {{
|
||||
if (layer_events[nextEvent].time_rel > prevPos) {{
|
||||
break;
|
||||
}}
|
||||
nextEvent += 1;
|
||||
}}
|
||||
let nextPos = layer_events[nextEvent].time_rel
|
||||
slider.value = nextPos
|
||||
moveSlider(nextPos)
|
||||
}}
|
||||
}}
|
||||
|
||||
function redoLayerEvent(n, dir) {{
|
||||
var layer = document.getElementById("layer_" + layer_events[n].filename);
|
||||
switch (layer_events[n].op) {{
|
||||
case "flush":
|
||||
layer.style.visibility = "visible";
|
||||
break;
|
||||
case "create_delta":
|
||||
layer.style.visibility = "visible";
|
||||
break;
|
||||
case "create_image":
|
||||
layer.style.visibility = "visible";
|
||||
break;
|
||||
case "delete":
|
||||
layer.style.visibility = "hidden";
|
||||
break;
|
||||
}}
|
||||
}}
|
||||
function undoLayerEvent(n) {{
|
||||
var layer = document.getElementById("layer_" + layer_events[n].filename);
|
||||
switch (layer_events[n].op) {{
|
||||
case "flush":
|
||||
layer.style.visibility = "hidden";
|
||||
break;
|
||||
case "create_delta":
|
||||
layer.style.visibility = "hidden";
|
||||
break;
|
||||
case "create_image":
|
||||
layer.style.visibility = "hidden";
|
||||
break;
|
||||
case "delete":
|
||||
layer.style.visibility = "visible";
|
||||
break;
|
||||
}}
|
||||
}}
|
||||
|
||||
var last_slider_pos = 0
|
||||
var last_layer_event = 0
|
||||
|
||||
var moveSlider = function(new_pos) {{
|
||||
if (new_pos > last_slider_pos) {{
|
||||
while (last_layer_event < layer_events.length - 1) {{
|
||||
if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
|
||||
break;
|
||||
}}
|
||||
last_layer_event += 1;
|
||||
redoLayerEvent(last_layer_event)
|
||||
}}
|
||||
}}
|
||||
if (new_pos < last_slider_pos) {{
|
||||
while (last_layer_event >= 0) {{
|
||||
if (layer_events[last_layer_event].time_rel <= new_pos) {{
|
||||
break;
|
||||
}}
|
||||
undoLayerEvent(last_layer_event)
|
||||
last_layer_event -= 1;
|
||||
}}
|
||||
}}
|
||||
last_slider_pos = new_pos;
|
||||
document.getElementById("debug_pos").textContent=new_pos;
|
||||
if (last_layer_event >= 0) {{
|
||||
document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
|
||||
}} else {{
|
||||
document.getElementById("debug_layer_event").textContent="begin";
|
||||
}}
|
||||
}}
|
||||
</script>
|
||||
|
||||
<div class="topbar">
|
||||
<div class="slidercontainer">
|
||||
<label for="time-slider">TIME</label>:
|
||||
<input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
|
||||
|
||||
pos: <span id="debug_pos"></span><br>
|
||||
event: <span id="debug_layer_event"></span><br>
|
||||
gc: <span id="debug_gc_event"></span><br>
|
||||
</div>
|
||||
|
||||
<button onclick="startAnimation()">Play</button>
|
||||
<button onclick="stopAnimation()">Stop</button>
|
||||
|
||||
<svg class="legend">
|
||||
<rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
|
||||
<line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
|
||||
<line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
|
||||
</svg>
|
||||
</div>
|
||||
|
||||
<div class="main">
|
||||
{svg}
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"#
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
35
pageserver/compaction/tests/tests.rs
Normal file
35
pageserver/compaction/tests/tests.rs
Normal file
@@ -0,0 +1,35 @@
|
||||
use pageserver_compaction::interface::CompactionLayer;
|
||||
use pageserver_compaction::simulator::MockTimeline;
|
||||
|
||||
/// Test the extreme case that there are so many updates for a single key that
|
||||
/// even if we produce an extremely narrow delta layer, spanning just that one
|
||||
/// key, we still too many records to fit in the target file size. We need to
|
||||
/// split in the LSN dimension too in that case.
|
||||
///
|
||||
/// TODO: The code to avoid this problem has not been implemented yet! So the
|
||||
/// assertion currently fails, but we need to make it not fail.
|
||||
#[ignore]
|
||||
#[tokio::test]
|
||||
async fn test_many_updates_for_single_key() {
|
||||
let mut executor = MockTimeline::new();
|
||||
executor.target_file_size = 10_000_000; // 10 MB
|
||||
|
||||
// Ingest 100 MB of updates to a single key.
|
||||
for _ in 1..1000 {
|
||||
executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
|
||||
executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
|
||||
executor.compact().await.unwrap();
|
||||
}
|
||||
|
||||
// Check that all the layers are smaller than the target size (with some slop)
|
||||
for l in executor.live_layers.iter() {
|
||||
println!("layer {}: {}", l.short_id(), l.file_size());
|
||||
}
|
||||
for l in executor.live_layers.iter() {
|
||||
assert!(l.file_size() < executor.target_file_size * 2);
|
||||
// sanity check that none of the delta layers are stupidly small either
|
||||
if l.is_delta() {
|
||||
assert!(l.file_size() > executor.target_file_size / 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -12,7 +12,7 @@ use std::collections::BinaryHeap;
|
||||
use std::ops::Range;
|
||||
use std::{fs, str};
|
||||
|
||||
use pageserver::page_cache::PAGE_SZ;
|
||||
use pageserver::page_cache::{self, PAGE_SZ};
|
||||
use pageserver::repository::{Key, KEY_SIZE};
|
||||
use pageserver::tenant::block_io::FileBlockReader;
|
||||
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
|
||||
@@ -100,13 +100,15 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
|
||||
|
||||
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
|
||||
async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
|
||||
let file = FileBlockReader::new(VirtualFile::open(path).await?);
|
||||
let summary_blk = file.read_blk(0, ctx).await?;
|
||||
let file = VirtualFile::open(path).await?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = block_reader.read_blk(0, ctx).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
actual_summary.index_start_blk,
|
||||
actual_summary.index_root_blk,
|
||||
file,
|
||||
block_reader,
|
||||
);
|
||||
// min-heap (reserve space for one more element added before eviction)
|
||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
||||
|
||||
@@ -61,13 +61,15 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
|
||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||
page_cache::init(100);
|
||||
let file = FileBlockReader::new(VirtualFile::open(path).await?);
|
||||
let summary_blk = file.read_blk(0, ctx).await?;
|
||||
let file = VirtualFile::open(path).await?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = block_reader.read_blk(0, ctx).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
actual_summary.index_start_blk,
|
||||
actual_summary.index_root_blk,
|
||||
&file,
|
||||
&block_reader,
|
||||
);
|
||||
// TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
|
||||
let mut all = vec![];
|
||||
@@ -83,7 +85,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let cursor = BlockCursor::new_fileblockreader(&file);
|
||||
let cursor = BlockCursor::new_fileblockreader(&block_reader);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos(), ctx).await?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
|
||||
@@ -8,7 +8,7 @@ use utils::lsn::Lsn;
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, info, instrument};
|
||||
use tracing::{info, instrument};
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
@@ -25,8 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "localhost:64000")]
|
||||
page_service_host_port: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
@@ -230,12 +230,9 @@ async fn client(
|
||||
) {
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
|
||||
&args.page_service_host_port,
|
||||
args.pageserver_jwt.as_deref(),
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some(Work { lsn, gzip }) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
@@ -263,7 +260,7 @@ async fn client(
|
||||
}
|
||||
})
|
||||
.await;
|
||||
debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
|
||||
info!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
|
||||
@@ -3,7 +3,6 @@ use utils::logging;
|
||||
|
||||
/// Re-usable pieces of code that aren't CLI-specific.
|
||||
mod util {
|
||||
pub(crate) mod connstring;
|
||||
pub(crate) mod request_stats;
|
||||
#[macro_use]
|
||||
pub(crate) mod tokio_thread_local_stats;
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
|
||||
let colon_and_jwt = if let Some(jwt) = jwt {
|
||||
format!(":{jwt}") // TODO: urlescape
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
format!("postgres://postgres{colon_and_jwt}@{host_port}")
|
||||
}
|
||||
@@ -14,7 +14,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
|
||||
}
|
||||
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
|
||||
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
|
||||
(Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
|
||||
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
|
||||
format!(
|
||||
"JWT scope '{:?}' is ineligible for Pageserver auth",
|
||||
claims.scope
|
||||
|
||||
@@ -143,6 +143,7 @@ where
|
||||
ar: &'a mut Builder<&'b mut W>,
|
||||
buf: Vec<u8>,
|
||||
current_segment: Option<(SlruKind, u32)>,
|
||||
total_blocks: usize,
|
||||
}
|
||||
|
||||
impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
|
||||
@@ -154,6 +155,7 @@ where
|
||||
ar,
|
||||
buf: Vec::new(),
|
||||
current_segment: None,
|
||||
total_blocks: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -199,7 +201,8 @@ where
|
||||
let header = new_tar_header(&segname, self.buf.len() as u64)?;
|
||||
self.ar.append(&header, self.buf.as_slice()).await?;
|
||||
|
||||
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
self.total_blocks += nblocks;
|
||||
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
|
||||
self.buf.clear();
|
||||
|
||||
@@ -207,11 +210,15 @@ where
|
||||
}
|
||||
|
||||
async fn finish(mut self) -> anyhow::Result<()> {
|
||||
if self.current_segment.is_none() || self.buf.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
let res = if self.current_segment.is_none() || self.buf.is_empty() {
|
||||
Ok(())
|
||||
} else {
|
||||
self.flush().await
|
||||
};
|
||||
|
||||
self.flush().await
|
||||
info!("Collected {} SLRU blocks", self.total_blocks);
|
||||
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -34,12 +34,13 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::timeline::GetVectoredImpl;
|
||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::virtual_file;
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
@@ -87,6 +88,10 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
||||
|
||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -126,6 +131,10 @@ pub mod defaults {
|
||||
|
||||
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
|
||||
|
||||
#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
|
||||
|
||||
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -140,7 +149,6 @@ pub mod defaults {
|
||||
|
||||
#min_resident_size_override = .. # in bytes
|
||||
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
|
||||
#gc_feedback = false
|
||||
|
||||
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
|
||||
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
|
||||
@@ -204,9 +212,9 @@ pub struct PageServerConf {
|
||||
|
||||
pub log_format: LogFormat,
|
||||
|
||||
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
|
||||
/// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
|
||||
/// loading such tenants, vs. other work in the system.
|
||||
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
|
||||
///
|
||||
/// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
|
||||
pub concurrent_tenant_warmup: ConfigurableSemaphore,
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
|
||||
@@ -263,6 +271,10 @@ pub struct PageServerConf {
|
||||
pub virtual_file_io_engine: virtual_file::IoEngineKind,
|
||||
|
||||
pub get_vectored_impl: GetVectoredImpl,
|
||||
|
||||
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
||||
|
||||
pub validate_vectored_get: bool,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -351,6 +363,10 @@ struct PageServerConfigBuilder {
|
||||
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
|
||||
|
||||
get_vectored_impl: BuilderValue<GetVectoredImpl>,
|
||||
|
||||
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
|
||||
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -430,6 +446,10 @@ impl Default for PageServerConfigBuilder {
|
||||
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
|
||||
|
||||
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
|
||||
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||
)),
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -594,6 +614,14 @@ impl PageServerConfigBuilder {
|
||||
self.get_vectored_impl = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
|
||||
self.max_vectored_read_bytes = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_validate_vectored_get(&mut self, value: bool) {
|
||||
self.validate_vectored_get = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_warmup = self
|
||||
.concurrent_tenant_warmup
|
||||
@@ -707,6 +735,12 @@ impl PageServerConfigBuilder {
|
||||
get_vectored_impl: self
|
||||
.get_vectored_impl
|
||||
.ok_or(anyhow!("missing get_vectored_impl"))?,
|
||||
max_vectored_read_bytes: self
|
||||
.max_vectored_read_bytes
|
||||
.ok_or(anyhow!("missing max_vectored_read_bytes"))?,
|
||||
validate_vectored_get: self
|
||||
.validate_vectored_get
|
||||
.ok_or(anyhow!("missing validate_vectored_get"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -826,17 +860,6 @@ impl PageServerConf {
|
||||
.join(connection_id.to_string())
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Utf8PathBuf {
|
||||
self.timeline_path(tenant_shard_id, timeline_id)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Turns storage remote path of a file into its local path.
|
||||
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
|
||||
remote_path.with_base(&self.workdir)
|
||||
@@ -964,6 +987,15 @@ impl PageServerConf {
|
||||
"get_vectored_impl" => {
|
||||
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
|
||||
}
|
||||
"max_vectored_read_bytes" => {
|
||||
let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
|
||||
builder.get_max_vectored_read_bytes(
|
||||
MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
|
||||
}
|
||||
"validate_vectored_get" => {
|
||||
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1039,6 +1071,11 @@ impl PageServerConf {
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||
max_vectored_read_bytes: MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant"),
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1273,6 +1310,11 @@ background_task_maximum_delay = '334 s'
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||
max_vectored_read_bytes: MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1338,6 +1380,11 @@ background_task_maximum_delay = '334 s'
|
||||
ingest_batch_size: 100,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||
max_vectored_read_bytes: MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -1572,17 +1619,50 @@ threshold = "20m"
|
||||
eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
|
||||
})
|
||||
);
|
||||
|
||||
match &conf.default_tenant_conf.eviction_policy {
|
||||
EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"),
|
||||
EvictionPolicy::LayerAccessThreshold(eviction_thresold) => {
|
||||
assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60));
|
||||
assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60));
|
||||
EvictionPolicy::LayerAccessThreshold(eviction_threshold) => {
|
||||
assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60));
|
||||
assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60));
|
||||
}
|
||||
other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_imitation_only_pageserver_config() {
|
||||
let tempdir = tempdir().unwrap();
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap();
|
||||
|
||||
let pageserver_conf_toml = format!(
|
||||
r#"pg_distrib_dir = "{pg_distrib_dir}"
|
||||
metric_collection_endpoint = "http://sample.url"
|
||||
metric_collection_interval = "10min"
|
||||
id = 222
|
||||
|
||||
[tenant_config]
|
||||
evictions_low_residence_duration_metric_threshold = "20m"
|
||||
|
||||
[tenant_config.eviction_policy]
|
||||
kind = "OnlyImitiate"
|
||||
period = "20m"
|
||||
threshold = "20m"
|
||||
"#,
|
||||
);
|
||||
let toml: Document = pageserver_conf_toml.parse().unwrap();
|
||||
let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap();
|
||||
|
||||
match &conf.default_tenant_conf.eviction_policy {
|
||||
EvictionPolicy::OnlyImitiate(t) => {
|
||||
assert_eq!(t.period, Duration::from_secs(20 * 60));
|
||||
assert_eq!(t.threshold, Duration::from_secs(20 * 60));
|
||||
}
|
||||
other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
|
||||
let tempdir_path = tempdir.path();
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ use tracing::*;
|
||||
use utils::id::NodeId;
|
||||
|
||||
mod metrics;
|
||||
use metrics::MetricsKey;
|
||||
use crate::consumption_metrics::metrics::MetricsKey;
|
||||
mod disk_cache;
|
||||
mod upload;
|
||||
|
||||
|
||||
@@ -2,10 +2,10 @@ use std::collections::HashMap;
|
||||
|
||||
use futures::Future;
|
||||
use pageserver_api::{
|
||||
control_api::{
|
||||
shard::TenantShardId,
|
||||
upcall_api::{
|
||||
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
@@ -234,7 +234,7 @@ impl DeletionHeader {
|
||||
let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
|
||||
let header_path = conf.deletion_header_path();
|
||||
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
|
||||
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes)
|
||||
VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes)
|
||||
.await
|
||||
.maybe_fatal_err("save deletion header")?;
|
||||
|
||||
@@ -325,7 +325,8 @@ impl DeletionList {
|
||||
let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
|
||||
|
||||
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes)
|
||||
|
||||
VirtualFile::crashsafe_overwrite(path, temp_path, bytes)
|
||||
.await
|
||||
.maybe_fatal_err("save deletion list")
|
||||
.map_err(Into::into)
|
||||
|
||||
@@ -567,114 +567,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/attach:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
Schedules attach operation to happen in the background for the given tenant.
|
||||
As soon as the caller sends this request, it must assume the pageserver
|
||||
starts writing to the tenant's S3 state unless it receives one of the
|
||||
distinguished errors below that state otherwise.
|
||||
|
||||
If a client receives a not-distinguished response, e.g., a network timeout,
|
||||
it MUST retry the /attach request and poll again for the tenant's
|
||||
attachment status.
|
||||
|
||||
After the client has received a 202, it MUST poll the tenant's
|
||||
attachment status (field `attachment_status`) to reach state `attached`.
|
||||
If the `attachment_status` is missing, the client MUST retry the `/attach`
|
||||
request (goto previous paragraph). This is a robustness measure in case the tenant
|
||||
status endpoint is buggy, but the attach operation is ongoing.
|
||||
|
||||
There is no way to cancel an in-flight request.
|
||||
|
||||
In any case, the client
|
||||
* MUST NOT ASSUME that the /attach request has been lost in the network,
|
||||
* MUST NOT ASSUME that the request has been lost, based on the observation
|
||||
that a subsequent tenant status request returns 404. The request may
|
||||
still be in flight. It must be retried.
|
||||
|
||||
The client SHOULD supply a `TenantConfig` for the tenant in the request body.
|
||||
Settings specified in the config override the pageserver's defaults.
|
||||
It is guaranteed that the config settings are applied before the pageserver
|
||||
starts operating on the tenant. E.g., if the config specifies a specific
|
||||
PITR interval for a tenant, then that setting will be in effect before the
|
||||
pageserver starts the garbage collection loop. This enables a client to
|
||||
guarantee a specific PITR setting across detach/attach cycles.
|
||||
The pageserver will reject the request if it cannot parse the config, or
|
||||
if there are any unknown fields in it.
|
||||
|
||||
If the client does not supply a config, the pageserver will use its defaults.
|
||||
This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282
|
||||
requestBody:
|
||||
required: false
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantAttachRequest"
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant attaching scheduled
|
||||
"400":
|
||||
description: Bad Request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Timeline not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"409":
|
||||
description: |
|
||||
The tenant is already known to Pageserver in some way,
|
||||
and hence this `/attach` call has been rejected.
|
||||
|
||||
Some examples of how this can happen:
|
||||
- tenant was created on this pageserver
|
||||
- tenant attachment was started by an earlier call to `/attach`.
|
||||
|
||||
Callers should poll the tenant status's `attachment_status` field,
|
||||
like for status 202. See the longer description for `POST /attach`
|
||||
for details.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/location_config:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -687,6 +579,12 @@ paths:
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
- name: lazy
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: boolean
|
||||
description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default.
|
||||
put:
|
||||
description: |
|
||||
Configures a _tenant location_, that is how a particular pageserver handles
|
||||
@@ -770,66 +668,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/detach:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: detach_ignored
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: boolean
|
||||
description: |
|
||||
When true, allow to detach a tenant which state is ignored.
|
||||
post:
|
||||
description: |
|
||||
Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
|
||||
Files on the remote storage are not affected.
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant detached
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Tenant not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/ignore:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -1379,6 +1217,25 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/utilization:
|
||||
get:
|
||||
description: |
|
||||
Returns the pageservers current utilization and fitness score for new tenants.
|
||||
|
||||
responses:
|
||||
"200":
|
||||
description: Pageserver utilization and fitness score
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PageserverUtilization"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
@@ -1445,16 +1302,6 @@ components:
|
||||
generation:
|
||||
type: integer
|
||||
description: Attachment generation number.
|
||||
TenantAttachRequest:
|
||||
type: object
|
||||
required:
|
||||
- config
|
||||
properties:
|
||||
config:
|
||||
$ref: '#/components/schemas/TenantConfig'
|
||||
generation:
|
||||
type: integer
|
||||
description: Attachment generation number.
|
||||
TenantConfigRequest:
|
||||
allOf:
|
||||
- $ref: '#/components/schemas/TenantConfig'
|
||||
@@ -1691,6 +1538,33 @@ components:
|
||||
type: string
|
||||
enum: [past, present, future, nodata]
|
||||
|
||||
PageserverUtilization:
|
||||
type: object
|
||||
required:
|
||||
- disk_usage_bytes
|
||||
- free_space_bytes
|
||||
- utilization_score
|
||||
properties:
|
||||
disk_usage_bytes:
|
||||
type: integer
|
||||
format: int64
|
||||
minimum: 0
|
||||
description: The amount of disk space currently utilized by layer files.
|
||||
free_space_bytes:
|
||||
type: integer
|
||||
format: int64
|
||||
minimum: 0
|
||||
description: The amount of usable disk space left.
|
||||
utilization_score:
|
||||
type: integer
|
||||
format: int64
|
||||
minimum: 0
|
||||
maximum: 9223372036854775807
|
||||
default: 9223372036854775807
|
||||
description: |
|
||||
Lower is better score for how good this pageserver would be for the next tenant.
|
||||
The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.
|
||||
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -100,6 +100,7 @@ pub struct State {
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
latest_utilization: tokio::sync::Mutex<Option<(std::time::Instant, bytes::Bytes)>>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -128,6 +129,7 @@ impl State {
|
||||
disk_usage_eviction_state,
|
||||
deletion_queue_client,
|
||||
secondary_controller,
|
||||
latest_utilization: Default::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -659,9 +661,14 @@ async fn timeline_detail_handler(
|
||||
|
||||
// Logical size calculation needs downloading.
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline_info = async {
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
@@ -694,6 +701,7 @@ async fn get_lsn_by_timestamp_handler(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
// Requires SLRU contents, which are only stored on shard zero
|
||||
@@ -710,7 +718,10 @@ async fn get_lsn_by_timestamp_handler(
|
||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let result = timeline
|
||||
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
|
||||
.await?;
|
||||
@@ -741,6 +752,7 @@ async fn get_timestamp_of_lsn_handler(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
// Requires SLRU contents, which are only stored on shard zero
|
||||
@@ -757,7 +769,9 @@ async fn get_timestamp_of_lsn_handler(
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
|
||||
|
||||
match result {
|
||||
@@ -802,13 +816,7 @@ async fn tenant_attach_handler(
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.upsert_location(
|
||||
tenant_shard_id,
|
||||
location_conf,
|
||||
None,
|
||||
SpawnMode::Normal,
|
||||
&ctx,
|
||||
)
|
||||
.upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
|
||||
.await?;
|
||||
|
||||
let Some(tenant) = tenant else {
|
||||
@@ -1157,10 +1165,13 @@ async fn layer_map_info_handler(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let reset: LayerAccessStatsReset =
|
||||
parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
|
||||
let state = get_state(&request);
|
||||
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let layer_map_info = timeline.layer_map_info(reset).await;
|
||||
|
||||
json_response(StatusCode::OK, layer_map_info)
|
||||
@@ -1174,8 +1185,11 @@ async fn layer_download_handler(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let downloaded = timeline
|
||||
.download_layer(layer_file_name)
|
||||
.await
|
||||
@@ -1199,8 +1213,11 @@ async fn evict_timeline_layer_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let evicted = timeline
|
||||
.evict_layer(layer_file_name)
|
||||
.await
|
||||
@@ -1395,6 +1412,7 @@ async fn put_tenant_location_config_handler(
|
||||
|
||||
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
|
||||
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
|
||||
let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
@@ -1425,15 +1443,17 @@ async fn put_tenant_location_config_handler(
|
||||
let location_conf =
|
||||
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
// lazy==true queues up for activation or jumps the queue like normal when a compute connects,
|
||||
// similar to at startup ordering.
|
||||
let spawn_mode = if lazy {
|
||||
tenant::SpawnMode::Lazy
|
||||
} else {
|
||||
tenant::SpawnMode::Eager
|
||||
};
|
||||
|
||||
let attached = state
|
||||
.tenant_manager
|
||||
.upsert_location(
|
||||
tenant_shard_id,
|
||||
location_conf,
|
||||
flush,
|
||||
tenant::SpawnMode::Normal,
|
||||
&ctx,
|
||||
)
|
||||
.upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
|
||||
.await?
|
||||
.is_some();
|
||||
|
||||
@@ -1610,13 +1630,19 @@ async fn timeline_compact_handler(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let mut flags = EnumSet::empty();
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
|
||||
flags |= CompactFlags::ForceRepartition;
|
||||
}
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
timeline
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.await
|
||||
@@ -1636,13 +1662,19 @@ async fn timeline_checkpoint_handler(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let mut flags = EnumSet::empty();
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
|
||||
flags |= CompactFlags::ForceRepartition;
|
||||
}
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
timeline
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
@@ -1667,7 +1699,11 @@ async fn timeline_download_remote_layers_handler_post(
|
||||
let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
match timeline.spawn_download_all_remote_layers(body).await {
|
||||
Ok(st) => json_response(StatusCode::ACCEPTED, st),
|
||||
Err(st) => json_response(StatusCode::CONFLICT, st),
|
||||
@@ -1681,8 +1717,11 @@ async fn timeline_download_remote_layers_handler_get(
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let info = timeline
|
||||
.get_download_all_remote_layers_task_info()
|
||||
.context("task never started since last pageserver process start")
|
||||
@@ -1731,6 +1770,7 @@ async fn getpage_at_lsn_handler(
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
struct Key(crate::repository::Key);
|
||||
|
||||
@@ -1749,7 +1789,7 @@ async fn getpage_at_lsn_handler(
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
|
||||
let page = timeline.get(key.0, lsn, &ctx).await?;
|
||||
|
||||
@@ -1772,12 +1812,13 @@ async fn timeline_collect_keyspace(
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
||||
let keys = timeline
|
||||
.collect_keyspace(at_lsn, &ctx)
|
||||
@@ -1793,10 +1834,14 @@ async fn timeline_collect_keyspace(
|
||||
}
|
||||
|
||||
async fn active_timeline_of_active_tenant(
|
||||
tenant_manager: &TenantManager,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<Timeline>, ApiError> {
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))
|
||||
@@ -1963,6 +2008,54 @@ async fn put_io_engine_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Polled by control plane.
|
||||
///
|
||||
/// See [`crate::utilization`].
|
||||
async fn get_utilization(
|
||||
r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
// this probably could be completely public, but lets make that change later.
|
||||
check_permission(&r, None)?;
|
||||
|
||||
let state = get_state(&r);
|
||||
let mut g = state.latest_utilization.lock().await;
|
||||
|
||||
let regenerate_every = Duration::from_secs(1);
|
||||
let still_valid = g
|
||||
.as_ref()
|
||||
.is_some_and(|(captured_at, _)| captured_at.elapsed() < regenerate_every);
|
||||
|
||||
// avoid needless statvfs calls even though those should be non-blocking fast.
|
||||
// regenerate at most 1Hz to allow polling at any rate.
|
||||
if !still_valid {
|
||||
let path = state.conf.tenants_path();
|
||||
let doc = crate::utilization::regenerate(path.as_std_path())
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut buf = Vec::new();
|
||||
serde_json::to_writer(&mut buf, &doc)
|
||||
.context("serialize")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let body = bytes::Bytes::from(buf);
|
||||
|
||||
*g = Some((std::time::Instant::now(), body));
|
||||
}
|
||||
|
||||
// hyper 0.14 doesn't yet have Response::clone so this is a bit of extra legwork
|
||||
let cached = g.as_ref().expect("just set").1.clone();
|
||||
|
||||
Response::builder()
|
||||
.header(hyper::http::header::CONTENT_TYPE, "application/json")
|
||||
// thought of using http date header, but that is second precision which does not give any
|
||||
// debugging aid
|
||||
.status(StatusCode::OK)
|
||||
.body(hyper::Body::from(cached))
|
||||
.context("build response")
|
||||
.map_err(ApiError::InternalServerError)
|
||||
}
|
||||
|
||||
/// Common functionality of all the HTTP API handlers.
|
||||
///
|
||||
/// - Adds a tracing span to each request (by `request_span`)
|
||||
@@ -2224,5 +2317,6 @@ pub fn make_router(
|
||||
|r| api_handler(r, timeline_collect_keyspace),
|
||||
)
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.get("/v1/utilization", |r| api_handler(r, get_utilization))
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@ pub(crate) mod statvfs;
|
||||
pub mod task_mgr;
|
||||
pub mod tenant;
|
||||
pub mod trace;
|
||||
pub mod utilization;
|
||||
pub mod virtual_file;
|
||||
pub mod walingest;
|
||||
pub mod walrecord;
|
||||
@@ -168,15 +169,6 @@ pub fn is_delete_mark(path: &Utf8Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
|
||||
if let Some(e) = e.io_error() {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
|
||||
/// blocking.
|
||||
///
|
||||
|
||||
@@ -642,26 +642,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
|
||||
.expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
|
||||
});
|
||||
|
||||
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
|
||||
// or in testing they estimate how much we would upload if we did.
|
||||
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_created_persistent_files_total",
|
||||
"Number of files created that are meant to be uploaded to cloud storage",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
"Total bytes written that are meant to be uploaded to cloud storage",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_eviction_iteration_duration_seconds_global",
|
||||
@@ -1802,8 +1782,6 @@ pub(crate) struct TimelineMetrics {
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
|
||||
pub num_persistent_files_created: IntCounter,
|
||||
pub persistent_bytes_written: IntCounter,
|
||||
pub evictions: IntCounter,
|
||||
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
|
||||
}
|
||||
@@ -1885,12 +1863,6 @@ impl TimelineMetrics {
|
||||
};
|
||||
let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
|
||||
Lazy::new(Box::new(directory_entries_count_gauge_closure));
|
||||
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions = EVICTIONS
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -1912,8 +1884,6 @@ impl TimelineMetrics {
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
directory_entries_count_gauge,
|
||||
num_persistent_files_created,
|
||||
persistent_bytes_written,
|
||||
evictions,
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
@@ -1923,8 +1893,6 @@ impl TimelineMetrics {
|
||||
|
||||
pub(crate) fn record_new_file_metrics(&self, sz: u64) {
|
||||
self.resident_physical_size_add(sz);
|
||||
self.num_persistent_files_created.inc_by(1);
|
||||
self.persistent_bytes_written.inc_by(sz);
|
||||
}
|
||||
|
||||
pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
|
||||
@@ -1957,9 +1925,6 @@ impl Drop for TimelineMetrics {
|
||||
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
|
||||
let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
}
|
||||
let _ =
|
||||
NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
|
||||
@@ -15,7 +15,6 @@ use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
||||
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||
@@ -36,6 +35,8 @@ use tracing::{debug, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum LsnForTimestamp {
|
||||
/// Found commits both before and after the given timestamp
|
||||
@@ -157,7 +158,6 @@ impl Timeline {
|
||||
pending_updates: HashMap::new(),
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
pending_aux_files: None,
|
||||
pending_directory_entries: Vec::new(),
|
||||
lsn,
|
||||
}
|
||||
@@ -873,11 +873,6 @@ pub struct DatadirModification<'a> {
|
||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||
pending_nblocks: i64,
|
||||
|
||||
// If we already wrote any aux file changes in this modification, stash the latest dir. If set,
|
||||
// [`Self::put_file`] may assume that it is safe to emit a delta rather than checking
|
||||
// if AUX_FILES_KEY is already set.
|
||||
pending_aux_files: Option<AuxFilesDirectory>,
|
||||
|
||||
/// For special "directory" keys that store key-value maps, track the size of the map
|
||||
/// if it was updated in this modification.
|
||||
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||
@@ -1401,19 +1396,28 @@ impl<'a> DatadirModification<'a> {
|
||||
Some(Bytes::copy_from_slice(content))
|
||||
};
|
||||
|
||||
let dir = if let Some(mut dir) = self.pending_aux_files.take() {
|
||||
let n_files;
|
||||
let mut aux_files = self.tline.aux_files.lock().await;
|
||||
if let Some(mut dir) = aux_files.dir.take() {
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value
|
||||
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile {
|
||||
file_path: file_path.clone(),
|
||||
content: content.clone(),
|
||||
}),
|
||||
);
|
||||
|
||||
dir.upsert(file_path, content);
|
||||
dir
|
||||
dir.upsert(file_path.clone(), content.clone());
|
||||
n_files = dir.files.len();
|
||||
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
aux_files.n_deltas = 0;
|
||||
} else {
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
|
||||
);
|
||||
aux_files.n_deltas += 1;
|
||||
}
|
||||
aux_files.dir = Some(dir);
|
||||
} else {
|
||||
// Check if the AUX_FILES_KEY is initialized
|
||||
match self.get(AUX_FILES_KEY, ctx).await {
|
||||
@@ -1428,7 +1432,8 @@ impl<'a> DatadirModification<'a> {
|
||||
}),
|
||||
);
|
||||
dir.upsert(file_path, content);
|
||||
dir
|
||||
n_files = dir.files.len();
|
||||
aux_files.dir = Some(dir);
|
||||
}
|
||||
Err(
|
||||
e @ (PageReconstructError::AncestorStopping(_)
|
||||
@@ -1455,14 +1460,14 @@ impl<'a> DatadirModification<'a> {
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
dir
|
||||
n_files = 1;
|
||||
aux_files.dir = Some(dir);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::AuxFiles, dir.files.len()));
|
||||
self.pending_aux_files = Some(dir);
|
||||
.push((DirectoryKind::AuxFiles, n_files));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1493,7 +1498,7 @@ impl<'a> DatadirModification<'a> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut writer = self.tline.writer().await;
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
@@ -1532,23 +1537,13 @@ impl<'a> DatadirModification<'a> {
|
||||
/// All the modifications in this atomic update are stamped by the specified LSN.
|
||||
///
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let mut writer = self.tline.writer().await;
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
if !self.pending_updates.is_empty() {
|
||||
let prev_pending_updates = std::mem::take(&mut self.pending_updates);
|
||||
|
||||
// The put_batch call below expects expects the inputs to be sorted by Lsn,
|
||||
// so we do that first.
|
||||
let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates
|
||||
.into_iter()
|
||||
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
|
||||
.kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
|
||||
.collect();
|
||||
|
||||
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
||||
writer.put_batch(&self.pending_updates, ctx).await?;
|
||||
self.pending_updates.clear();
|
||||
}
|
||||
|
||||
|
||||
@@ -29,7 +29,6 @@ use remote_storage::TimeoutOrCancel;
|
||||
use std::fmt;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::runtime::Handle;
|
||||
use tokio::sync::watch;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -146,6 +145,7 @@ macro_rules! pausable_failpoint {
|
||||
|
||||
pub mod blob_io;
|
||||
pub mod block_io;
|
||||
pub mod vectored_blob_io;
|
||||
|
||||
pub mod disk_btree;
|
||||
pub(crate) mod ephemeral_file;
|
||||
@@ -172,9 +172,6 @@ pub(crate) mod throttle;
|
||||
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
|
||||
|
||||
// re-export for use in remote_timeline_client.rs
|
||||
pub use crate::tenant::metadata::save_metadata;
|
||||
|
||||
// re-export for use in walreceiver
|
||||
pub use crate::tenant::timeline::WalReceiverInfo;
|
||||
|
||||
@@ -230,7 +227,11 @@ pub(crate) struct TenantPreload {
|
||||
/// When we spawn a tenant, there is a special mode for tenant creation that
|
||||
/// avoids trying to read anything from remote storage.
|
||||
pub(crate) enum SpawnMode {
|
||||
Normal,
|
||||
/// Activate as soon as possible
|
||||
Eager,
|
||||
/// Lazy activation in the background, with the option to skip the queue if the need comes up
|
||||
Lazy,
|
||||
/// Tenant has been created during the lifetime of this process
|
||||
Create,
|
||||
}
|
||||
|
||||
@@ -703,41 +704,37 @@ impl Tenant {
|
||||
.and_then(|x| x.initial_tenant_load_remote.take());
|
||||
|
||||
enum AttachType<'a> {
|
||||
// During pageserver startup, we are attaching this tenant lazily in the background
|
||||
Warmup(tokio::sync::SemaphorePermit<'a>),
|
||||
// During pageserver startup, we are attaching this tenant as soon as we can,
|
||||
// because a client tried to access it.
|
||||
/// We are attaching this tenant lazily in the background.
|
||||
Warmup {
|
||||
_permit: tokio::sync::SemaphorePermit<'a>,
|
||||
during_startup: bool
|
||||
},
|
||||
/// We are attaching this tenant as soon as we can, because for example an
|
||||
/// endpoint tried to access it.
|
||||
OnDemand,
|
||||
// During normal operations after startup, we are attaching a tenant.
|
||||
/// During normal operations after startup, we are attaching a tenant, and
|
||||
/// eager attach was requested.
|
||||
Normal,
|
||||
}
|
||||
|
||||
// Before doing any I/O, wait for either or:
|
||||
// - A client to attempt to access to this tenant (on-demand loading)
|
||||
// - A permit to become available in the warmup semaphore (background warmup)
|
||||
//
|
||||
// Some-ness of init_order is how we know if we're attaching during startup or later
|
||||
// in process lifetime.
|
||||
let attach_type = if init_order.is_some() {
|
||||
let attach_type = if matches!(mode, SpawnMode::Lazy) {
|
||||
// Before doing any I/O, wait for at least one of:
|
||||
// - A client attempting to access to this tenant (on-demand loading)
|
||||
// - A permit becoming available in the warmup semaphore (background warmup)
|
||||
|
||||
tokio::select!(
|
||||
_ = tenant_clone.activate_now_sem.acquire() => {
|
||||
permit = tenant_clone.activate_now_sem.acquire() => {
|
||||
let _ = permit.expect("activate_now_sem is never closed");
|
||||
tracing::info!("Activating tenant (on-demand)");
|
||||
AttachType::OnDemand
|
||||
},
|
||||
permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
|
||||
match permit_result {
|
||||
Ok(p) => {
|
||||
tracing::info!("Activating tenant (warmup)");
|
||||
AttachType::Warmup(p)
|
||||
}
|
||||
Err(_) => {
|
||||
// This is unexpected: the warmup semaphore should stay alive
|
||||
// for the lifetime of init_order. Log a warning and proceed.
|
||||
tracing::warn!("warmup_limit semaphore unexpectedly closed");
|
||||
AttachType::Normal
|
||||
}
|
||||
permit = conf.concurrent_tenant_warmup.inner().acquire() => {
|
||||
let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
|
||||
tracing::info!("Activating tenant (warmup)");
|
||||
AttachType::Warmup {
|
||||
_permit,
|
||||
during_startup: init_order.is_some()
|
||||
}
|
||||
|
||||
}
|
||||
_ = tenant_clone.cancel.cancelled() => {
|
||||
// This is safe, but should be pretty rare: it is interesting if a tenant
|
||||
@@ -752,6 +749,8 @@ impl Tenant {
|
||||
},
|
||||
)
|
||||
} else {
|
||||
// SpawnMode::{Create,Eager} always cause jumping ahead of the
|
||||
// concurrent_tenant_warmup queue
|
||||
AttachType::Normal
|
||||
};
|
||||
|
||||
@@ -759,7 +758,7 @@ impl Tenant {
|
||||
(SpawnMode::Create, _) => {
|
||||
None
|
||||
},
|
||||
(SpawnMode::Normal, Some(remote_storage)) => {
|
||||
(SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
|
||||
let _preload_timer = TENANT.preload.start_timer();
|
||||
let res = tenant_clone
|
||||
.preload(remote_storage, task_mgr::shutdown_token())
|
||||
@@ -772,7 +771,7 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
}
|
||||
(SpawnMode::Normal, None) => {
|
||||
(_, None) => {
|
||||
let _preload_timer = TENANT.preload.start_timer();
|
||||
None
|
||||
}
|
||||
@@ -831,7 +830,7 @@ impl Tenant {
|
||||
let attached = {
|
||||
let _attach_timer = match mode {
|
||||
SpawnMode::Create => None,
|
||||
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
|
||||
SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
|
||||
};
|
||||
tenant_clone.attach(preload, mode, &ctx).await
|
||||
};
|
||||
@@ -853,7 +852,7 @@ impl Tenant {
|
||||
// It also prevents the warmup proccess competing with the concurrency limit on
|
||||
// logical size calculations: if logical size calculation semaphore is saturated,
|
||||
// then warmup will wait for that before proceeding to the next tenant.
|
||||
if let AttachType::Warmup(_permit) = attach_type {
|
||||
if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
|
||||
let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
|
||||
tracing::info!("Waiting for initial logical sizes while warming up...");
|
||||
while futs.next().await.is_some() {}
|
||||
@@ -926,7 +925,7 @@ impl Tenant {
|
||||
deleting: false,
|
||||
timelines: HashMap::new(),
|
||||
},
|
||||
(None, SpawnMode::Normal) => {
|
||||
(None, _) => {
|
||||
anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
|
||||
}
|
||||
};
|
||||
@@ -1151,17 +1150,6 @@ impl Tenant {
|
||||
None
|
||||
};
|
||||
|
||||
// timeline loading after attach expects to find metadata file for each metadata
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&self.tenant_shard_id,
|
||||
&timeline_id,
|
||||
&remote_metadata,
|
||||
)
|
||||
.await
|
||||
.context("save_metadata")
|
||||
.map_err(LoadLocalTimelineError::Load)?;
|
||||
|
||||
self.timeline_init_and_sync(
|
||||
timeline_id,
|
||||
resources,
|
||||
@@ -2588,19 +2576,24 @@ impl Tenant {
|
||||
legacy_config_path: &Utf8Path,
|
||||
location_conf: &LocationConf,
|
||||
) -> anyhow::Result<()> {
|
||||
// Forward compat: write out an old-style configuration that old versions can read, in case we roll back
|
||||
Self::persist_tenant_config_legacy(
|
||||
tenant_shard_id,
|
||||
legacy_config_path,
|
||||
&location_conf.tenant_conf,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let LocationMode::Attached(attach_conf) = &location_conf.mode {
|
||||
// Once we use LocationMode, generations are mandatory. If we aren't using generations,
|
||||
// then drop out after writing legacy-style config.
|
||||
// The modern-style LocationConf config file requires a generation to be set. In case someone
|
||||
// is running a pageserver without the infrastructure to set generations, write out the legacy-style
|
||||
// config file that only contains TenantConf.
|
||||
//
|
||||
// This will eventually be removed in https://github.com/neondatabase/neon/issues/5388
|
||||
|
||||
if attach_conf.generation.is_none() {
|
||||
tracing::debug!("Running without generations, not writing new-style LocationConf");
|
||||
tracing::info!(
|
||||
"Running without generations, writing legacy-style tenant config file"
|
||||
);
|
||||
Self::persist_tenant_config_legacy(
|
||||
tenant_shard_id,
|
||||
legacy_config_path,
|
||||
&location_conf.tenant_conf,
|
||||
)
|
||||
.await?;
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
@@ -2623,17 +2616,10 @@ impl Tenant {
|
||||
|
||||
let tenant_shard_id = *tenant_shard_id;
|
||||
let config_path = config_path.to_owned();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
Handle::current().block_on(async move {
|
||||
let conf_content = conf_content.into_bytes();
|
||||
VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("write tenant {tenant_shard_id} config to {config_path}")
|
||||
})
|
||||
})
|
||||
})
|
||||
.await??;
|
||||
let conf_content = conf_content.into_bytes();
|
||||
VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content)
|
||||
.await
|
||||
.with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -2660,17 +2646,12 @@ impl Tenant {
|
||||
|
||||
let tenant_shard_id = *tenant_shard_id;
|
||||
let target_config_path = target_config_path.to_owned();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
Handle::current().block_on(async move {
|
||||
let conf_content = conf_content.into_bytes();
|
||||
VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("write tenant {tenant_shard_id} config to {target_config_path}")
|
||||
})
|
||||
})
|
||||
})
|
||||
.await??;
|
||||
let conf_content = conf_content.into_bytes();
|
||||
VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("write tenant {tenant_shard_id} config to {target_config_path}")
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3293,10 +3274,7 @@ impl Tenant {
|
||||
|
||||
timeline_struct.init_empty_layer_map(start_lsn);
|
||||
|
||||
if let Err(e) = self
|
||||
.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
|
||||
.await
|
||||
{
|
||||
if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
|
||||
error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
|
||||
cleanup_timeline_directory(uninit_mark);
|
||||
return Err(e);
|
||||
@@ -3313,26 +3291,13 @@ impl Tenant {
|
||||
))
|
||||
}
|
||||
|
||||
async fn create_timeline_files(
|
||||
&self,
|
||||
timeline_path: &Utf8Path,
|
||||
new_timeline_id: &TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
|
||||
|
||||
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
|
||||
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
|
||||
});
|
||||
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&self.tenant_shard_id,
|
||||
new_timeline_id,
|
||||
new_metadata,
|
||||
)
|
||||
.await
|
||||
.context("Failed to create timeline metadata")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3499,9 +3464,8 @@ impl Tenant {
|
||||
// Run each timeline's flush in a task holding the timeline's gate: this
|
||||
// means that if this function's future is cancelled, the Timeline shutdown
|
||||
// will still wait for any I/O in here to complete.
|
||||
let gate = match timeline.gate.enter() {
|
||||
Ok(g) => g,
|
||||
Err(_) => continue,
|
||||
let Ok(gate) = timeline.gate.enter() else {
|
||||
continue;
|
||||
};
|
||||
let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await });
|
||||
results.push(jh);
|
||||
@@ -3671,6 +3635,7 @@ pub(crate) mod harness {
|
||||
compaction_target_size: Some(tenant_conf.compaction_target_size),
|
||||
compaction_period: Some(tenant_conf.compaction_period),
|
||||
compaction_threshold: Some(tenant_conf.compaction_threshold),
|
||||
compaction_algorithm: Some(tenant_conf.compaction_algorithm),
|
||||
gc_horizon: Some(tenant_conf.gc_horizon),
|
||||
gc_period: Some(tenant_conf.gc_period),
|
||||
image_creation_threshold: Some(tenant_conf.image_creation_threshold),
|
||||
@@ -3684,7 +3649,6 @@ pub(crate) mod harness {
|
||||
evictions_low_residence_duration_metric_threshold: Some(
|
||||
tenant_conf.evictions_low_residence_duration_metric_threshold,
|
||||
),
|
||||
gc_feedback: Some(tenant_conf.gc_feedback),
|
||||
heatmap_period: Some(tenant_conf.heatmap_period),
|
||||
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
|
||||
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
|
||||
@@ -3807,7 +3771,7 @@ pub(crate) mod harness {
|
||||
let preload = tenant
|
||||
.preload(&self.remote_storage, CancellationToken::new())
|
||||
.await?;
|
||||
tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
|
||||
tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;
|
||||
|
||||
tenant.state.send_replace(TenantState::Active);
|
||||
for timeline in tenant.timelines.lock().unwrap().values() {
|
||||
@@ -3891,7 +3855,7 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -3903,7 +3867,7 @@ mod tests {
|
||||
writer.finish_write(Lsn(0x10));
|
||||
drop(writer);
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -3969,7 +3933,7 @@ mod tests {
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
|
||||
@@ -4003,7 +3967,7 @@ mod tests {
|
||||
let newtline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
let mut new_writer = newtline.writer().await;
|
||||
let new_writer = newtline.writer().await;
|
||||
new_writer
|
||||
.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
|
||||
.await?;
|
||||
@@ -4035,7 +3999,7 @@ mod tests {
|
||||
) -> anyhow::Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
{
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
// Create a relation on the timeline
|
||||
writer
|
||||
.put(
|
||||
@@ -4060,7 +4024,7 @@ mod tests {
|
||||
}
|
||||
tline.freeze_and_flush().await?;
|
||||
{
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -4423,7 +4387,7 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -4440,7 +4404,7 @@ mod tests {
|
||||
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
|
||||
.await?;
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -4457,7 +4421,7 @@ mod tests {
|
||||
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
|
||||
.await?;
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -4474,7 +4438,7 @@ mod tests {
|
||||
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
|
||||
.await?;
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -4531,7 +4495,7 @@ mod tests {
|
||||
for _ in 0..repeat {
|
||||
for _ in 0..key_count {
|
||||
test_key.field6 = blknum;
|
||||
let mut writer = timeline.writer().await;
|
||||
let writer = timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
@@ -4702,7 +4666,7 @@ mod tests {
|
||||
for blknum in 0..NUM_KEYS {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
test_key.field6 = blknum as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
@@ -4723,7 +4687,7 @@ mod tests {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let blknum = thread_rng().gen_range(0..NUM_KEYS);
|
||||
test_key.field6 = blknum as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
@@ -4791,7 +4755,7 @@ mod tests {
|
||||
for blknum in 0..NUM_KEYS {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
test_key.field6 = blknum as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
@@ -4820,7 +4784,7 @@ mod tests {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let blknum = thread_rng().gen_range(0..NUM_KEYS);
|
||||
test_key.field6 = blknum as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
@@ -4897,7 +4861,7 @@ mod tests {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let blknum = thread_rng().gen_range(0..NUM_KEYS);
|
||||
test_key.field6 = blknum as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
use super::ephemeral_file::EphemeralFile;
|
||||
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
|
||||
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use bytes::Bytes;
|
||||
use std::ops::Deref;
|
||||
@@ -78,7 +78,7 @@ impl<'a> Deref for BlockLease<'a> {
|
||||
///
|
||||
/// Unlike traits, we also support the read function to be async though.
|
||||
pub(crate) enum BlockReaderRef<'a> {
|
||||
FileBlockReader(&'a FileBlockReader),
|
||||
FileBlockReader(&'a FileBlockReader<'a>),
|
||||
EphemeralFile(&'a EphemeralFile),
|
||||
Adapter(Adapter<&'a DeltaLayerInner>),
|
||||
#[cfg(test)]
|
||||
@@ -160,17 +160,15 @@ impl<'a> BlockCursor<'a> {
|
||||
///
|
||||
/// The file is assumed to be immutable. This doesn't provide any functions
|
||||
/// for modifying the file, nor for invalidating the cache if it is modified.
|
||||
pub struct FileBlockReader {
|
||||
pub file: VirtualFile,
|
||||
pub struct FileBlockReader<'a> {
|
||||
pub file: &'a VirtualFile,
|
||||
|
||||
/// Unique ID of this file, used as key in the page cache.
|
||||
file_id: page_cache::FileId,
|
||||
}
|
||||
|
||||
impl FileBlockReader {
|
||||
pub fn new(file: VirtualFile) -> Self {
|
||||
let file_id = page_cache::next_file_id();
|
||||
|
||||
impl<'a> FileBlockReader<'a> {
|
||||
pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
|
||||
FileBlockReader { file_id, file }
|
||||
}
|
||||
|
||||
@@ -190,11 +188,11 @@ impl FileBlockReader {
|
||||
/// Returns a "lease" object that can be used to
|
||||
/// access to the contents of the page. (For the page cache, the
|
||||
/// lease object represents a lock on the buffer.)
|
||||
pub async fn read_blk(
|
||||
pub async fn read_blk<'b>(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, std::io::Error> {
|
||||
) -> Result<BlockLease<'b>, std::io::Error> {
|
||||
let cache = page_cache::get();
|
||||
match cache
|
||||
.read_immutable_buf(self.file_id, blknum, ctx)
|
||||
@@ -215,7 +213,7 @@ impl FileBlockReader {
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for FileBlockReader {
|
||||
impl BlockReader for FileBlockReader<'_> {
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
BlockCursor::new(BlockReaderRef::FileBlockReader(self))
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
//! may lead to a data loss.
|
||||
//!
|
||||
use anyhow::bail;
|
||||
use pageserver_api::models::CompactionAlgorithm;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, ThrottleConfig};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
@@ -20,6 +21,7 @@ use std::time::Duration;
|
||||
use utils::generation::Generation;
|
||||
|
||||
pub mod defaults {
|
||||
|
||||
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
@@ -27,12 +29,17 @@ pub mod defaults {
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
|
||||
|
||||
// FIXME the below configs are only used by legacy algorithm. The new algorithm
|
||||
// has different parameters.
|
||||
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
|
||||
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
|
||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||
pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
|
||||
super::CompactionAlgorithm::Legacy;
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
|
||||
@@ -305,6 +312,7 @@ pub struct TenantConf {
|
||||
pub compaction_period: Duration,
|
||||
// Level0 delta layer threshold for compaction.
|
||||
pub compaction_threshold: usize,
|
||||
pub compaction_algorithm: CompactionAlgorithm,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is #of bytes of WAL.
|
||||
@@ -339,7 +347,6 @@ pub struct TenantConf {
|
||||
// See the corresponding metric's help string.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub evictions_low_residence_duration_metric_threshold: Duration,
|
||||
pub gc_feedback: bool,
|
||||
|
||||
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
|
||||
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
||||
@@ -378,6 +385,10 @@ pub struct TenantConfOpt {
|
||||
#[serde(default)]
|
||||
pub compaction_threshold: Option<usize>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub compaction_algorithm: Option<CompactionAlgorithm>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub gc_horizon: Option<u64>,
|
||||
@@ -427,10 +438,6 @@ pub struct TenantConfOpt {
|
||||
#[serde(default)]
|
||||
pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub gc_feedback: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
#[serde(default)]
|
||||
@@ -462,6 +469,9 @@ impl TenantConfOpt {
|
||||
compaction_threshold: self
|
||||
.compaction_threshold
|
||||
.unwrap_or(global_conf.compaction_threshold),
|
||||
compaction_algorithm: self
|
||||
.compaction_algorithm
|
||||
.unwrap_or(global_conf.compaction_algorithm),
|
||||
gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
|
||||
gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
|
||||
image_creation_threshold: self
|
||||
@@ -485,7 +495,6 @@ impl TenantConfOpt {
|
||||
evictions_low_residence_duration_metric_threshold: self
|
||||
.evictions_low_residence_duration_metric_threshold
|
||||
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
|
||||
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
|
||||
heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
|
||||
lazy_slru_download: self
|
||||
.lazy_slru_download
|
||||
@@ -509,6 +518,7 @@ impl Default for TenantConf {
|
||||
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period"),
|
||||
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
|
||||
compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period"),
|
||||
@@ -530,7 +540,6 @@ impl Default for TenantConf {
|
||||
DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
|
||||
)
|
||||
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
|
||||
gc_feedback: false,
|
||||
heatmap_period: Duration::ZERO,
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||
@@ -587,6 +596,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
Self {
|
||||
checkpoint_distance: value.checkpoint_distance,
|
||||
checkpoint_timeout: value.checkpoint_timeout.map(humantime),
|
||||
compaction_algorithm: value.compaction_algorithm,
|
||||
compaction_target_size: value.compaction_target_size,
|
||||
compaction_period: value.compaction_period.map(humantime),
|
||||
compaction_threshold: value.compaction_threshold,
|
||||
@@ -603,7 +613,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
evictions_low_residence_duration_metric_threshold: value
|
||||
.evictions_low_residence_duration_metric_threshold
|
||||
.map(humantime),
|
||||
gc_feedback: value.gc_feedback,
|
||||
heatmap_period: value.heatmap_period.map(humantime),
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
|
||||
@@ -420,7 +420,7 @@ impl DeleteTenantFlow {
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant
|
||||
.attach(preload, super::SpawnMode::Normal, ctx)
|
||||
.attach(preload, super::SpawnMode::Eager, ctx)
|
||||
.await
|
||||
.context("attach")?;
|
||||
|
||||
|
||||
@@ -8,20 +8,11 @@
|
||||
//!
|
||||
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||
|
||||
use std::io::{self};
|
||||
|
||||
use anyhow::{ensure, Context};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use anyhow::ensure;
|
||||
use serde::{de::Error, Deserialize, Serialize, Serializer};
|
||||
use thiserror::Error;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
|
||||
/// Use special format number to enable backward compatibility.
|
||||
const METADATA_FORMAT_VERSION: u16 = 4;
|
||||
|
||||
@@ -268,32 +259,6 @@ impl Serialize for TimelineMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
/// Save timeline metadata to file
|
||||
#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
pub async fn save_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
data: &TimelineMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
let path = conf.metadata_path(tenant_shard_id, timeline_id);
|
||||
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
||||
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
|
||||
.await
|
||||
.context("write metadata")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum LoadMetadataError {
|
||||
#[error(transparent)]
|
||||
Read(#[from] io::Error),
|
||||
|
||||
#[error(transparent)]
|
||||
Decode(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -42,7 +42,7 @@ use crate::tenant::config::{
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::fs_ext::PathExt;
|
||||
@@ -359,12 +359,6 @@ fn load_tenant_config(
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
|
||||
if tenant_ignore_mark_file.exists() {
|
||||
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let tenant_shard_id = match tenant_dir_path
|
||||
.file_name()
|
||||
.unwrap_or_default()
|
||||
@@ -377,6 +371,59 @@ fn load_tenant_config(
|
||||
}
|
||||
};
|
||||
|
||||
// Clean up legacy `metadata` files.
|
||||
// Doing it here because every single tenant directory is visited here.
|
||||
// In any later code, there's different treatment of tenant dirs
|
||||
// ... depending on whether the tenant is in re-attach response or not
|
||||
// ... epending on whether the tenant is ignored or not
|
||||
assert_eq!(
|
||||
&conf.tenant_path(&tenant_shard_id),
|
||||
&tenant_dir_path,
|
||||
"later use of conf....path() methods would be dubious"
|
||||
);
|
||||
let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
|
||||
Ok(iter) => {
|
||||
let mut timelines = Vec::new();
|
||||
for res in iter {
|
||||
let p = res?;
|
||||
let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
|
||||
// skip any entries that aren't TimelineId, such as
|
||||
// - *.___temp dirs
|
||||
// - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
|
||||
continue;
|
||||
};
|
||||
timelines.push(timeline_id);
|
||||
}
|
||||
timelines
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
|
||||
Err(e) => return Err(anyhow::anyhow!(e)),
|
||||
};
|
||||
for timeline_id in timelines {
|
||||
let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||
let metadata_path = timeline_path.join(METADATA_FILE_NAME);
|
||||
match std::fs::remove_file(&metadata_path) {
|
||||
Ok(()) => {
|
||||
crashsafe::fsync(timeline_path)
|
||||
.context("fsync timeline dir after removing legacy metadata file")?;
|
||||
info!("removed legacy metadata file at {metadata_path}");
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
// something removed the file earlier, or it was never there
|
||||
// We don't care, this software version doesn't write it again, so, we're good.
|
||||
}
|
||||
Err(e) => {
|
||||
anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
|
||||
if tenant_ignore_mark_file.exists() {
|
||||
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some((
|
||||
tenant_shard_id,
|
||||
Tenant::load_tenant_config(conf, &tenant_shard_id),
|
||||
@@ -548,7 +595,7 @@ pub async fn init_tenant_mgr(
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
SpawnMode::Normal,
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
@@ -1059,9 +1106,9 @@ impl TenantManager {
|
||||
|
||||
// Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
|
||||
// the caller thinks they're creating but the tenant already existed. We must switch to
|
||||
// Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
|
||||
// Eager mode so that when starting this Tenant we properly probe remote storage for timelines,
|
||||
// rather than assuming it to be empty.
|
||||
spawn_mode = SpawnMode::Normal;
|
||||
spawn_mode = SpawnMode::Eager;
|
||||
}
|
||||
Some(TenantSlot::Secondary(state)) => {
|
||||
info!("Shutting down secondary tenant");
|
||||
@@ -1253,7 +1300,7 @@ impl TenantManager {
|
||||
shard_identity,
|
||||
None,
|
||||
self.tenants,
|
||||
SpawnMode::Normal,
|
||||
SpawnMode::Eager,
|
||||
ctx,
|
||||
)?;
|
||||
|
||||
@@ -1474,7 +1521,7 @@ impl TenantManager {
|
||||
*child_shard,
|
||||
child_location_conf,
|
||||
None,
|
||||
SpawnMode::Normal,
|
||||
SpawnMode::Eager,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -2017,7 +2064,7 @@ pub(crate) async fn load_tenant(
|
||||
shard_identity,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Normal,
|
||||
SpawnMode::Eager,
|
||||
ctx,
|
||||
)
|
||||
.with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
|
||||
|
||||
@@ -88,14 +88,7 @@ pub async fn download_layer_file<'a>(
|
||||
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other);
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await;
|
||||
|
||||
match bytes_amount {
|
||||
Ok(bytes_amount) => {
|
||||
@@ -107,7 +100,7 @@ pub async fn download_layer_file<'a>(
|
||||
on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
|
||||
}
|
||||
|
||||
Err(e)
|
||||
Err(e.into())
|
||||
}
|
||||
}
|
||||
},
|
||||
@@ -245,10 +238,7 @@ async fn do_download_index_part(
|
||||
let stream = download.download_stream;
|
||||
let mut stream = StreamReader::new(stream);
|
||||
|
||||
tokio::io::copy_buf(&mut stream, &mut bytes)
|
||||
.await
|
||||
.with_context(|| format!("download index part at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
tokio::io::copy_buf(&mut stream, &mut bytes).await?;
|
||||
|
||||
Ok(bytes)
|
||||
},
|
||||
@@ -428,14 +418,7 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
let mut download = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file);
|
||||
|
||||
// TODO: this consumption of the response body should be subject to timeout + cancellation, but
|
||||
// not without thinking carefully about how to recover safely from cancelling a write to
|
||||
// local storage (e.g. by writing into a temp file as we do in download_layer)
|
||||
// FIXME: flip the weird error wrapping
|
||||
tokio::io::copy_buf(&mut download, &mut writer)
|
||||
.await
|
||||
.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
tokio::io::copy_buf(&mut download, &mut writer).await?;
|
||||
|
||||
let mut file = writer.into_inner();
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ use crate::tenant::{
|
||||
remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
|
||||
};
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use chrono::format::{DelayedFormat, StrftimeItems};
|
||||
use futures::Future;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -44,7 +45,7 @@ use rand::Rng;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, Instrument};
|
||||
use tracing::{info_span, instrument, warn, Instrument};
|
||||
use utils::{
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
|
||||
};
|
||||
@@ -438,8 +439,14 @@ impl From<std::io::Error> for UpdateError {
|
||||
fn from(value: std::io::Error) -> Self {
|
||||
if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
|
||||
UpdateError::NoSpace
|
||||
} else if value
|
||||
.get_ref()
|
||||
.and_then(|x| x.downcast_ref::<DownloadError>())
|
||||
.is_some()
|
||||
{
|
||||
UpdateError::from(DownloadError::from(value))
|
||||
} else {
|
||||
// An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
|
||||
// An I/O error from e.g. tokio::io::copy_buf is most likely a remote storage issue
|
||||
UpdateError::Other(anyhow::anyhow!(value))
|
||||
}
|
||||
}
|
||||
@@ -484,14 +491,9 @@ impl<'a> TenantDownloader<'a> {
|
||||
let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
|
||||
let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
|
||||
let heatmap_path_bg = heatmap_path.clone();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
tokio::runtime::Handle::current().block_on(async move {
|
||||
VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await
|
||||
})
|
||||
})
|
||||
.await
|
||||
.expect("Blocking task is never aborted")
|
||||
.maybe_fatal_err(&context_msg)?;
|
||||
VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes)
|
||||
.await
|
||||
.maybe_fatal_err(&context_msg)?;
|
||||
|
||||
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
|
||||
|
||||
@@ -672,20 +674,17 @@ impl<'a> TenantDownloader<'a> {
|
||||
.await
|
||||
{
|
||||
Ok(bytes) => bytes,
|
||||
Err(e) => {
|
||||
if let DownloadError::NotFound = e {
|
||||
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
|
||||
// This is harmless: continue to download the next layer. It is expected during compaction
|
||||
// GC.
|
||||
tracing::debug!(
|
||||
"Skipped downloading missing layer {}, raced with compaction/gc?",
|
||||
layer.name
|
||||
);
|
||||
continue;
|
||||
} else {
|
||||
return Err(e.into());
|
||||
}
|
||||
Err(DownloadError::NotFound) => {
|
||||
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
|
||||
// This is harmless: continue to download the next layer. It is expected during compaction
|
||||
// GC.
|
||||
tracing::debug!(
|
||||
"Skipped downloading missing layer {}, raced with compaction/gc?",
|
||||
layer.name
|
||||
);
|
||||
continue;
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
if downloaded_bytes != layer.metadata.file_size {
|
||||
@@ -775,19 +774,33 @@ async fn init_timeline_state(
|
||||
.await
|
||||
.fatal_err(&format!("Listing {timeline_path}"))
|
||||
{
|
||||
let dentry_file_name = dentry.file_name();
|
||||
let file_name = dentry_file_name.to_string_lossy();
|
||||
let local_meta = dentry.metadata().await.fatal_err(&format!(
|
||||
"Read metadata on {}",
|
||||
dentry.path().to_string_lossy()
|
||||
));
|
||||
let Ok(file_path) = Utf8PathBuf::from_path_buf(dentry.path()) else {
|
||||
tracing::warn!("Malformed filename at {}", dentry.path().to_string_lossy());
|
||||
continue;
|
||||
};
|
||||
let local_meta = dentry
|
||||
.metadata()
|
||||
.await
|
||||
.fatal_err(&format!("Read metadata on {}", file_path));
|
||||
|
||||
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
|
||||
let file_name = file_path.file_name().expect("created it from the dentry");
|
||||
if file_name == METADATA_FILE_NAME {
|
||||
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
|
||||
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
|
||||
continue;
|
||||
} else if crate::is_temporary(&file_path) {
|
||||
// Temporary files are frequently left behind from restarting during downloads
|
||||
tracing::info!("Cleaning up temporary file {file_path}");
|
||||
if let Err(e) = tokio::fs::remove_file(&file_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
{
|
||||
tracing::error!("Failed to remove temporary file {file_path}: {e}");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
match LayerFileName::from_str(&file_name) {
|
||||
match LayerFileName::from_str(file_name) {
|
||||
Ok(name) => {
|
||||
let remote_meta = heatmap_metadata.get(&name);
|
||||
match remote_meta {
|
||||
|
||||
@@ -373,12 +373,9 @@ async fn upload_tenant_heatmap(
|
||||
// Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
|
||||
// when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
|
||||
// in remote storage.
|
||||
let _guard = match tenant.gate.enter() {
|
||||
Ok(g) => g,
|
||||
Err(_) => {
|
||||
tracing::info!("Skipping heatmap upload for tenant which is shutting down");
|
||||
return Err(UploadHeatmapError::Cancelled);
|
||||
}
|
||||
let Ok(_guard) = tenant.gate.enter() else {
|
||||
tracing::info!("Skipping heatmap upload for tenant which is shutting down");
|
||||
return Err(UploadHeatmapError::Cancelled);
|
||||
};
|
||||
|
||||
for (timeline_id, timeline) in timelines {
|
||||
|
||||
@@ -209,8 +209,7 @@ impl Default for ValuesReconstructState {
|
||||
pub(crate) enum ReadableLayerDesc {
|
||||
Persistent {
|
||||
desc: PersistentLayerDesc,
|
||||
lsn_floor: Lsn,
|
||||
lsn_ceil: Lsn,
|
||||
lsn_range: Range<Lsn>,
|
||||
},
|
||||
InMemory {
|
||||
handle: InMemoryLayerHandle,
|
||||
@@ -309,14 +308,14 @@ impl Eq for ReadableLayerDescOrdered {}
|
||||
impl ReadableLayerDesc {
|
||||
pub(crate) fn get_lsn_floor(&self) -> Lsn {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor,
|
||||
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
|
||||
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil,
|
||||
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
|
||||
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
|
||||
}
|
||||
}
|
||||
@@ -329,10 +328,15 @@ impl ReadableLayerDesc {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { desc, lsn_ceil, .. } => {
|
||||
ReadableLayerDesc::Persistent { desc, lsn_range } => {
|
||||
let layer = layer_manager.get_from_desc(desc);
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
|
||||
.get_values_reconstruct_data(
|
||||
keyspace,
|
||||
lsn_range.clone(),
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
|
||||
|
||||
@@ -29,25 +29,28 @@
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::BytesMut;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
@@ -63,8 +66,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValueReconstructSituation,
|
||||
ValuesReconstructState,
|
||||
AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -214,8 +216,10 @@ pub struct DeltaLayerInner {
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader,
|
||||
file: VirtualFile,
|
||||
file_id: FileId,
|
||||
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayerInner {
|
||||
@@ -297,7 +301,7 @@ impl DeltaLayer {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded = DeltaLayerInner::load(&path, None, ctx)
|
||||
let loaded = DeltaLayerInner::load(&path, None, None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
|
||||
@@ -665,16 +669,16 @@ impl DeltaLayer {
|
||||
where
|
||||
F: Fn(Summary) -> Summary,
|
||||
{
|
||||
let file = VirtualFile::open_with_options(
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
path,
|
||||
virtual_file::OpenOptions::new().read(true).write(true),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0, ctx).await?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = block_reader.read_blk(0, ctx).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
|
||||
let mut file = file.file;
|
||||
if actual_summary.magic != DELTA_FILE_MAGIC {
|
||||
return Err(RewriteSummaryError::MagicMismatch);
|
||||
}
|
||||
@@ -698,15 +702,18 @@ impl DeltaLayerInner {
|
||||
pub(super) async fn load(
|
||||
path: &Utf8Path,
|
||||
summary: Option<Summary>,
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
let file = FileBlockReader::new(file);
|
||||
let file_id = page_cache::next_file_id();
|
||||
|
||||
let summary_blk = match file.read_blk(0, ctx).await {
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
|
||||
let summary_blk = match block_reader.read_blk(0, ctx).await {
|
||||
Ok(blk) => blk,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||
};
|
||||
@@ -730,8 +737,10 @@ impl DeltaLayerInner {
|
||||
|
||||
Ok(Ok(DeltaLayerInner {
|
||||
file,
|
||||
file_id,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
max_vectored_read_bytes,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -744,11 +753,11 @@ impl DeltaLayerInner {
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let mut need_image = true;
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let file = &self.file;
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
&block_reader,
|
||||
);
|
||||
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
@@ -782,19 +791,19 @@ impl DeltaLayerInner {
|
||||
.build();
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let cursor = file.block_cursor();
|
||||
let cursor = block_reader.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor
|
||||
.read_blob_into_buf(pos, &mut buf, ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to read blob from virtual file {}", file.file.path)
|
||||
format!("Failed to read blob from virtual file {}", self.file.path)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path
|
||||
self.file.path
|
||||
)
|
||||
})?;
|
||||
match val {
|
||||
@@ -834,133 +843,181 @@ impl DeltaLayerInner {
|
||||
pub(super) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
end_lsn: Lsn,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let file = &self.file;
|
||||
let reads = self
|
||||
.plan_reads(keyspace, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
|
||||
self.do_reads_and_update_state(reads, reconstruct_state)
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn plan_reads(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>> {
|
||||
let mut planner = VectoredReadPlanner::new(
|
||||
self.max_vectored_read_bytes
|
||||
.expect("Layer is loaded with max vectored bytes config")
|
||||
.0
|
||||
.into(),
|
||||
);
|
||||
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
block_reader,
|
||||
);
|
||||
|
||||
let mut offsets: BTreeMap<Key, Vec<(Lsn, u64)>> = BTreeMap::new();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut ignore_key = None;
|
||||
let mut range_end_handled = false;
|
||||
|
||||
// Scan the page versions backwards, starting from the last key in the range.
|
||||
// to collect all the offsets at which need to be read.
|
||||
let end_key = DeltaKey::from_key_lsn(&range.end, Lsn(end_lsn.0 - 1));
|
||||
let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
|
||||
tree_reader
|
||||
.visit(
|
||||
&end_key.0,
|
||||
VisitDirection::Backwards,
|
||||
&start_key.0,
|
||||
VisitDirection::Forwards,
|
||||
|raw_key, value| {
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key);
|
||||
|
||||
if entry_lsn >= end_lsn {
|
||||
return true;
|
||||
}
|
||||
|
||||
if key < range.start {
|
||||
return false;
|
||||
}
|
||||
|
||||
if key >= range.end {
|
||||
return true;
|
||||
}
|
||||
|
||||
if Some(key) == ignore_key {
|
||||
return true;
|
||||
}
|
||||
|
||||
if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) {
|
||||
if entry_lsn <= cached_lsn {
|
||||
return key != range.start;
|
||||
}
|
||||
}
|
||||
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(raw_key);
|
||||
let blob_ref = BlobRef(value);
|
||||
let lsns_at = offsets.entry(key).or_default();
|
||||
lsns_at.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
if blob_ref.will_init() {
|
||||
if key == range.start {
|
||||
return false;
|
||||
assert!(key >= range.start && lsn >= lsn_range.start);
|
||||
|
||||
let cached_lsn = reconstruct_state.get_cached_lsn(&key);
|
||||
let flag = {
|
||||
if cached_lsn >= Some(lsn) {
|
||||
BlobFlag::Ignore
|
||||
} else if blob_ref.will_init() {
|
||||
BlobFlag::Replaces
|
||||
} else {
|
||||
ignore_key = Some(key);
|
||||
return true;
|
||||
BlobFlag::None
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
true
|
||||
if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
|
||||
planner.handle_range_end(blob_ref.pos());
|
||||
range_end_handled = true;
|
||||
false
|
||||
} else {
|
||||
planner.handle(key, lsn, blob_ref.pos(), flag);
|
||||
true
|
||||
}
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await
|
||||
.map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
|
||||
}
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
|
||||
let ctx = &RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerValue)
|
||||
.build();
|
||||
|
||||
let cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, lsns_at) in offsets {
|
||||
for (lsn, block_offset) in lsns_at {
|
||||
let res = cursor.read_blob_into_buf(block_offset, &mut buf, ctx).await;
|
||||
|
||||
if let Err(e) = res {
|
||||
reconstruct_state.on_key_error(
|
||||
key,
|
||||
PageReconstructError::from(anyhow!(e).context(format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path
|
||||
))),
|
||||
);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
let value = Value::des(&buf);
|
||||
if let Err(e) = value {
|
||||
reconstruct_state.on_key_error(
|
||||
key,
|
||||
PageReconstructError::from(anyhow!(e).context(format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path
|
||||
))),
|
||||
);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
let key_situation = reconstruct_state.update_key(&key, lsn, value.unwrap());
|
||||
if key_situation == ValueReconstructSituation::Complete {
|
||||
break;
|
||||
}
|
||||
if !range_end_handled {
|
||||
let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
tracing::info!("Handling range end fallback at {}", payload_end);
|
||||
planner.handle_range_end(payload_end);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(planner.finish())
|
||||
}
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
) {
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
let mut ignore_key_with_err = None;
|
||||
|
||||
let max_vectored_read_bytes = self
|
||||
.max_vectored_read_bytes
|
||||
.expect("Layer is loaded with max vectored bytes config")
|
||||
.0
|
||||
.into();
|
||||
let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
|
||||
|
||||
// Note that reads are processed in reverse order (from highest key+lsn).
|
||||
// This is the order that `ReconstructState` requires such that it can
|
||||
// track when a key is done.
|
||||
for read in reads.into_iter().rev() {
|
||||
let res = vectored_blob_reader
|
||||
.read_blobs(&read, buf.take().expect("Should have a buffer"))
|
||||
.await;
|
||||
|
||||
let blobs_buf = match res {
|
||||
Ok(blobs_buf) => blobs_buf,
|
||||
Err(err) => {
|
||||
let kind = err.kind();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice() {
|
||||
reconstruct_state.on_key_error(
|
||||
blob_meta.key,
|
||||
PageReconstructError::from(anyhow!(
|
||||
"Failed to read blobs from virtual file {}: {}",
|
||||
self.file.path,
|
||||
kind
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
// We have "lost" the buffer since the lower level IO api
|
||||
// doesn't return the buffer on error. Allocate a new one.
|
||||
buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
for meta in blobs_buf.blobs.iter().rev() {
|
||||
if Some(meta.meta.key) == ignore_key_with_err {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
|
||||
let value = match value {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(
|
||||
meta.meta.key,
|
||||
PageReconstructError::from(anyhow!(e).context(format!(
|
||||
"Failed to deserialize blob from virtual file {}",
|
||||
self.file.path,
|
||||
))),
|
||||
);
|
||||
|
||||
ignore_key_with_err = Some(meta.meta.key);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
|
||||
// state, no further updates shall be made to it. The call below will
|
||||
// panic if the invariant is violated.
|
||||
reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
|
||||
}
|
||||
|
||||
buf = Some(blobs_buf.buf);
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) async fn load_keys<'a>(
|
||||
&'a self,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<DeltaEntry<'a>>> {
|
||||
let file = &self.file;
|
||||
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
block_reader,
|
||||
);
|
||||
|
||||
let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();
|
||||
@@ -1012,11 +1069,11 @@ impl DeltaLayerInner {
|
||||
self.index_start_blk, self.index_root_blk
|
||||
);
|
||||
|
||||
let file = &self.file;
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
block_reader,
|
||||
);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
@@ -1111,7 +1168,8 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, std::io::Error> {
|
||||
self.0.as_ref().file.read_blk(blknum, ctx).await
|
||||
let block_reader = FileBlockReader::new(&self.0.as_ref().file, self.0.as_ref().file_id);
|
||||
block_reader.read_blk(blknum, ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1120,3 +1178,15 @@ impl AsRef<DeltaLayerInner> for DeltaLayerInner {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for DeltaEntry<'a> {
|
||||
fn key(&self) -> Key {
|
||||
self.key
|
||||
}
|
||||
fn lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
fn size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
//! actual page images are stored in the "values" part.
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
@@ -34,11 +34,14 @@ use crate::tenant::storage_layer::{
|
||||
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
@@ -152,8 +155,10 @@ pub struct ImageLayerInner {
|
||||
|
||||
lsn: Lsn,
|
||||
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader,
|
||||
file: VirtualFile,
|
||||
file_id: FileId,
|
||||
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
@@ -167,9 +172,12 @@ impl std::fmt::Debug for ImageLayerInner {
|
||||
|
||||
impl ImageLayerInner {
|
||||
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let file = &self.file;
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
block_reader,
|
||||
);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
|
||||
@@ -252,7 +260,7 @@ impl ImageLayer {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
|
||||
@@ -327,16 +335,16 @@ impl ImageLayer {
|
||||
where
|
||||
F: Fn(Summary) -> Summary,
|
||||
{
|
||||
let file = VirtualFile::open_with_options(
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
path,
|
||||
virtual_file::OpenOptions::new().read(true).write(true),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0, ctx).await?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = block_reader.read_blk(0, ctx).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
|
||||
let mut file = file.file;
|
||||
if actual_summary.magic != IMAGE_FILE_MAGIC {
|
||||
return Err(RewriteSummaryError::MagicMismatch);
|
||||
}
|
||||
@@ -361,14 +369,16 @@ impl ImageLayerInner {
|
||||
path: &Utf8Path,
|
||||
lsn: Lsn,
|
||||
summary: Option<Summary>,
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = match file.read_blk(0, ctx).await {
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = match block_reader.read_blk(0, ctx).await {
|
||||
Ok(blk) => blk,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||
};
|
||||
@@ -399,6 +409,8 @@ impl ImageLayerInner {
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
lsn,
|
||||
file,
|
||||
file_id,
|
||||
max_vectored_read_bytes,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -408,8 +420,9 @@ impl ImageLayerInner {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
@@ -422,7 +435,7 @@ impl ImageLayerInner {
|
||||
)
|
||||
.await?
|
||||
{
|
||||
let blob = file
|
||||
let blob = block_reader
|
||||
.block_cursor()
|
||||
.read_blob(
|
||||
offset,
|
||||
@@ -449,12 +462,36 @@ impl ImageLayerInner {
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
|
||||
let reads = self
|
||||
.plan_reads(keyspace, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
|
||||
let mut offsets = Vec::new();
|
||||
self.do_reads_and_update_state(reads, reconstruct_state)
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn plan_reads(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>> {
|
||||
let mut planner = VectoredReadPlanner::new(
|
||||
self.max_vectored_read_bytes
|
||||
.expect("Layer is loaded with max vectored bytes config")
|
||||
.0
|
||||
.into(),
|
||||
);
|
||||
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut range_end_handled = false;
|
||||
|
||||
let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
range.start.write_to_byte_slice(&mut search_key);
|
||||
|
||||
@@ -462,17 +499,18 @@ impl ImageLayerInner {
|
||||
.visit(
|
||||
&search_key,
|
||||
VisitDirection::Forwards,
|
||||
|raw_key, value| {
|
||||
|raw_key, offset| {
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
assert!(key >= range.start);
|
||||
|
||||
if !range.contains(&key) {
|
||||
return false;
|
||||
if key >= range.end {
|
||||
planner.handle_range_end(offset);
|
||||
range_end_handled = true;
|
||||
false
|
||||
} else {
|
||||
planner.handle(key, self.lsn, offset, BlobFlag::None);
|
||||
true
|
||||
}
|
||||
|
||||
offsets.push((key, value));
|
||||
|
||||
true
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
|
||||
@@ -480,33 +518,60 @@ impl ImageLayerInner {
|
||||
)
|
||||
.await
|
||||
.map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
|
||||
}
|
||||
|
||||
let ctx = &RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerValue)
|
||||
.build();
|
||||
|
||||
let cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, offset) in offsets {
|
||||
let res = cursor.read_blob_into_buf(offset, &mut buf, ctx).await;
|
||||
if let Err(e) = res {
|
||||
reconstruct_state.on_key_error(
|
||||
key,
|
||||
PageReconstructError::from(anyhow!(e).context(format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path
|
||||
))),
|
||||
);
|
||||
|
||||
continue;
|
||||
if !range_end_handled {
|
||||
let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
planner.handle_range_end(payload_end);
|
||||
}
|
||||
|
||||
let blob = Bytes::copy_from_slice(buf.as_slice());
|
||||
reconstruct_state.update_key(&key, self.lsn, Value::Image(blob));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(planner.finish())
|
||||
}
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
) {
|
||||
let max_vectored_read_bytes = self
|
||||
.max_vectored_read_bytes
|
||||
.expect("Layer is loaded with max vectored bytes config")
|
||||
.0
|
||||
.into();
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
for read in reads.into_iter() {
|
||||
let buf = BytesMut::with_capacity(max_vectored_read_bytes);
|
||||
let res = vectored_blob_reader.read_blobs(&read, buf).await;
|
||||
|
||||
match res {
|
||||
Ok(blobs_buf) => {
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
reconstruct_state.update_key(
|
||||
&meta.meta.key,
|
||||
self.lsn,
|
||||
Value::Image(img_buf),
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
let kind = err.kind();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice() {
|
||||
reconstruct_state.on_key_error(
|
||||
blob_meta.key,
|
||||
PageReconstructError::from(anyhow!(
|
||||
"Failed to read blobs from virtual file {}: {}",
|
||||
self.file.path,
|
||||
kind
|
||||
)),
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -336,17 +336,32 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
|
||||
pub(crate) async fn put_value(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
val: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
|
||||
self.put_value_locked(&mut inner, key, lsn, val, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn put_values(
|
||||
&self,
|
||||
values: &HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
for (key, vals) in values {
|
||||
for (lsn, val) in vals {
|
||||
self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn put_value_locked(
|
||||
@@ -354,16 +369,22 @@ impl InMemoryLayer {
|
||||
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
val: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
|
||||
let off = {
|
||||
// Avoid doing allocations for "small" values.
|
||||
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
|
||||
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
|
||||
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
|
||||
buf.clear();
|
||||
val.ser_into(&mut buf)?;
|
||||
locked_inner
|
||||
.file
|
||||
.write_blob(
|
||||
buf,
|
||||
&buf,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build(),
|
||||
@@ -391,12 +412,7 @@ impl InMemoryLayer {
|
||||
pub async fn freeze(&self, end_lsn: Lsn) {
|
||||
let inner = self.inner.write().await;
|
||||
|
||||
assert!(
|
||||
self.start_lsn < end_lsn,
|
||||
"{} >= {}",
|
||||
self.start_lsn,
|
||||
end_lsn
|
||||
);
|
||||
assert!(self.start_lsn < end_lsn);
|
||||
self.end_lsn.set(end_lsn).expect("end_lsn set only once");
|
||||
|
||||
for vec_map in inner.index.values() {
|
||||
|
||||
@@ -29,6 +29,9 @@ use super::{
|
||||
|
||||
use utils::generation::Generation;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
///
|
||||
@@ -267,7 +270,7 @@ impl Layer {
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
end_lsn: Lsn,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
@@ -282,7 +285,7 @@ impl Layer {
|
||||
.record_access(LayerAccessKind::GetValueReconstructData, ctx);
|
||||
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, &self.0, ctx)
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
|
||||
.instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
|
||||
.await
|
||||
}
|
||||
@@ -1049,16 +1052,10 @@ impl LayerInner {
|
||||
|
||||
/// `DownloadedLayer` is being dropped, so it calls this method.
|
||||
fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
|
||||
let delete = self.wanted_deleted.load(Ordering::Acquire);
|
||||
let evict = self.wanted_evicted.load(Ordering::Acquire);
|
||||
let can_evict = self.have_remote_client;
|
||||
|
||||
if delete {
|
||||
// do nothing now, only in LayerInner::drop -- this was originally implemented because
|
||||
// we could had already scheduled the deletion at the time.
|
||||
//
|
||||
// FIXME: this is not true anymore, we can safely evict wanted deleted files.
|
||||
} else if can_evict && evict {
|
||||
if can_evict && evict {
|
||||
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
|
||||
|
||||
// downgrade for queueing, in case there's a tear down already ongoing we should not
|
||||
@@ -1299,9 +1296,14 @@ impl DownloadedLayer {
|
||||
owner.desc.key_range.clone(),
|
||||
owner.desc.lsn_range.clone(),
|
||||
));
|
||||
delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Delta))
|
||||
delta_layer::DeltaLayerInner::load(
|
||||
&owner.path,
|
||||
summary,
|
||||
Some(owner.conf.max_vectored_read_bytes),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Delta))
|
||||
} else {
|
||||
let lsn = owner.desc.image_layer_lsn();
|
||||
let summary = Some(image_layer::Summary::expected(
|
||||
@@ -1310,9 +1312,15 @@ impl DownloadedLayer {
|
||||
owner.desc.key_range.clone(),
|
||||
lsn,
|
||||
));
|
||||
image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Image))
|
||||
image_layer::ImageLayerInner::load(
|
||||
&owner.path,
|
||||
lsn,
|
||||
summary,
|
||||
Some(owner.conf.max_vectored_read_bytes),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Image))
|
||||
};
|
||||
|
||||
match res {
|
||||
@@ -1365,7 +1373,7 @@ impl DownloadedLayer {
|
||||
async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
end_lsn: Lsn,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValuesReconstructState,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
@@ -1374,7 +1382,7 @@ impl DownloadedLayer {
|
||||
|
||||
match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
|
||||
Delta(d) => {
|
||||
d.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, ctx)
|
||||
d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
Image(i) => {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user