Compare commits

..

12 Commits

Author SHA1 Message Date
John Spray
41d546df84 clippy 2024-02-14 12:34:07 +00:00
John Spray
792c8ced06 add assertion on span 2024-02-14 12:32:07 +00:00
John Spray
0415f28734 fsync newly populated timeline dirs 2024-02-14 12:29:15 +00:00
John Spray
fcc2eb88a1 Refactor hard linking into one spawn_blocking 2024-02-14 12:09:15 +00:00
John Spray
3d6ad0c42e pageserver: avoid using eviction code in split 2024-02-14 11:30:22 +00:00
John Spray
acf6c05f7e Merge remote-tracking branch 'upstream/main' into jcsp/storcon-split-refine 2024-02-14 11:13:03 +00:00
John Spray
5df352638f De-duplicate background purge task 2024-02-12 13:54:41 +00:00
John Spray
d8a6942e0e Refactor detach methods into TenantManager 2024-02-12 13:47:48 +00:00
John Spray
54d683ae07 pageserver: hard-link layers during shard split 2024-02-12 09:50:27 +00:00
John Spray
0526603adb pageserver: remove parent shard files after split 2024-02-12 09:50:27 +00:00
John Spray
bb299f0229 control_plane: improved logging on re-attach 2024-02-12 09:50:27 +00:00
John Spray
6d631ae816 tests: improve coverage of pageserver restarts 2024-02-12 09:50:27 +00:00
249 changed files with 5500 additions and 15715 deletions

View File

@@ -39,7 +39,7 @@ runs:
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${PR_NUMBER}" != "null" ]; then
BRANCH_OR_PR=pr-${PR_NUMBER}
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
# Shortcut for special branches
BRANCH_OR_PR=${GITHUB_REF_NAME}
else
@@ -59,7 +59,7 @@ runs:
BUCKET: neon-github-public-dev
# TODO: We can replace with a special docker image with Java and Allure pre-installed
- uses: actions/setup-java@v4
- uses: actions/setup-java@v3
with:
distribution: 'temurin'
java-version: '17'
@@ -76,8 +76,8 @@ runs:
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.27.0
ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777
ALLURE_VERSION: 2.24.0
ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90
# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
- name: Acquire lock
@@ -180,7 +180,7 @@ runs:
fi
- name: Cache poetry deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -215,7 +215,7 @@ runs:
rm -rf ${WORKDIR}
fi
- uses: actions/github-script@v7
- uses: actions/github-script@v6
if: always()
env:
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}

View File

@@ -19,7 +19,7 @@ runs:
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${PR_NUMBER}" != "null" ]; then
BRANCH_OR_PR=pr-${PR_NUMBER}
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
# Shortcut for special branches
BRANCH_OR_PR=${GITHUB_REF_NAME}
else

View File

@@ -80,13 +80,13 @@ runs:
- name: Checkout
if: inputs.needs_postgres_source == 'true'
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Cache poetry deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}

View File

@@ -16,14 +16,8 @@ concurrency:
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name}}
actionlint:
needs: [ check-permissions ]
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

View File

@@ -64,7 +64,7 @@ jobs:
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
- uses: actions/checkout@v4
- uses: actions/checkout@v3
with:
ref: main
token: ${{ secrets.CI_ACCESS_TOKEN }}

View File

@@ -62,11 +62,11 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -214,14 +214,14 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
# Increase timeout to 8h, default timeout is 6h
timeout-minutes: 480
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -362,11 +362,11 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -461,11 +461,11 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -558,11 +558,11 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download

View File

@@ -1,105 +0,0 @@
name: Build build-tools image
on:
workflow_call:
inputs:
image-tag:
description: "build-tools image tag"
required: true
type: string
outputs:
image-tag:
description: "build-tools tag"
value: ${{ inputs.image-tag }}
image:
description: "build-tools image"
value: neondatabase/build-tools:${{ inputs.image-tag }}
defaults:
run:
shell: bash -euo pipefail {0}
concurrency:
group: build-build-tools-image-${{ inputs.image-tag }}
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
check-image:
uses: ./.github/workflows/check-build-tools-image.yml
# This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions)
build-image:
needs: [ check-image ]
if: needs.check-image.outputs.found == 'false'
strategy:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
env:
IMAGE_TAG: ${{ inputs.image-tag }}
steps:
- name: Check `input.tag` is correct
env:
INPUTS_IMAGE_TAG: ${{ inputs.image-tag }}
CHECK_IMAGE_TAG : ${{ needs.check-image.outputs.image-tag }}
run: |
if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then
echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'inputs.image-tag' (${CHECK_IMAGE_TAG})"
exit 1
fi
- uses: actions/checkout@v3
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p /tmp/.docker-custom
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/build-push-action@v4
with:
context: .
provenance: false
push: true
pull: true
file: Dockerfile.build-tools
cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
- name: Remove custom docker config directory
run: |
rm -rf /tmp/.docker-custom
merge-images:
needs: [ build-image ]
runs-on: ubuntu-latest
env:
IMAGE_TAG: ${{ inputs.image-tag }}
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch image
run: |
docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
neondatabase/build-tools:${IMAGE_TAG}-x64 \
neondatabase/build-tools:${IMAGE_TAG}-arm64

View File

@@ -0,0 +1,124 @@
name: Build and Push Docker Image
on:
workflow_call:
inputs:
dockerfile-path:
required: true
type: string
image-name:
required: true
type: string
outputs:
build-tools-tag:
description: "tag generated for build tools"
value: ${{ jobs.tag.outputs.build-tools-tag }}
jobs:
check-if-build-tools-dockerfile-changed:
runs-on: ubuntu-latest
outputs:
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
steps:
- name: Check if Dockerfile.buildtools has changed
id: dockerfile
run: |
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
exit
fi
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
fi
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
tag:
runs-on: ubuntu-latest
needs: [ check-if-build-tools-dockerfile-changed ]
outputs:
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
steps:
- name: Get buildtools tag
env:
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
run: |
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
IMAGE_TAG=$GITHUB_RUN_ID
else
IMAGE_TAG=pinned
fi
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
shell: bash
id: buildtools-tag
kaniko:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
kaniko-arm:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, arm64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
manifest:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
name: 'manifest'
runs-on: [ self-hosted, dev, x64 ]
needs:
- tag
- kaniko
- kaniko-arm
- check-if-build-tools-dockerfile-changed
steps:
- name: Create manifest
run: |
docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
- name: Push manifest
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}

View File

@@ -5,7 +5,6 @@ on:
branches:
- main
- release
- release-proxy
pull_request:
defaults:
@@ -28,9 +27,24 @@ env:
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name}}
runs-on: ubuntu-latest
steps:
- name: Disallow PRs from forks
if: |
github.event_name == 'pull_request' &&
github.event.pull_request.head.repo.full_name != github.repository
run: |
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
else
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
fi
echo >&2 "We don't run CI for PRs from forks"
echo >&2 "${MESSAGE}"
exit 1
cancel-previous-e2e-tests:
needs: [ check-permissions ]
@@ -55,7 +69,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -68,8 +82,6 @@ jobs:
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
@@ -77,36 +89,30 @@ jobs:
shell: bash
id: build-tag
check-build-tools-image:
build-buildtools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
dockerfile-path: Dockerfile.buildtools
image-name: build-tools
secrets: inherit
check-codestyle-python:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 1
- name: Cache poetry deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -124,18 +130,15 @@ jobs:
run: poetry run mypy .
check-codestyle-rust:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
@@ -143,7 +146,7 @@ jobs:
# Disabled for now
# - name: Restore cargo deps cache
# id: cache_cargo
# uses: actions/cache@v4
# uses: actions/cache@v3
# with:
# path: |
# !~/.cargo/registry/src
@@ -194,13 +197,10 @@ jobs:
run: cargo deny check --hide-inclusion-graph
build-neon:
needs: [ check-permissions, tag, build-build-tools-image ]
needs: [ check-permissions, tag, build-buildtools-image ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# Raise locked memory limit for tokio-epoll-uring.
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
# io_uring will account the memory of the CQ and SQ as locked.
@@ -231,7 +231,7 @@ jobs:
done
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
@@ -253,7 +253,7 @@ jobs:
done
if [ "${FAILED}" = "true" ]; then
echo >&2 "Please update vendor/revisions.json if these changes are intentional"
echo >&2 "Please update vendors/revisions.json if these changes are intentional"
exit 1
fi
@@ -303,7 +303,7 @@ jobs:
# compressed crates.
# - name: Cache cargo deps
# id: cache_cargo
# uses: actions/cache@v4
# uses: actions/cache@v3
# with:
# path: |
# ~/.cargo/registry/
@@ -317,21 +317,21 @@ jobs:
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -438,13 +438,10 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
@@ -454,7 +451,7 @@ jobs:
pg_version: [ v14, v15, v16 ]
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
@@ -475,34 +472,26 @@ jobs:
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
PAGESERVER_GET_VECTORED_IMPL: vectored
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
- name: Merge and upload coverage data
if: |
false &&
matrix.build_type == 'debug' && matrix.pg_version == 'v14'
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
uses: ./.github/actions/save-coverage-data
get-benchmarks-durations:
outputs:
json: ${{ steps.get-benchmark-durations.outputs.json }}
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Cache poetry deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -521,13 +510,10 @@ jobs:
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
benchmarks:
needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -539,7 +525,7 @@ jobs:
build_type: [ release ]
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Pytest benchmarks
uses: ./.github/actions/run-python-test-set
@@ -559,19 +545,16 @@ jobs:
# while coverage is currently collected for the debug ones
create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Create Allure report
if: ${{ !cancelled() }}
@@ -582,7 +565,7 @@ jobs:
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- uses: actions/github-script@v7
- uses: actions/github-script@v6
if: ${{ !cancelled() }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
@@ -608,13 +591,10 @@ jobs:
})
coverage-report:
needs: [ check-permissions, regress-tests, build-build-tools-image ]
needs: [ check-permissions, regress-tests, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
strategy:
fail-fast: false
@@ -625,7 +605,7 @@ jobs:
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
@@ -694,7 +674,7 @@ jobs:
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
- uses: actions/github-script@v7
- uses: actions/github-script@v6
env:
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
@@ -712,146 +692,166 @@ jobs:
})
trigger-e2e-tests:
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
needs: [ check-permissions, promote-images, tag ]
uses: ./.github/workflows/trigger-e2e-tests.yml
secrets: inherit
neon-image:
needs: [ check-permissions, build-build-tools-image, tag ]
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
- name: Configure ECR and Docker Hub login
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
cat <<-EOF > /kaniko/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
},
"credHelpers": {
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
}
}
EOF
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Kaniko build neon
run:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
- uses: docker/build-push-action@v5
with:
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
provenance: false
push: true
pull: true
file: Dockerfile
cache-from: type=registry,ref=neondatabase/neon:cache
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
tags: |
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
neondatabase/neon:${{needs.tag.outputs.build-tag}}
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
- name: Remove custom docker config directory
if: always()
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
needs: [ check-permissions, build-buildtools-image, tag ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
- name: Configure ECR and Docker Hub login
run: |
rm -rf .docker-custom
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
cat <<-EOF > /kaniko/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
},
"credHelpers": {
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
}
}
EOF
- name: Kaniko build compute tools
run:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-tools
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
--destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
compute-node-image:
needs: [ check-permissions, build-build-tools-image, tag ]
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: gcr.io/kaniko-project/executor:v1.9.2-debug
# Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
# Should be prevented by https://github.com/neondatabase/neon/issues/4281
options: --add-host=download.osgeo.org:140.211.15.30
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
- name: Configure ECR and Docker Hub login
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
with:
# Disable parallelism for docker buildkit.
# As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
config-inline: |
[worker.oci]
max-parallelism = 1
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
cat <<-EOF > /kaniko/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
},
"credHelpers": {
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
}
}
EOF
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Kaniko build compute node with extensions
run:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg PG_VERSION=${{ matrix.version }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-node
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--cleanup
- name: Build compute-node image
uses: docker/build-push-action@v5
with:
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
PG_VERSION=${{ matrix.version }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
provenance: false
push: true
pull: true
file: Dockerfile.compute-node
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
tags: |
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
if: ${{ matrix.version == 'v16' }}
uses: docker/build-push-action@v5
with:
target: compute-tools-image
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
provenance: false
push: true
pull: true
file: Dockerfile.compute-node
tags: |
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
@@ -895,12 +895,12 @@ jobs:
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ check-permissions, tag, neon-image, compute-node-image ]
needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
runs-on: [ self-hosted, gen3, small ]
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -929,8 +929,7 @@ jobs:
fi
- name: Verify docker-compose example
timeout-minutes: 20
run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
- name: Print logs and clean up
if: always()
@@ -963,7 +962,9 @@ jobs:
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
- name: Add latest tag to images
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -975,7 +976,9 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
- name: Push images to production ECR
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
@@ -999,7 +1002,9 @@ jobs:
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
- name: Push latest tags to Docker Hub
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -1089,7 +1094,7 @@ jobs:
deploy:
needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1109,7 +1114,7 @@ jobs:
done
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 0
@@ -1124,29 +1129,15 @@ jobs:
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f deployStorage=false \
-f deployStorageBroker=false \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
- name: Create git tag
if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
uses: actions/github-script@v7
if: github.ref_name == 'release'
uses: actions/github-script@v6
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
@@ -1158,10 +1149,9 @@ jobs:
sha: context.sha,
})
# TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
- name: Create GitHub release
if: github.ref_name == 'release'
uses: actions/github-script@v7
uses: actions/github-script@v6
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
@@ -1210,11 +1200,3 @@ jobs:
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done
pin-build-tools-image:
needs: [ build-build-tools-image, promote-images, regress-tests ]
if: github.ref_name == 'main'
uses: ./.github/workflows/pin-build-tools-image.yml
with:
from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
secrets: inherit

View File

@@ -1,58 +0,0 @@
name: Check build-tools image
on:
workflow_call:
outputs:
image-tag:
description: "build-tools image tag"
value: ${{ jobs.check-image.outputs.tag }}
found:
description: "Whether the image is found in the registry"
value: ${{ jobs.check-image.outputs.found }}
defaults:
run:
shell: bash -euo pipefail {0}
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
check-image:
runs-on: ubuntu-latest
outputs:
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
found: ${{ steps.check-image.outputs.found }}
steps:
- name: Get build-tools image tag for the current commit
id: get-build-tools-tag
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
LAST_BUILD_TOOLS_SHA=$(
gh api \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
--method GET \
--field path=Dockerfile.build-tools \
--field sha=${COMMIT_SHA} \
--field per_page=1 \
--jq ".[0].sha" \
"/repos/${GITHUB_REPOSITORY}/commits"
)
echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
- name: Check if such tag found in the registry
id: check-image
env:
IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
run: |
if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
found=true
else
found=false
fi
echo "found=${found}" | tee -a $GITHUB_OUTPUT

View File

@@ -1,36 +0,0 @@
name: Check Permissions
on:
workflow_call:
inputs:
github-event-name:
required: true
type: string
defaults:
run:
shell: bash -euo pipefail {0}
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
check-permissions:
runs-on: ubuntu-latest
steps:
- name: Disallow CI runs on PRs from forks
if: |
inputs.github-event-name == 'pull_request' &&
github.event.pull_request.head.repo.full_name != github.repository
run: |
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
else
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
fi
# TODO: use actions/github-script to post this message as a PR comment
echo >&2 "We don't run CI for PRs from forks"
echo >&2 "${MESSAGE}"
exit 1

View File

@@ -1,32 +0,0 @@
# A workflow from
# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries
name: cleanup caches by a branch
on:
pull_request:
types:
- closed
jobs:
cleanup:
runs-on: ubuntu-latest
steps:
- name: Cleanup
run: |
gh extension install actions/gh-actions-cache
echo "Fetching list of cache key"
cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 )
## Setting this to not fail the workflow while deleting cache keys.
set +e
echo "Deleting caches..."
for cacheKey in $cacheKeysForPR
do
gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
done
echo "Done"
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge

View File

@@ -20,25 +20,7 @@ env:
COPT: '-Werror'
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name}}
check-build-tools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit
check-macos-build:
needs: [ check-permissions ]
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
@@ -75,21 +57,21 @@ jobs:
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -100,7 +82,7 @@ jobs:
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Cache cargo deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: |
~/.cargo/registry
@@ -134,8 +116,8 @@ jobs:
run: ./run_clippy.sh
check-linux-arm-build:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: [ self-hosted, dev, arm64 ]
env:
@@ -148,10 +130,7 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init
steps:
@@ -193,21 +172,21 @@ jobs:
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -258,15 +237,12 @@ jobs:
cargo nextest run --package remote_storage --test test_real_azure
check-codestyle-rust-arm:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: [ self-hosted, dev, arm64 ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -333,17 +309,13 @@ jobs:
run: cargo deny check
gather-rust-build-stats:
needs: [ check-permissions, build-build-tools-image ]
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
env:
@@ -384,7 +356,7 @@ jobs:
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
- name: Publish build stats report
uses: actions/github-script@v7
uses: actions/github-script@v6
env:
REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}

View File

@@ -28,7 +28,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
@@ -38,7 +38,7 @@ jobs:
uses: snok/install-poetry@v1
- name: Cache poetry deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
@@ -82,7 +82,7 @@ jobs:
# It will be fixed after switching to gen2 runner
- name: Upload python test logs
if: always()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
retention-days: 7
name: python-test-pg_clients-${{ runner.os }}-stage-logs

View File

@@ -1,72 +0,0 @@
name: 'Pin build-tools image'
on:
workflow_dispatch:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
workflow_call:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
defaults:
run:
shell: bash -euo pipefail {0}
concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }}
permissions: {}
jobs:
tag-image:
runs-on: ubuntu-latest
env:
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: pinned
steps:
- name: Check if we really need to pin the image
id: check-manifests
run: |
docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json
if diff ${FROM_TAG}.json ${TO_TAG}.json; then
skip=true
else
skip=false
fi
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}
- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}

View File

@@ -2,31 +2,12 @@ name: Create Release Branch
on:
schedule:
# It should be kept in sync with if-condition in jobs
- cron: '0 6 * * MON' # Storage release
- cron: '0 6 * * THU' # Proxy release
- cron: '0 6 * * 1'
workflow_dispatch:
inputs:
create-storage-release-branch:
type: boolean
description: 'Create Storage release PR'
required: false
create-proxy-release-branch:
type: boolean
description: 'Create Proxy release PR'
required: false
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
defaults:
run:
shell: bash -euo pipefail {0}
jobs:
create-storage-release-branch:
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
runs-on: ubuntu-latest
create_release_branch:
runs-on: [ ubuntu-latest ]
permissions:
contents: write # for `git push`
@@ -37,67 +18,27 @@ jobs:
with:
ref: main
- name: Set environment variables
run: |
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
- name: Get current date
id: date
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
- name: Create release branch
run: git checkout -b $RELEASE_BRANCH
run: git checkout -b releases/${{ steps.date.outputs.date }}
- name: Push new branch
run: git push origin $RELEASE_BRANCH
run: git push origin releases/${{ steps.date.outputs.date }}
- name: Create pull request into release
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
## Release ${RELEASE_DATE}
## Release ${{ steps.date.outputs.date }}
**Please merge this Pull Request using 'Create a merge commit' button**
**Please merge this PR using 'Create a merge commit'!**
EOF
gh pr create --title "Release ${RELEASE_DATE}" \
gh pr create --title "Release ${{ steps.date.outputs.date }}" \
--body-file "body.md" \
--head "${RELEASE_BRANCH}" \
--head "releases/${{ steps.date.outputs.date }}" \
--base "release"
create-proxy-release-branch:
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
runs-on: ubuntu-latest
permissions:
contents: write # for `git push`
steps:
- name: Check out code
uses: actions/checkout@v4
with:
ref: main
- name: Set environment variables
run: |
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
- name: Create release branch
run: git checkout -b $RELEASE_BRANCH
- name: Push new branch
run: git push origin $RELEASE_BRANCH
- name: Create pull request into release
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
## Proxy release ${RELEASE_DATE}
**Please merge this Pull Request using 'Create a merge commit' button**
EOF
gh pr create --title "Proxy release ${RELEASE_DATE}}" \
--body-file "body.md" \
--head "${RELEASE_BRANCH}" \
--base "release-proxy"

View File

@@ -9,7 +9,7 @@ on:
defaults:
run:
shell: bash -euxo pipefail {0}
env:
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
@@ -37,7 +37,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -51,8 +51,6 @@ jobs:
echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
@@ -117,3 +115,4 @@ jobs:
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
}
}"

View File

@@ -0,0 +1,70 @@
name: 'Update build tools image tag'
# This workflow it used to update tag of build tools in ECR.
# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
on:
workflow_dispatch:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
to-tag:
description: 'Destination tag'
required: true
type: string
default: 'pinned'
defaults:
run:
shell: bash -euo pipefail {0}
permissions: {}
jobs:
tag-image:
runs-on: [ self-hosted, gen3, small ]
env:
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
steps:
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v2
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- uses: actions/setup-go@v5
with:
go-version: '1.21'
- name: Install crane
run: |
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
- name: Copy images
run: |
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom

1
.gitignore vendored
View File

@@ -9,7 +9,6 @@ test_output/
neon.iml
/.neon
/integration_tests/.neon
compaction-suite-results.*
# Coverage
*.profraw

View File

@@ -1,10 +1,10 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/control_plane/attachment_service @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/control_plane/ @neondatabase/compute @neondatabase/storage
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
/pageserver/ @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/proxy

View File

@@ -74,11 +74,16 @@ We're using the following approach to make it work:
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
## How do I make build-tools image "pinned"
## How do I add the "pinned" tag to an buildtools image?
We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow.
You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
or using GitHub CLI:
```bash
gh workflow -R neondatabase/neon run pin-build-tools-image.yml \
-f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e
```
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-f from-tag=6254913013 \
-f to-tag=pinned \
# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
```

69
Cargo.lock generated
View File

@@ -284,10 +284,8 @@ dependencies = [
"diesel_migrations",
"futures",
"git-version",
"humantime",
"hyper",
"metrics",
"once_cell",
"pageserver_api",
"pageserver_client",
"postgres_connection",
@@ -1815,7 +1813,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb"
dependencies = [
"enumset_derive",
"serde",
]
[[package]]
@@ -2760,17 +2757,6 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "leaky-bucket"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
dependencies = [
"parking_lot 0.12.1",
"tokio",
"tracing",
]
[[package]]
name = "libc"
version = "0.2.150"
@@ -3462,7 +3448,6 @@ name = "pageserver"
version = "0.1.0"
dependencies = [
"anyhow",
"arc-swap",
"async-compression",
"async-stream",
"async-trait",
@@ -3490,7 +3475,6 @@ dependencies = [
"humantime-serde",
"hyper",
"itertools",
"leaky-bucket",
"md5",
"metrics",
"nix 0.27.1",
@@ -3498,7 +3482,6 @@ dependencies = [
"num_cpus",
"once_cell",
"pageserver_api",
"pageserver_compaction",
"pin-project-lite",
"postgres",
"postgres-protocol",
@@ -3553,9 +3536,7 @@ dependencies = [
"const_format",
"enum-map",
"hex",
"humantime",
"humantime-serde",
"itertools",
"postgres_ffi",
"rand 0.8.5",
"serde",
@@ -3589,53 +3570,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_compaction"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"async-stream",
"async-trait",
"byteorder",
"bytes",
"chrono",
"clap",
"const_format",
"consumption_metrics",
"criterion",
"crossbeam-utils",
"either",
"fail",
"flate2",
"futures",
"git-version",
"hex",
"hex-literal",
"humantime",
"humantime-serde",
"itertools",
"metrics",
"once_cell",
"pageserver_api",
"pin-project-lite",
"rand 0.8.5",
"smallvec",
"svg_fmt",
"sync_wrapper",
"thiserror",
"tokio",
"tokio-io-timeout",
"tokio-util",
"tracing",
"tracing-error",
"tracing-subscriber",
"url",
"utils",
"walkdir",
"workspace_hack",
]
[[package]]
name = "parking"
version = "2.1.1"
@@ -4502,7 +4436,6 @@ dependencies = [
"futures",
"futures-util",
"http-types",
"humantime",
"hyper",
"itertools",
"metrics",
@@ -4514,7 +4447,6 @@ dependencies = [
"serde_json",
"test-context",
"tokio",
"tokio-stream",
"tokio-util",
"toml_edit",
"tracing",
@@ -6413,7 +6345,6 @@ dependencies = [
"hex-literal",
"hyper",
"jsonwebtoken",
"leaky-bucket",
"metrics",
"nix 0.27.1",
"once_cell",

View File

@@ -5,7 +5,6 @@ members = [
"control_plane",
"control_plane/attachment_service",
"pageserver",
"pageserver/compaction",
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
@@ -98,7 +97,6 @@ ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "9"
lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
memoffset = "0.8"
@@ -200,7 +198,6 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }

View File

@@ -47,7 +47,7 @@ COPY --chown=nonroot . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
&& mold -run cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \

View File

@@ -769,40 +769,6 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
#########################################################################################
#
# Layer "pg_ivm"
# compile pg_ivm extension
#
#########################################################################################
FROM build-deps AS pg-ivm-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
#########################################################################################
#
# Layer "pg_partman"
# compile pg_partman extension
#
#########################################################################################
FROM build-deps AS pg-partman-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -844,8 +810,6 @@ COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -856,10 +820,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_test_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_rmgr \
@@ -891,17 +851,7 @@ ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Final compute-tools image
#
#########################################################################################
FROM debian:bullseye-slim AS compute-tools-image
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#

32
Dockerfile.compute-tools Normal file
View File

@@ -0,0 +1,32 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
WORKDIR /home/nonroot
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
ARG RUSTC_WRAPPER=cachepot
ENV AWS_REGION=eu-central-1
ENV CACHEPOT_S3_KEY_PREFIX=cachepot
ARG CACHEPOT_BUCKET=neon-github-dev
#ARG AWS_ACCESS_KEY_ID
#ARG AWS_SECRET_ACCESS_KEY
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
COPY . .
RUN set -e \
&& mold -run cargo build -p compute_tools --locked --release \
&& cachepot -s
# Final image that only has one binary
FROM debian:bullseye-slim
COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl

View File

@@ -159,8 +159,8 @@ neon-pg-ext-%: postgres-%
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
.PHONY: neon-pg-clean-ext-%
neon-pg-clean-ext-%:
.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
@@ -216,11 +216,11 @@ neon-pg-ext: \
neon-pg-ext-v15 \
neon-pg-ext-v16
.PHONY: neon-pg-clean-ext
neon-pg-clean-ext: \
neon-pg-clean-ext-v14 \
neon-pg-clean-ext-v15 \
neon-pg-clean-ext-v16
.PHONY: neon-pg-ext-clean
neon-pg-ext-clean: \
neon-pg-ext-clean-v14 \
neon-pg-ext-clean-v15 \
neon-pg-ext-clean-v16
# shorthand to build all Postgres versions
.PHONY: postgres
@@ -249,7 +249,7 @@ postgres-check: \
# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean: postgres-clean neon-pg-clean-ext
clean: postgres-clean neon-pg-ext-clean
$(CARGO_CMD_PREFIX) cargo clean
# This removes everything

View File

@@ -5,7 +5,7 @@
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
## Quick start
Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
Alternatively, compile and run the project [locally](#running-local-installation).
@@ -230,10 +230,6 @@ postgres=# select * from t;
> cargo neon stop
```
#### Handling build failures
If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.
## Running tests
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
@@ -253,22 +249,6 @@ testing locally, it is convenient to run just one set of permutations, like this
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
```
## Flamegraphs
You may find yourself in need of flamegraphs for software in this repository.
You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice!
>[!IMPORTANT]
> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument.
> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
## Cleanup
For cleaning up the source tree from build artifacts, run `make clean` in the source directory.
For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned!
## Documentation
[docs](/docs) Contains a top-level overview of all available markdown documentation.

View File

@@ -45,6 +45,7 @@ use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use nix::sys::signal::{kill, Signal};
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info};
@@ -52,9 +53,7 @@ use url::Url;
use compute_api::responses::ComputeStatus;
use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
};
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version;
use compute_tools::http::api::launch_http_server;
@@ -395,15 +394,6 @@ fn main() -> Result<()> {
info!("synced safekeepers at lsn {lsn}");
}
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::TerminationPending {
state.status = ComputeStatus::Terminated;
compute.state_changed.notify_all();
// we were asked to terminate gracefully, don't exit to avoid restart
delay_exit = true
}
drop(state);
if let Err(err) = compute.check_for_core_dumps() {
error!("error while checking for core dumps: {err:?}");
}
@@ -533,7 +523,16 @@ fn cli() -> clap::Command {
/// wait for termination which would be easy then.
fn handle_exit_signal(sig: i32) {
info!("received {sig} termination signal");
forward_termination_signal();
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
if ss_pid != 0 {
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
kill(ss_pid, Signal::SIGTERM).ok();
}
let pg_pid = PG_PID.load(Ordering::SeqCst);
if pg_pid != 0 {
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
kill(pg_pid, Signal::SIGTERM).ok();
}
exit(1);
}

View File

@@ -28,8 +28,6 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
use utils::measured_stream::MeasuredReader;
use nix::sys::signal::{kill, Signal};
use remote_storage::{DownloadError, RemotePath};
use crate::checker::create_availability_check_data;
@@ -326,8 +324,7 @@ impl ComputeNode {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Instant::now();
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let mut config = postgres::Config::from_str(shard0_connstr)?;
let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
// Use the storage auth token from the config file, if given.
// Note: this overrides any password set in the connection string.
@@ -1324,17 +1321,3 @@ LIMIT 100",
Ok(remote_ext_metrics)
}
}
pub fn forward_termination_signal() {
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
if ss_pid != 0 {
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
kill(ss_pid, Signal::SIGTERM).ok();
}
let pg_pid = PG_PID.load(Ordering::SeqCst);
if pg_pid != 0 {
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
// use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
kill(pg_pid, Signal::SIGQUIT).ok();
}
}

View File

@@ -51,9 +51,6 @@ pub fn write_postgres_conf(
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if !spec.safekeeper_connstrings.is_empty() {
writeln!(
file,
@@ -82,12 +79,6 @@ pub fn write_postgres_conf(
ComputeMode::Replica => {
// hot_standby is 'on' by default, but let's be explicit
writeln!(file, "hot_standby=on")?;
// Inform the replica about the primary state
// Default is 'false'
if let Some(primary_is_running) = spec.primary_is_running {
writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
}
}
}

View File

@@ -5,7 +5,6 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
@@ -124,17 +123,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
(&Method::POST, "/terminate") => {
info!("serving /terminate POST request");
match handle_terminate_request(compute).await {
Ok(()) => Response::new(Body::empty()),
Err((msg, code)) => {
error!("error handling /terminate request: {msg}");
render_json_error(&msg, code)
}
}
}
// download extension files from remote extension storage on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
@@ -309,49 +297,6 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
.unwrap()
}
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
{
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::Terminated {
return Ok(());
}
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for termination request: {:?}",
state.status.clone()
);
return Err((msg, StatusCode::PRECONDITION_FAILED));
}
state.status = ComputeStatus::TerminationPending;
compute.state_changed.notify_all();
drop(state);
}
forward_termination_signal();
info!("sent signal and notified waiters");
// Spawn a blocking thread to wait for compute to become Terminated.
// This is needed to do not block the main pool of workers and
// be able to serve other requests while some particular request
// is waiting for compute to finish configuration.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Terminated {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become Terminated, current status: {:?}",
state.status
);
}
Ok(())
})
.await
.unwrap()?;
info!("terminated Postgres");
Ok(())
}
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(port: u16, state: Arc<ComputeNode>) {

View File

@@ -168,29 +168,6 @@ paths:
schema:
$ref: "#/components/schemas/GenericError"
/terminate:
post:
tags:
- Terminate
summary: Terminate Postgres and wait for it to exit
description: ""
operationId: terminate
responses:
200:
description: Result
412:
description: "wrong state"
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: "Unexpected error"
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:
JWT:

View File

@@ -655,9 +655,6 @@ pub fn handle_grants(
// remove this code if possible. The worst thing that could happen is that
// user won't be able to use public schema in NEW databases created in the
// very OLD project.
//
// Also, alter default permissions so that relations created by extensions can be
// used by neon_superuser without permission issues.
let grant_query = "DO $$\n\
BEGIN\n\
IF EXISTS(\n\
@@ -676,15 +673,6 @@ pub fn handle_grants(
GRANT CREATE ON SCHEMA public TO web_access;\n\
END IF;\n\
END IF;\n\
IF EXISTS(\n\
SELECT nspname\n\
FROM pg_catalog.pg_namespace\n\
WHERE nspname = 'public'\n\
)\n\
THEN\n\
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
END IF;\n\
END\n\
$$;"
.to_string();
@@ -789,12 +777,6 @@ BEGIN
END
$$;"#,
"GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
// Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else.
"",
"",
"",
"",
// Add new migrations below.
];
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
@@ -821,13 +803,8 @@ $$;"#,
client.simple_query(query)?;
while current_migration < migrations.len() {
let migration = &migrations[current_migration];
if migration.is_empty() {
info!("Skip migration id={}", current_migration);
} else {
info!("Running migration:\n{}\n", migration);
client.simple_query(migration)?;
}
info!("Running migration:\n{}\n", migrations[current_migration]);
client.simple_query(migrations[current_migration])?;
current_migration += 1;
}
let setval = format!(

View File

@@ -4,11 +4,6 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
# Enables test-only APIs and behaviors
testing = []
[dependencies]
anyhow.workspace = true
aws-config.workspace = true
@@ -18,8 +13,6 @@ clap.workspace = true
futures.workspace = true
git-version.workspace = true
hyper.workspace = true
humantime.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true

View File

@@ -1,9 +0,0 @@
use utils::auth::{AuthError, Claims, Scope};
pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
if claims.scope != required_scope {
return Err(AuthError("Scope mismatch. Permission denied".into()));
}
Ok(())
}

View File

@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
use control_plane::local_env::LocalEnv;
use hyper::{Method, StatusCode};
use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, TenantShardId};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
@@ -77,7 +77,7 @@ impl ComputeHookTenant {
self.shards
.sort_by_key(|(shard, _node_id)| shard.shard_number);
if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
// We have pageservers for all the shards: emit a configuration update
return Some(ComputeHookNotifyRequest {
tenant_id,
@@ -94,7 +94,7 @@ impl ComputeHookTenant {
tracing::info!(
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
self.shards.len(),
shard_count.count()
shard_count.0
);
}
@@ -155,7 +155,7 @@ impl ComputeHook {
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
endpoint.reconfigure(compute_pageservers.clone()).await?;
}
}
@@ -177,7 +177,7 @@ impl ComputeHook {
req
};
tracing::info!(
tracing::debug!(
"Sending notify request to {} ({:?})",
url,
reconfigure_request
@@ -266,7 +266,7 @@ impl ComputeHook {
/// periods, but we don't retry forever. The **caller** is responsible for handling failures and
/// ensuring that they eventually call again to ensure that the compute is eventually notified of
/// the proper pageserver nodes for a tenant.
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
#[tracing::instrument(skip_all, fields(tenant_shard_id, node_id))]
pub(super) async fn notify(
&self,
tenant_shard_id: TenantShardId,
@@ -298,7 +298,7 @@ impl ComputeHook {
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
tracing::info!("Tenant isn't yet ready to emit a notification");
tracing::debug!("Tenant isn't yet ready to emit a notification",);
return Ok(());
};

View File

@@ -4,15 +4,15 @@ use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
TimelineCreateRequest,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use std::sync::Arc;
use std::time::{Duration, Instant};
use utils::auth::{Scope, SwappableJwtAuth};
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
use utils::http::request::{must_get_query_param, parse_request_param};
use utils::auth::SwappableJwtAuth;
use utils::http::endpoint::{auth_middleware, request_span};
use utils::http::request::parse_request_param;
use utils::id::{TenantId, TimelineId};
use utils::{
@@ -25,12 +25,12 @@ use utils::{
id::NodeId,
};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
};
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
use control_plane::attachment_service::{
AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
TenantShardMigrateRequest,
};
/// State available to HTTP request handlers
#[derive(Clone)]
@@ -64,18 +64,21 @@ fn get_state(request: &Request<Body>) -> &HttpState {
/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::GenerationsApi)?;
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
json_response(
StatusCode::OK,
state
.service
.re_attach(reattach_req)
.await
.map_err(ApiError::InternalServerError)?,
)
}
/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::GenerationsApi)?;
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.validate(validate_req))
@@ -85,8 +88,6 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
let state = get_state(&req);
@@ -101,8 +102,6 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
}
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
let state = get_state(&req);
@@ -114,13 +113,8 @@ async fn handle_tenant_create(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
json_response(
StatusCode::CREATED,
service.tenant_create(create_req).await?,
)
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
}
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
@@ -174,8 +168,6 @@ async fn handle_tenant_location_config(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
json_response(
StatusCode::OK,
@@ -185,47 +177,11 @@ async fn handle_tenant_location_config(
)
}
async fn handle_tenant_time_travel_remote_storage(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
let timestamp_raw = must_get_query_param(&req, "travel_to")?;
let _timestamp = humantime::parse_rfc3339(&timestamp_raw).map_err(|_e| {
ApiError::BadRequest(anyhow::anyhow!(
"Invalid time for travel_to: {timestamp_raw:?}"
))
})?;
let done_if_after_raw = must_get_query_param(&req, "done_if_after")?;
let _done_if_after = humantime::parse_rfc3339(&done_if_after_raw).map_err(|_e| {
ApiError::BadRequest(anyhow::anyhow!(
"Invalid time for done_if_after: {done_if_after_raw:?}"
))
})?;
service
.tenant_time_travel_remote_storage(
&time_travel_req,
tenant_id,
timestamp_raw,
done_if_after_raw,
)
.await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_delete(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
deletion_wrapper(service, move |service| async move {
service.tenant_delete(tenant_id).await
@@ -238,11 +194,9 @@ async fn handle_tenant_timeline_create(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
json_response(
StatusCode::CREATED,
StatusCode::OK,
service
.tenant_timeline_create(tenant_id, create_req)
.await?,
@@ -254,8 +208,6 @@ async fn handle_tenant_timeline_delete(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
deletion_wrapper(service, move |service| async move {
@@ -269,7 +221,6 @@ async fn handle_tenant_timeline_passthrough(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let Some(path) = req.uri().path_and_query() else {
// This should never happen, our request router only calls us if there is a path
@@ -313,15 +264,11 @@ async fn handle_tenant_locate(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
let state = get_state(&req);
state.service.node_register(register_req).await?;
@@ -329,23 +276,17 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
}
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.node_list().await?)
}
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
}
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let node_id: NodeId = parse_request_param(&req, "node_id")?;
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
if node_id != config_req.node_id {
@@ -355,18 +296,13 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
}
let state = get_state(&req);
json_response(
StatusCode::OK,
state.service.node_configure(config_req).await?,
)
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
}
async fn handle_tenant_shard_split(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
@@ -380,8 +316,6 @@ async fn handle_tenant_shard_migrate(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
json_response(
@@ -394,35 +328,11 @@ async fn handle_tenant_shard_migrate(
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
}
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
state.service.tenants_dump()
}
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
state.service.scheduler_dump()
}
async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.consistency_check().await?)
}
/// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, ())
@@ -474,12 +384,6 @@ where
.await
}
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
check_permission_with(request, |claims| {
crate::auth::check_permission(claims, required_scope)
})
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
@@ -517,13 +421,6 @@ pub fn make_router(
.post("/debug/v1/node/:node_id/drop", |r| {
request_span(r, handle_node_drop)
})
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
.get("/debug/v1/scheduler", |r| {
request_span(r, handle_scheduler_dump)
})
.post("/debug/v1/consistency_check", |r| {
request_span(r, handle_consistency_check)
})
.get("/control/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
})
@@ -554,9 +451,6 @@ pub fn make_router(
.put("/v1/tenant/:tenant_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
})
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)

View File

@@ -1,10 +1,8 @@
use serde::{Deserialize, Serialize};
use utils::seqwait::MonotonicCounter;
mod auth;
mod compute_hook;
pub mod http;
pub mod metrics;
mod node;
pub mod persistence;
mod reconciler;
@@ -13,7 +11,7 @@ mod schema;
pub mod service;
mod tenant_state;
#[derive(Clone, Serialize, Deserialize, Debug)]
#[derive(Clone, Serialize, Deserialize)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
@@ -24,7 +22,7 @@ enum PlacementPolicy {
Detached,
}
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
struct Sequence(u64);
impl Sequence {
@@ -39,12 +37,6 @@ impl std::fmt::Display for Sequence {
}
}
impl std::fmt::Debug for Sequence {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl MonotonicCounter<Sequence> for Sequence {
fn cnt_advance(&mut self, v: Sequence) {
assert!(*self <= v);

View File

@@ -6,7 +6,6 @@
///
use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use aws_config::{self, BehaviorVersion, Region};
@@ -16,7 +15,6 @@ use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use std::sync::Arc;
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
@@ -206,8 +204,6 @@ async fn async_main() -> anyhow::Result<()> {
logging::Output::Stdout,
)?;
preinitialize_metrics();
let args = Cli::parse();
tracing::info!(
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
@@ -241,23 +237,15 @@ async fn async_main() -> anyhow::Result<()> {
let auth = secrets
.public_key
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
let router = make_router(service.clone(), auth)
let router = make_router(service, auth)
.build()
.map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
// Start HTTP server
let server_shutdown = CancellationToken::new();
let server = hyper::Server::from_tcp(http_listener)?
.serve(router_service)
.with_graceful_shutdown({
let server_shutdown = server_shutdown.clone();
async move {
server_shutdown.cancelled().await;
}
});
tracing::info!("Serving on {0}", args.listen);
let server_task = tokio::task::spawn(server);
tokio::task::spawn(server);
// Wait until we receive a signal
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
@@ -278,16 +266,5 @@ async fn async_main() -> anyhow::Result<()> {
}
}
// Stop HTTP server first, so that we don't have to service requests
// while shutting down Service
server_shutdown.cancel();
if let Err(e) = server_task.await {
tracing::error!("Error joining HTTP server task: {e}")
}
tracing::info!("Joined HTTP server task");
service.shutdown().await;
tracing::info!("Service shutdown complete");
std::process::exit(0);
}

View File

@@ -1,32 +0,0 @@
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
pub(crate) struct ReconcilerMetrics {
pub(crate) spawned: IntCounter,
pub(crate) complete: IntCounterVec,
}
impl ReconcilerMetrics {
// Labels used on [`Self::complete`]
pub(crate) const SUCCESS: &'static str = "ok";
pub(crate) const ERROR: &'static str = "success";
pub(crate) const CANCEL: &'static str = "cancel";
}
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
spawned: register_int_counter!(
"storage_controller_reconcile_spawn",
"Count of how many times we spawn a reconcile task",
)
.expect("failed to define a metric"),
complete: register_int_counter_vec!(
"storage_controller_reconcile_complete",
"Reconciler tasks completed, broken down by success/failure/cancelled",
&["status"],
)
.expect("failed to define a metric"),
});
pub fn preinitialize_metrics() {
Lazy::force(&RECONCILER);
}

View File

@@ -1,16 +1,9 @@
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
use serde::Serialize;
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;
use crate::persistence::NodePersistence;
/// Represents the in-memory description of a Node.
///
/// Scheduling statistics are maintened separately in [`crate::scheduler`].
///
/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
/// implementation of serialization on this type is only for debug dumps.
#[derive(Clone, Serialize)]
#[derive(Clone)]
pub(crate) struct Node {
pub(crate) id: NodeId,

View File

@@ -6,10 +6,10 @@ use std::time::Duration;
use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::NodeSchedulingPolicy;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use serde::{Deserialize, Serialize};
@@ -130,10 +130,24 @@ impl Persistence {
}
/// At startup, populate the list of nodes which our shards may be placed on
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
let nodes: Vec<NodePersistence> = self
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
let nodes: Vec<Node> = self
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
Ok(crate::schema::nodes::table
.load::<NodePersistence>(conn)?
.into_iter()
.map(|n| Node {
id: NodeId(n.node_id as u64),
// At startup we consider a node offline until proven otherwise.
availability: NodeAvailability::Offline,
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
.expect("Bad scheduling policy in DB"),
listen_http_addr: n.listen_http_addr,
listen_http_port: n.listen_http_port as u16,
listen_pg_addr: n.listen_pg_addr,
listen_pg_port: n.listen_pg_port as u16,
})
.collect::<Vec<Node>>())
})
.await?;
@@ -142,31 +156,6 @@ impl Persistence {
Ok(nodes)
}
pub(crate) async fn update_node(
&self,
input_node_id: NodeId,
input_scheduling: NodeSchedulingPolicy,
) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
let updated = self
.with_conn(move |conn| {
let updated = diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.set((scheduling_policy.eq(String::from(input_scheduling)),))
.execute(conn)?;
Ok(updated)
})
.await?;
if updated != 1 {
Err(DatabaseError::Logical(format!(
"Node {node_id:?} not found for update",
)))
} else {
Ok(())
}
}
/// At startup, load the high level state for shards, such as their config + policy. This will
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
@@ -233,7 +222,7 @@ impl Persistence {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
tenants_map.insert(tenant_shard_id, tsp);
@@ -329,7 +318,7 @@ impl Persistence {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
.map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
}
@@ -351,7 +340,7 @@ impl Persistence {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
.set((
generation.eq(generation + 1),
generation_pageserver.eq(node_id.0 as i64),
@@ -373,7 +362,7 @@ impl Persistence {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
.set((
generation_pageserver.eq(i64::MAX),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
@@ -392,6 +381,7 @@ impl Persistence {
//
// We create the child shards here, so that they will be available for increment_generation calls
// if some pageserver holding a child shard needs to restart before the overall tenant split is complete.
#[allow(dead_code)]
pub(crate) async fn begin_shard_split(
&self,
old_shard_count: ShardCount,
@@ -403,19 +393,21 @@ impl Persistence {
conn.transaction(|conn| -> DatabaseResult<()> {
// Mark parent shards as splitting
let expect_parent_records = std::cmp::max(1, old_shard_count.0);
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.filter(shard_count.eq(old_shard_count.0 as i32))
.set((splitting.eq(1),))
.execute(conn)?;
if u8::try_from(updated)
.map_err(|_| DatabaseError::Logical(
format!("Overflow existing shard count {} while splitting", updated))
)? != old_shard_count.count() {
)? != expect_parent_records {
// Perhaps a deletion or another split raced with this attempt to split, mutating
// the parent shards that we intend to split. In this case the split request should fail.
return Err(DatabaseError::Logical(
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})")
));
}
@@ -427,7 +419,7 @@ impl Persistence {
let mut parent = crate::schema::tenant_shards::table
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
.filter(shard_count.eq(parent_shard_id.shard_count.0 as i32))
.load::<TenantShardPersistence>(conn)?;
let parent = if parent.len() != 1 {
return Err(DatabaseError::Logical(format!(
@@ -457,6 +449,7 @@ impl Persistence {
// When we finish shard splitting, we must atomically clean up the old shards
// and insert the new shards, and clear the splitting marker.
#[allow(dead_code)]
pub(crate) async fn complete_shard_split(
&self,
split_tenant_id: TenantId,
@@ -468,7 +461,7 @@ impl Persistence {
// Drop parent shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.filter(shard_count.eq(old_shard_count.0 as i32))
.execute(conn)?;
// Clear sharding flag
@@ -488,7 +481,7 @@ impl Persistence {
}
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
#[diesel(table_name = crate::schema::tenant_shards)]
pub(crate) struct TenantShardPersistence {
#[serde(default)]
@@ -517,7 +510,7 @@ pub(crate) struct TenantShardPersistence {
}
/// Parts of [`crate::node::Node`] that are stored durably
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
#[diesel(table_name = crate::schema::nodes)]
pub(crate) struct NodePersistence {
pub(crate) node_id: i64,

View File

@@ -1,6 +1,6 @@
use crate::persistence::Persistence;
use crate::service;
use pageserver_api::controller_api::NodeAvailability;
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
@@ -13,7 +13,6 @@ use tokio_util::sync::CancellationToken;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
use utils::lsn::Lsn;
use utils::sync::gate::GateGuard;
use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
@@ -27,7 +26,7 @@ pub(super) struct Reconciler {
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Generation,
pub(crate) intent: TargetState,
pub(crate) intent: IntentState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,
@@ -54,46 +53,14 @@ pub(super) struct Reconciler {
/// the tenant is changed.
pub(crate) cancel: CancellationToken,
/// Reconcilers are registered with a Gate so that during a graceful shutdown we
/// can wait for all the reconcilers to respond to their cancellation tokens.
pub(crate) _gate_guard: GateGuard,
/// Access to persistent storage for updating generation numbers
pub(crate) persistence: Arc<Persistence>,
}
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
/// and the TargetState is just the instruction for a particular Reconciler run.
#[derive(Debug)]
pub(crate) struct TargetState {
pub(crate) attached: Option<NodeId>,
pub(crate) secondary: Vec<NodeId>,
}
impl TargetState {
pub(crate) fn from_intent(intent: &IntentState) -> Self {
Self {
attached: *intent.get_attached(),
secondary: intent.get_secondary().clone(),
}
}
fn all_pageservers(&self) -> Vec<NodeId> {
let mut result = self.secondary.clone();
if let Some(node_id) = &self.attached {
result.push(*node_id);
}
result
}
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum ReconcileError {
#[error(transparent)]
Notify(#[from] NotifyError),
#[error("Cancelled")]
Cancel,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
@@ -296,7 +263,7 @@ impl Reconciler {
secondary_conf,
tenant_conf: config.clone(),
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
}
}
@@ -438,7 +405,7 @@ impl Reconciler {
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!(%node_id, "Observed configuration already correct.")
tracing::info!("Observed configuration already correct.")
}
_ => {
// In all cases other than a matching observed configuration, we will
@@ -449,7 +416,7 @@ impl Reconciler {
.increment_generation(self.tenant_shard_id, node_id)
.await?;
wanted_conf.generation = self.generation.into();
tracing::info!(%node_id, "Observed configuration requires update.");
tracing::info!("Observed configuration requires update.");
self.location_config(node_id, wanted_conf, None).await?;
self.compute_notify().await?;
}
@@ -491,7 +458,7 @@ impl Reconciler {
generation: None,
secondary_conf: None,
shard_number: self.shard.number.0,
shard_count: self.shard.count.literal(),
shard_count: self.shard.count.0,
shard_stripe_size: self.shard.stripe_size.0,
tenant_conf: self.config.clone(),
},
@@ -499,9 +466,6 @@ impl Reconciler {
}
for (node_id, conf) in changes {
if self.cancel.is_cancelled() {
return Err(ReconcileError::Cancel);
}
self.location_config(node_id, conf, None).await?;
}
@@ -542,7 +506,7 @@ pub(crate) fn attached_location_conf(
generation: generation.into(),
secondary_conf: None,
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
}
@@ -557,7 +521,7 @@ pub(crate) fn secondary_location_conf(
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
}

View File

@@ -1,8 +1,9 @@
use crate::{node::Node, tenant_state::TenantState};
use serde::Serialize;
use std::collections::HashMap;
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use utils::{http::error::ApiError, id::NodeId};
use crate::{node::Node, tenant_state::TenantState};
/// Scenarios in which we cannot find a suitable location for a tenant shard
#[derive(thiserror::Error, Debug)]
pub enum ScheduleError {
@@ -18,203 +19,52 @@ impl From<ScheduleError> for ApiError {
}
}
#[derive(Serialize, Eq, PartialEq)]
struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
shard_count: usize,
/// Whether this node is currently elegible to have new shards scheduled (this is derived
/// from a node's availability state and scheduling policy).
may_schedule: bool,
}
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
/// on which to run.
///
/// The type has no persistent state of its own: this is all populated at startup. The Serialize
/// impl is only for debug dumps.
#[derive(Serialize)]
pub(crate) struct Scheduler {
nodes: HashMap<NodeId, SchedulerNode>,
tenant_counts: HashMap<NodeId, usize>,
}
impl Scheduler {
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
let mut scheduler_nodes = HashMap::new();
for node in nodes {
scheduler_nodes.insert(
node.id,
SchedulerNode {
shard_count: 0,
may_schedule: node.may_schedule(),
},
);
pub(crate) fn new(
tenants: &BTreeMap<TenantShardId, TenantState>,
nodes: &HashMap<NodeId, Node>,
) -> Self {
let mut tenant_counts = HashMap::new();
for node_id in nodes.keys() {
tenant_counts.insert(*node_id, 0);
}
Self {
nodes: scheduler_nodes,
}
}
/// For debug/support: check that our internal statistics are in sync with the state of
/// the nodes & tenant shards.
///
/// If anything is inconsistent, log details and return an error.
pub(crate) fn consistency_check<'a>(
&self,
nodes: impl Iterator<Item = &'a Node>,
shards: impl Iterator<Item = &'a TenantState>,
) -> anyhow::Result<()> {
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
for node in nodes {
expect_nodes.insert(
node.id,
SchedulerNode {
shard_count: 0,
may_schedule: node.may_schedule(),
},
);
}
for shard in shards {
if let Some(node_id) = shard.intent.get_attached() {
match expect_nodes.get_mut(node_id) {
Some(node) => node.shard_count += 1,
None => anyhow::bail!(
"Tenant {} references nonexistent node {}",
shard.tenant_shard_id,
node_id
),
}
}
for node_id in shard.intent.get_secondary() {
match expect_nodes.get_mut(node_id) {
Some(node) => node.shard_count += 1,
None => anyhow::bail!(
"Tenant {} references nonexistent node {}",
shard.tenant_shard_id,
node_id
),
}
for tenant in tenants.values() {
if let Some(ps) = tenant.intent.attached {
let entry = tenant_counts.entry(ps).or_insert(0);
*entry += 1;
}
}
for (node_id, expect_node) in &expect_nodes {
let Some(self_node) = self.nodes.get(node_id) else {
anyhow::bail!("Node {node_id} not found in Self")
};
if self_node != expect_node {
tracing::error!("Inconsistency detected in scheduling state for node {node_id}");
tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?);
tracing::error!("Self state: {}", serde_json::to_string(self_node)?);
anyhow::bail!("Inconsistent state on {node_id}");
for (node_id, node) in nodes {
if !node.may_schedule() {
tenant_counts.remove(node_id);
}
}
if expect_nodes.len() != self.nodes.len() {
// We just checked that all the expected nodes are present. If the lengths don't match,
// it means that we have nodes in Self that are unexpected.
for node_id in self.nodes.keys() {
if !expect_nodes.contains_key(node_id) {
anyhow::bail!("Node {node_id} found in Self but not in expected nodes");
}
}
}
Ok(())
Self { tenant_counts }
}
/// Increment the reference count of a node. This reference count is used to guide scheduling
/// decisions, not for memory management: it represents one tenant shard whose IntentState targets
/// this node.
///
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
/// [`Self::new`] or [`Self::node_upsert`])
pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
let Some(node) = self.nodes.get_mut(&node_id) else {
tracing::error!("Scheduler missing node {node_id}");
debug_assert!(false);
return;
};
node.shard_count += 1;
}
/// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`].
pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
let Some(node) = self.nodes.get_mut(&node_id) else {
debug_assert!(false);
tracing::error!("Scheduler missing node {node_id}");
return;
};
node.shard_count -= 1;
}
pub(crate) fn node_upsert(&mut self, node: &Node) {
use std::collections::hash_map::Entry::*;
match self.nodes.entry(node.id) {
Occupied(mut entry) => {
entry.get_mut().may_schedule = node.may_schedule();
}
Vacant(entry) => {
entry.insert(SchedulerNode {
shard_count: 0,
may_schedule: node.may_schedule(),
});
}
}
}
pub(crate) fn node_remove(&mut self, node_id: NodeId) {
if self.nodes.remove(&node_id).is_none() {
tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler");
}
}
/// Where we have several nodes to choose from, for example when picking a secondary location
/// to promote to an attached location, this method may be used to pick the best choice based
/// on the scheduler's knowledge of utilization and availability.
///
/// If the input is empty, or all the nodes are not elegible for scheduling, return None: the
/// caller can pick a node some other way.
pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
if nodes.is_empty() {
return None;
}
let node = nodes
.iter()
.map(|node_id| {
let may_schedule = self
.nodes
.get(node_id)
.map(|n| n.may_schedule)
.unwrap_or(false);
(*node_id, may_schedule)
})
.max_by_key(|(_n, may_schedule)| *may_schedule);
// If even the preferred node has may_schedule==false, return None
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
}
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
if self.nodes.is_empty() {
pub(crate) fn schedule_shard(
&mut self,
hard_exclude: &[NodeId],
) -> Result<NodeId, ScheduleError> {
if self.tenant_counts.is_empty() {
return Err(ScheduleError::NoPageservers);
}
let mut tenant_counts: Vec<(NodeId, usize)> = self
.nodes
.tenant_counts
.iter()
.filter_map(|(k, v)| {
if hard_exclude.contains(k) || !v.may_schedule {
if hard_exclude.contains(k) {
None
} else {
Some((*k, v.shard_count))
Some((*k, *v))
}
})
.collect();
@@ -223,109 +73,17 @@ impl Scheduler {
tenant_counts.sort_by_key(|i| (i.1, i.0));
if tenant_counts.is_empty() {
// After applying constraints, no pageservers were left. We log some detail about
// the state of nodes to help understand why this happened. This is not logged as an error because
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
for (node_id, node) in &self.nodes {
tracing::info!(
"Node {node_id}: may_schedule={} shards={}",
node.may_schedule,
node.shard_count
);
}
// After applying constraints, no pageservers were left
return Err(ScheduleError::ImpossibleConstraint);
}
for (node_id, count) in &tenant_counts {
tracing::info!("tenant_counts[{node_id}]={count}");
}
let node_id = tenant_counts.first().unwrap().0;
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
);
// Note that we do not update shard count here to reflect the scheduling: that
// is IntentState's job when the scheduled location is used.
tracing::info!("scheduler selected node {node_id}");
*self.tenant_counts.get_mut(&node_id).unwrap() += 1;
Ok(node_id)
}
}
#[cfg(test)]
pub(crate) mod test_utils {
use crate::node::Node;
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
use std::collections::HashMap;
use utils::id::NodeId;
/// Test helper: synthesize the requested number of nodes, all in active state.
///
/// Node IDs start at one.
pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
(1..n + 1)
.map(|i| {
(
NodeId(i),
Node {
id: NodeId(i),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
listen_http_addr: format!("httphost-{i}"),
listen_http_port: 80 + i as u16,
listen_pg_addr: format!("pghost-{i}"),
listen_pg_port: 5432 + i as u16,
},
)
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
use utils::id::NodeId;
use crate::tenant_state::IntentState;
#[test]
fn scheduler_basic() -> anyhow::Result<()> {
let nodes = test_utils::make_test_nodes(2);
let mut scheduler = Scheduler::new(nodes.values());
let mut t1_intent = IntentState::new();
let mut t2_intent = IntentState::new();
let scheduled = scheduler.schedule_shard(&[])?;
t1_intent.set_attached(&mut scheduler, Some(scheduled));
let scheduled = scheduler.schedule_shard(&[])?;
t2_intent.set_attached(&mut scheduler, Some(scheduled));
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
t1_intent.push_secondary(&mut scheduler, scheduled);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
t1_intent.clear(&mut scheduler);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
if cfg!(debug_assertions) {
// Dropping an IntentState without clearing it causes a panic in debug mode,
// because we have failed to properly update scheduler shard counts.
let result = std::panic::catch_unwind(move || {
drop(t2_intent);
});
assert!(result.is_err());
} else {
t2_intent.clear(&mut scheduler);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
}
Ok(())
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,47 +1,27 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use crate::{metrics, persistence::TenantShardPersistence};
use pageserver_api::controller_api::NodeAvailability;
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
};
use serde::Serialize;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::{instrument, Instrument};
use utils::{
generation::Generation,
id::NodeId,
seqwait::{SeqWait, SeqWaitError},
sync::gate::Gate,
};
use crate::{
compute_hook::ComputeHook,
node::Node,
persistence::{split_state::SplitState, Persistence},
reconciler::{
attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
},
persistence::Persistence,
reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
scheduler::{ScheduleError, Scheduler},
service, PlacementPolicy, Sequence,
};
/// Serialization helper
fn read_mutex_content<S, T>(v: &std::sync::Mutex<T>, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
T: Clone + std::fmt::Display,
{
serializer.collect_str(&v.lock().unwrap())
}
/// In-memory state for a particular tenant shard.
///
/// This struct implement Serialize for debugging purposes, but is _not_ persisted
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
#[derive(Serialize)]
pub(crate) struct TenantState {
pub(crate) tenant_shard_id: TenantShardId,
@@ -76,29 +56,20 @@ pub(crate) struct TenantState {
/// If a reconcile task is currently in flight, it may be joined here (it is
/// only safe to join if either the result has been received or the reconciler's
/// cancellation token has been fired)
#[serde(skip)]
pub(crate) reconciler: Option<ReconcilerHandle>,
/// If a tenant is being split, then all shards with that TenantId will have a
/// SplitState set, this acts as a guard against other operations such as background
/// reconciliation, and timeline creation.
pub(crate) splitting: SplitState,
/// Optionally wait for reconciliation to complete up to a particular
/// sequence number.
#[serde(skip)]
pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
/// Indicates sequence number for which we have encountered an error reconciling. If
/// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
/// and callers should stop waiting for `waiter` and propagate the error.
#[serde(skip)]
pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
/// The most recent error from a reconcile on this tenant
/// TODO: generalize to an array of recent events
/// TOOD: use a ArcSwap instead of mutex for faster reads?
#[serde(serialize_with = "read_mutex_content")]
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
/// If we have a pending compute notification that for some reason we weren't able to send,
@@ -108,131 +79,13 @@ pub(crate) struct TenantState {
pub(crate) pending_compute_notification: bool,
}
#[derive(Default, Clone, Debug, Serialize)]
#[derive(Default, Clone, Debug)]
pub(crate) struct IntentState {
attached: Option<NodeId>,
secondary: Vec<NodeId>,
pub(crate) attached: Option<NodeId>,
pub(crate) secondary: Vec<NodeId>,
}
impl IntentState {
pub(crate) fn new() -> Self {
Self {
attached: None,
secondary: vec![],
}
}
pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
if let Some(node_id) = node_id {
scheduler.node_inc_ref(node_id);
}
Self {
attached: node_id,
secondary: vec![],
}
}
pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
if self.attached != new_attached {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
}
if let Some(new_attached) = &new_attached {
scheduler.node_inc_ref(*new_attached);
}
self.attached = new_attached;
}
}
/// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from
/// secondary to attached while maintaining the scheduler's reference counts.
pub(crate) fn promote_attached(
&mut self,
_scheduler: &mut Scheduler,
promote_secondary: NodeId,
) {
// If we call this with a node that isn't in secondary, it would cause incorrect
// scheduler reference counting, since we assume the node is already referenced as a secondary.
debug_assert!(self.secondary.contains(&promote_secondary));
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
self.secondary.retain(|n| n != &promote_secondary);
self.attached = Some(promote_secondary);
}
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
debug_assert!(!self.secondary.contains(&new_secondary));
scheduler.node_inc_ref(new_secondary);
self.secondary.push(new_secondary);
}
/// It is legal to call this with a node that is not currently a secondary: that is a no-op
pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
let index = self.secondary.iter().position(|n| *n == node_id);
if let Some(index) = index {
scheduler.node_dec_ref(node_id);
self.secondary.remove(index);
}
}
pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
for secondary in self.secondary.drain(..) {
scheduler.node_dec_ref(secondary);
}
}
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
}
self.clear_secondary(scheduler);
}
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
let mut result = Vec::new();
if let Some(p) = self.attached {
result.push(p)
}
result.extend(self.secondary.iter().copied());
result
}
pub(crate) fn get_attached(&self) -> &Option<NodeId> {
&self.attached
}
pub(crate) fn get_secondary(&self) -> &Vec<NodeId> {
&self.secondary
}
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
self.attached = None;
self.secondary.push(node_id);
true
} else {
false
}
}
}
impl Drop for IntentState {
fn drop(&mut self) {
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
}
}
#[derive(Default, Clone, Serialize)]
#[derive(Default, Clone)]
pub(crate) struct ObservedState {
pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
}
@@ -246,7 +99,7 @@ pub(crate) struct ObservedState {
/// what it is (e.g. we failed partway through configuring it)
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
/// and that configuration will still be present unless something external interfered.
#[derive(Clone, Serialize)]
#[derive(Clone)]
pub(crate) struct ObservedStateLocation {
/// If None, it means we do not know the status of this shard's location on this node, but
/// we know that we might have some state on this node.
@@ -322,6 +175,46 @@ pub(crate) struct ReconcileResult {
pub(crate) pending_compute_notification: bool,
}
impl IntentState {
pub(crate) fn new() -> Self {
Self {
attached: None,
secondary: vec![],
}
}
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
let mut result = Vec::new();
if let Some(p) = self.attached {
result.push(p)
}
result.extend(self.secondary.iter().copied());
result
}
pub(crate) fn single(node_id: Option<NodeId>) -> Self {
Self {
attached: node_id,
secondary: vec![],
}
}
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
self.attached = None;
self.secondary.push(node_id);
true
} else {
false
}
}
}
impl ObservedState {
pub(crate) fn new() -> Self {
Self {
@@ -345,7 +238,6 @@ impl TenantState {
observed: ObservedState::default(),
config: TenantConfig::default(),
reconciler: None,
splitting: SplitState::Idle,
sequence: Sequence(1),
waiter: Arc::new(SeqWait::new(Sequence(0))),
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
@@ -389,9 +281,6 @@ impl TenantState {
// All remaining observed locations generate secondary intents. This includes None
// observations, as these may well have some local content on disk that is usable (this
// is an edge case that might occur if we restarted during a migration or other change)
//
// We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`]
// will take care of promoting one of these secondaries to be attached.
self.observed.locations.keys().for_each(|node_id| {
if Some(*node_id) != self.intent.attached {
self.intent.secondary.push(*node_id);
@@ -399,33 +288,6 @@ impl TenantState {
});
}
/// Part of [`Self::schedule`] that is used to choose exactly one node to act as the
/// attached pageserver for a shard.
///
/// Returns whether we modified it, and the NodeId selected.
fn schedule_attached(
&mut self,
scheduler: &mut Scheduler,
) -> Result<(bool, NodeId), ScheduleError> {
// No work to do if we already have an attached tenant
if let Some(node_id) = self.intent.attached {
return Ok((false, node_id));
}
if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) {
// Promote a secondary
tracing::debug!("Promoted secondary {} to attached", promote_secondary);
self.intent.promote_attached(scheduler, promote_secondary);
Ok((true, promote_secondary))
} else {
// Pick a fresh node: either we had no secondaries or none were schedulable
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
tracing::debug!("Selected {} as attached", node_id);
self.intent.set_attached(scheduler, Some(node_id));
Ok((true, node_id))
}
}
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
// TODO: before scheduling new nodes, check if any existing content in
// self.intent refers to pageservers that are offline, and pick other
@@ -436,29 +298,36 @@ impl TenantState {
// Build the set of pageservers already in use by this tenant, to avoid scheduling
// more work on the same pageservers we're already using.
let mut used_pageservers = self.intent.all_pageservers();
let mut modified = false;
use PlacementPolicy::*;
match self.policy {
Single => {
// Should have exactly one attached, and zero secondaries
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.attached = Some(node_id);
used_pageservers.push(node_id);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
self.intent.secondary.clear();
modified = true;
}
}
Double(secondary_count) => {
// Should have exactly one attached, and N secondaries
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.attached = Some(node_id);
used_pageservers.push(node_id);
modified = true;
}
let mut used_pageservers = vec![attached_node_id];
while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.push_secondary(scheduler, node_id);
self.intent.secondary.push(node_id);
used_pageservers.push(node_id);
modified = true;
}
@@ -466,12 +335,12 @@ impl TenantState {
Detached => {
// Should have no attached or secondary pageservers
if self.intent.attached.is_some() {
self.intent.set_attached(scheduler, None);
self.intent.attached = None;
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
self.intent.secondary.clear();
modified = true;
}
}
@@ -537,13 +406,6 @@ impl TenantState {
}
}
for node_id in self.observed.locations.keys() {
if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) {
// We have observed state that isn't part of our intent: need to clean it up.
return true;
}
}
// Even if there is no pageserver work to be done, if we have a pending notification to computes,
// wake up a reconciler to send it.
if self.pending_compute_notification {
@@ -553,8 +415,6 @@ impl TenantState {
false
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn maybe_reconcile(
&mut self,
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
@@ -562,8 +422,6 @@ impl TenantState {
compute_hook: &Arc<ComputeHook>,
service_config: &service::Config,
persistence: &Arc<Persistence>,
gate: &Gate,
cancel: &CancellationToken,
) -> Option<ReconcilerWaiter> {
// If there are any ambiguous observed states, and the nodes they refer to are available,
// we should reconcile to clean them up.
@@ -585,14 +443,6 @@ impl TenantState {
return None;
}
// If we are currently splitting, then never start a reconciler task: the splitting logic
// requires that shards are not interfered with while it runs. Do this check here rather than
// up top, so that we only log this message if we would otherwise have done a reconciliation.
if !matches!(self.splitting, SplitState::Idle) {
tracing::info!("Refusing to reconcile, splitting in progress");
return None;
}
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
@@ -610,101 +460,70 @@ impl TenantState {
// doing our sequence's work.
let old_handle = self.reconciler.take();
let Ok(gate_guard) = gate.enter() else {
// Shutting down, don't start a reconciler
return None;
};
let reconciler_cancel = cancel.child_token();
let cancel = CancellationToken::new();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
shard: self.shard,
generation: self.generation,
intent: TargetState::from_intent(&self.intent),
intent: self.intent.clone(),
config: self.config.clone(),
observed: self.observed.clone(),
pageservers: pageservers.clone(),
compute_hook: compute_hook.clone(),
service_config: service_config.clone(),
_gate_guard: gate_guard,
cancel: reconciler_cancel.clone(),
cancel: cancel.clone(),
persistence: persistence.clone(),
compute_notify_failure: false,
};
let reconcile_seq = self.sequence;
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
let must_notify = self.pending_compute_notification;
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
tenant_id=%reconciler.tenant_shard_id.tenant_id,
shard_id=%reconciler.tenant_shard_id.shard_slug());
metrics::RECONCILER.spawned.inc();
let join_handle = tokio::task::spawn(
async move {
// Wait for any previous reconcile task to complete before we start
if let Some(old_handle) = old_handle {
old_handle.cancel.cancel();
if let Err(e) = old_handle.handle.await {
// We can't do much with this other than log it: the task is done, so
// we may proceed with our work.
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
}
let join_handle = tokio::task::spawn(async move {
// Wait for any previous reconcile task to complete before we start
if let Some(old_handle) = old_handle {
old_handle.cancel.cancel();
if let Err(e) = old_handle.handle.await {
// We can't do much with this other than log it: the task is done, so
// we may proceed with our work.
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
}
// Early check for cancellation before doing any work
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
.inc();
return;
}
// Attempt to make observed state match intent state
let result = reconciler.reconcile().await;
// If we know we had a pending compute notification from some previous action, send a notification irrespective
// of whether the above reconcile() did any work
if result.is_ok() && must_notify {
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
reconciler.compute_notify().await.ok();
}
// Update result counter
match &result {
Ok(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
Err(ReconcileError::Cancel) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
Err(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
}
.inc();
result_tx
.send(ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
pending_compute_notification: reconciler.compute_notify_failure,
})
.ok();
}
.instrument(reconciler_span),
);
// Early check for cancellation before doing any work
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
return;
}
// Attempt to make observed state match intent state
let result = reconciler.reconcile().await;
// If we know we had a pending compute notification from some previous action, send a notification irrespective
// of whether the above reconcile() did any work
if result.is_ok() && must_notify {
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
reconciler.compute_notify().await.ok();
}
result_tx
.send(ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
pending_compute_notification: reconciler.compute_notify_failure,
})
.ok();
});
self.reconciler = Some(ReconcilerHandle {
sequence: self.sequence,
handle: join_handle,
cancel: reconciler_cancel,
cancel,
});
Some(ReconcilerWaiter {
@@ -729,103 +548,4 @@ impl TenantState {
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
}
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
TenantShardPersistence {
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
shard_number: self.tenant_shard_id.shard_number.0 as i32,
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: self.shard.stripe_size.0 as i32,
generation: self.generation.into().unwrap_or(0) as i32,
generation_pageserver: self
.intent
.get_attached()
.map(|n| n.0 as i64)
.unwrap_or(i64::MAX),
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
}
}
}
#[cfg(test)]
pub(crate) mod tests {
use pageserver_api::shard::{ShardCount, ShardNumber};
use utils::id::TenantId;
use crate::scheduler::test_utils::make_test_nodes;
use super::*;
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
let tenant_id = TenantId::generate();
let shard_number = ShardNumber(0);
let shard_count = ShardCount::new(1);
let tenant_shard_id = TenantShardId {
tenant_id,
shard_number,
shard_count,
};
TenantState::new(
tenant_shard_id,
ShardIdentity::new(
shard_number,
shard_count,
pageserver_api::shard::ShardStripeSize(32768),
)
.unwrap(),
policy,
)
}
/// Test the scheduling behaviors used when a tenant configured for HA is subject
/// to nodes being marked offline.
#[test]
fn tenant_ha_scheduling() -> anyhow::Result<()> {
// Start with three nodes. Our tenant will only use two. The third one is
// expected to remain unused.
let mut nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
tenant_state
.schedule(&mut scheduler)
.expect("we have enough nodes, scheduling should work");
// Expect to initially be schedule on to different nodes
assert_eq!(tenant_state.intent.secondary.len(), 1);
assert!(tenant_state.intent.attached.is_some());
let attached_node_id = tenant_state.intent.attached.unwrap();
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary
let changed = tenant_state.intent.notify_offline(attached_node_id);
assert!(changed);
// Update the scheduler state to indicate the node is offline
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
// Scheduling the node should promote the still-available secondary node to attached
tenant_state
.schedule(&mut scheduler)
.expect("active nodes are available");
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
// The original attached node should have been retained as a secondary
assert_eq!(
*tenant_state.intent.secondary.iter().last().unwrap(),
attached_node_id
);
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
}

View File

@@ -2,12 +2,8 @@ use crate::{background_process, local_env::LocalEnv};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Method;
use pageserver_api::{
controller_api::{
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
TenantShardMigrateRequest, TenantShardMigrateResponse,
},
models::{
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo,
},
shard::TenantShardId,
@@ -15,12 +11,12 @@ use pageserver_api::{
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{fs, str::FromStr};
use std::str::FromStr;
use tokio::process::Command;
use tracing::instrument;
use url::Url;
use utils::{
auth::{encode_from_key_file, Claims, Scope},
auth::{Claims, Scope},
id::{NodeId, TenantId},
};
@@ -28,7 +24,7 @@ pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: Utf8PathBuf,
private_key: Option<Vec<u8>>,
jwt_token: Option<String>,
public_key: Option<String>,
postgres_port: u16,
client: reqwest::Client,
@@ -59,6 +55,126 @@ pub struct InspectResponse {
pub attachment: Option<(u32, NodeId)>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeAvailability {
// Normal, happy state
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits in there.
#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
Draining,
}
impl FromStr for NodeSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
}
}
impl From<NodeSchedulingPolicy> for String {
fn from(value: NodeSchedulingPolicy) -> String {
use NodeSchedulingPolicy::*;
match value {
Active => "active",
Filling => "filling",
Pause => "pause",
Draining => "draining",
}
.to_string()
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
impl AttachmentService {
pub fn from_env(env: &LocalEnv) -> Self {
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
@@ -88,11 +204,12 @@ impl AttachmentService {
.pageservers
.first()
.expect("Config is validated to contain at least one pageserver");
let (private_key, public_key) = match ps_conf.http_auth_type {
let (jwt_token, public_key) = match ps_conf.http_auth_type {
AuthType::Trust => (None, None),
AuthType::NeonJWT => {
let private_key_path = env.get_private_key_path();
let private_key = fs::read(private_key_path).expect("failed to read private key");
let jwt_token = env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
// If pageserver auth is enabled, this implicitly enables auth for this service,
// using the same credentials.
@@ -118,7 +235,7 @@ impl AttachmentService {
} else {
std::fs::read_to_string(&public_key_path).expect("Can't read public key")
};
(Some(private_key), Some(public_key))
(Some(jwt_token), Some(public_key))
}
};
@@ -126,7 +243,7 @@ impl AttachmentService {
env: env.clone(),
path,
listen,
private_key,
jwt_token,
public_key,
postgres_port,
client: reqwest::ClientBuilder::new()
@@ -280,10 +397,7 @@ impl AttachmentService {
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
if let Some(private_key) = &self.private_key {
let claims = Claims::new(None, Scope::PageServerApi);
let jwt_token =
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
if let Some(jwt_token) = &self.jwt_token {
args.push(format!("--jwt-token={jwt_token}"));
}
@@ -308,7 +422,7 @@ impl AttachmentService {
)],
background_process::InitialPidFile::Create(self.pid_file()),
|| async {
match self.ready().await {
match self.status().await {
Ok(_) => Ok(true),
Err(_) => Ok(false),
}
@@ -354,20 +468,6 @@ impl AttachmentService {
Ok(())
}
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
let category = match path.find('/') {
Some(idx) => &path[..idx],
None => path,
};
match category {
"status" | "ready" => Ok(None),
"control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))),
"v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))),
_ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)),
}
}
/// Simple HTTP request wrapper for calling into attachment service
async fn dispatch<RQ, RS>(
&self,
@@ -393,16 +493,11 @@ impl AttachmentService {
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(private_key) = &self.private_key {
println!("Getting claims for path {}", path);
if let Some(required_claims) = Self::get_claims_for_path(&path)? {
println!("Got claims {:?} for path {}", required_claims, path);
let jwt_token = encode_from_key_file(&required_claims, private_key)?;
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await?;
@@ -522,8 +617,8 @@ impl AttachmentService {
}
#[instrument(skip(self))]
pub async fn ready(&self) -> anyhow::Result<()> {
self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
pub async fn status(&self) -> anyhow::Result<()> {
self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
.await
}

View File

@@ -8,15 +8,14 @@
use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
use compute_api::spec::ComputeMode;
use control_plane::attachment_service::AttachmentService;
use control_plane::attachment_service::{
AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
};
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{InitForceMode, LocalEnv};
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::{broker, local_env};
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
};
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
@@ -451,7 +450,7 @@ async fn handle_tenant(
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters {
count: ShardCount::new(shard_count),
count: ShardCount(shard_count),
stripe_size: shard_stripe_size
.map(ShardStripeSize)
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
@@ -653,10 +652,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
let name = import_match
.get_one::<String>("node-name")
.ok_or_else(|| anyhow!("No node name provided"))?;
let update_catalog = import_match
.get_one::<bool>("update-catalog")
.cloned()
.unwrap_or_default();
// Parse base inputs
let base_tarfile = import_match
@@ -699,7 +694,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
None,
pg_version,
ComputeMode::Primary,
!update_catalog,
)?;
println!("Done");
}
@@ -837,10 +831,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.get_one::<String>("endpoint_id")
.map(String::to_string)
.unwrap_or_else(|| format!("ep-{branch_name}"));
let update_catalog = sub_args
.get_one::<bool>("update-catalog")
.cloned()
.unwrap_or_default();
let lsn = sub_args
.get_one::<String>("lsn")
@@ -890,7 +880,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
http_port,
pg_version,
mode,
!update_catalog,
)?;
}
"start" => {
@@ -929,11 +918,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.get(endpoint_id.as_str())
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
let create_test_user = sub_args
.get_one::<bool>("create-test-user")
.cloned()
.unwrap_or_default();
cplane.check_conflicting_endpoints(
endpoint.mode,
endpoint.tenant_id,
@@ -988,7 +972,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
pageservers,
remote_ext_config,
stripe_size.0 as usize,
create_test_user,
)
.await?;
}
@@ -1474,18 +1457,6 @@ fn cli() -> Command {
.required(false)
.default_value("1");
let update_catalog = Arg::new("update-catalog")
.value_parser(value_parser!(bool))
.long("update-catalog")
.help("If set, will set up the catalog for neon_superuser")
.required(false);
let create_test_user = Arg::new("create-test-user")
.value_parser(value_parser!(bool))
.long("create-test-user")
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
@@ -1546,7 +1517,6 @@ fn cli() -> Command {
.arg(Arg::new("end-lsn").long("end-lsn")
.help("Lsn the basebackup ends at"))
.arg(pg_version_arg.clone())
.arg(update_catalog.clone())
)
).subcommand(
Command::new("tenant")
@@ -1660,7 +1630,6 @@ fn cli() -> Command {
.required(false))
.arg(pg_version_arg.clone())
.arg(hot_standby_arg.clone())
.arg(update_catalog)
)
.subcommand(Command::new("start")
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1668,7 +1637,6 @@ fn cli() -> Command {
.arg(endpoint_pageserver_id_arg.clone())
.arg(safekeepers_arg)
.arg(remote_ext_config_args)
.arg(create_test_user)
)
.subcommand(Command::new("reconfigure")
.about("Reconfigure the endpoint")

View File

@@ -41,15 +41,11 @@ use std::net::SocketAddr;
use std::net::TcpStream;
use std::path::PathBuf;
use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::spec::Database;
use compute_api::spec::PgIdent;
use compute_api::spec::RemoteExtSpec;
use compute_api::spec::Role;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use serde::{Deserialize, Serialize};
@@ -126,7 +122,6 @@ impl ComputeControlPlane {
http_port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
skip_pg_catalog_updates: bool,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
@@ -145,7 +140,7 @@ impl ComputeControlPlane {
// before and after start are the same. So, skip catalog updates,
// with this we basically test a case of waking up an idle compute, where
// we also skip catalog updates in the cloud.
skip_pg_catalog_updates,
skip_pg_catalog_updates: true,
features: vec![],
});
@@ -160,7 +155,7 @@ impl ComputeControlPlane {
http_port,
pg_port,
pg_version,
skip_pg_catalog_updates,
skip_pg_catalog_updates: true,
features: vec![],
})?,
)?;
@@ -505,7 +500,6 @@ impl Endpoint {
pageservers: Vec<(Host, u16)>,
remote_ext_config: Option<&String>,
shard_stripe_size: usize,
create_test_user: bool,
) -> Result<()> {
if self.status() == EndpointStatus::Running {
anyhow::bail!("The endpoint is already running");
@@ -557,26 +551,8 @@ impl Endpoint {
cluster_id: None, // project ID: not used
name: None, // project name: not used
state: None,
roles: if create_test_user {
vec![Role {
name: PgIdent::from_str("test").unwrap(),
encrypted_password: None,
options: None,
}]
} else {
Vec::new()
},
databases: if create_test_user {
vec![Database {
name: PgIdent::from_str("neondb").unwrap(),
owner: PgIdent::from_str("test").unwrap(),
options: None,
restrict_conn: false,
invalid: false,
}]
} else {
Vec::new()
},
roles: vec![],
databases: vec![],
settings: None,
postgresql_conf: Some(postgresql_conf),
},
@@ -590,7 +566,6 @@ impl Endpoint {
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
primary_is_running: None,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -602,16 +577,11 @@ impl Endpoint {
.open(self.endpoint_path().join("compute.log"))?;
// Launch compute_ctl
let conn_str = self.connstr("cloud_admin", "postgres");
println!("Starting postgres node at '{}'", conn_str);
if create_test_user {
let conn_str = self.connstr("user", "neondb");
println!("Also at '{}'", conn_str);
}
println!("Starting postgres node at '{}'", self.connstr());
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
cmd.args(["--http-port", &self.http_address.port().to_string()])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &conn_str])
.args(["--connstr", &self.connstr()])
.args([
"--spec-path",
self.endpoint_path().join("spec.json").to_str().unwrap(),
@@ -682,9 +652,7 @@ impl Endpoint {
}
ComputeStatus::Empty
| ComputeStatus::ConfigurationPending
| ComputeStatus::Configuration
| ComputeStatus::TerminationPending
| ComputeStatus::Terminated => {
| ComputeStatus::Configuration => {
bail!("unexpected compute status: {:?}", state.status)
}
}
@@ -815,13 +783,13 @@ impl Endpoint {
Ok(())
}
pub fn connstr(&self, user: &str, db_name: &str) -> String {
pub fn connstr(&self) -> String {
format!(
"postgresql://{}@{}:{}/{}",
user,
"cloud_admin",
self.pg_address.ip(),
self.pg_address.port(),
db_name
"postgres"
)
}
}

View File

@@ -412,17 +412,14 @@ impl LocalEnv {
// this function is used only for testing purposes in CLI e g generate tokens during init
pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
let private_key_path = self.get_private_key_path();
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
pub fn get_private_key_path(&self) -> PathBuf {
if self.private_key_path.is_absolute() {
let private_key_path = if self.private_key_path.is_absolute() {
self.private_key_path.to_path_buf()
} else {
self.base_data_dir.join(&self.private_key_path)
}
};
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
//

View File

@@ -17,7 +17,6 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::controller_api::NodeRegisterRequest;
use pageserver_api::models::{
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
};
@@ -31,7 +30,7 @@ use utils::{
lsn::Lsn,
};
use crate::attachment_service::AttachmentService;
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
@@ -116,7 +115,7 @@ impl PageServerNode {
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
.env
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
overrides.push(format!("control_plane_api_token='{}'", jwt_token));
}
@@ -211,25 +210,6 @@ impl PageServerNode {
update_config: bool,
register: bool,
) -> anyhow::Result<()> {
// Register the node with the storage controller before starting pageserver: pageserver must be registered to
// successfully call /re-attach and finish starting up.
if register {
let attachment_service = AttachmentService::from_env(&self.env);
let (pg_host, pg_port) =
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
attachment_service
.node_register(NodeRegisterRequest {
node_id: self.conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
@@ -268,6 +248,23 @@ impl PageServerNode {
)
.await?;
if register {
let attachment_service = AttachmentService::from_env(&self.env);
let (pg_host, pg_port) =
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
attachment_service
.node_register(NodeRegisterRequest {
node_id: self.conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
Ok(())
}
@@ -353,11 +350,6 @@ impl PageServerNode {
.remove("compaction_threshold")
.map(|x| x.parse::<usize>())
.transpose()?,
compaction_algorithm: settings
.remove("compaction_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
@@ -397,17 +389,17 @@ impl PageServerNode {
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
gc_feedback: settings
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
timeline_get_throttle: settings
.remove("timeline_get_throttle")
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -461,11 +453,6 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
compaction_algorithm: settings
.remove("compactin_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
@@ -507,17 +494,17 @@ impl PageServerNode {
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
gc_feedback: settings
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
timeline_get_throttle: settings
.remove("timeline_get_throttle")
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
}
};

View File

@@ -70,9 +70,6 @@ Should only be used e.g. for status check/tenant creation/list.
Should only be used e.g. for status check.
Currently also used for connection from any pageserver to any safekeeper.
"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
"admin": Provides access to the control plane and admin APIs of the attachment service.
### CLI
CLI generates a key pair during call to `neon_local init` with the following commands:

View File

@@ -52,10 +52,6 @@ pub enum ComputeStatus {
// compute will exit soon or is waiting for
// control-plane to terminate it.
Failed,
// Termination requested
TerminationPending,
// Terminated Postgres
Terminated,
}
fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>

View File

@@ -79,12 +79,6 @@ pub struct ComputeSpec {
// Stripe size for pageserver sharding, in pages
#[serde(default)]
pub shard_stripe_size: Option<usize>,
// When we are starting a new replica in hot standby mode,
// we need to know if the primary is running.
// This is used to determine if replica should wait for
// RUNNING_XACTS from primary or not.
pub primary_is_running: Option<bool>,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.

View File

@@ -115,6 +115,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
// performed by the process.
// We know the size of the block, so we can determine the I/O bytes out of it.
// The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
#[allow(clippy::unnecessary_cast)]
fn update_rusage_metrics() {
let rusage_stats = get_rusage_stats();
@@ -201,11 +202,6 @@ impl<P: Atomic> GenericCounterPairVec<P> {
pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
self.get_metric_with_label_values(vals).unwrap()
}
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals);
}
}
impl<P: Atomic> GenericCounterPair<P> {
@@ -252,15 +248,6 @@ impl<P: Atomic> GenericCounterPair<P> {
}
}
impl<P: Atomic> Clone for GenericCounterPair<P> {
fn clone(&self) -> Self {
Self {
inc: self.inc.clone(),
dec: self.dec.clone(),
}
}
}
/// Guard returned by [`GenericCounterPair::guard`]
pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);

View File

@@ -18,11 +18,9 @@ enum-map.workspace = true
strum.workspace = true
strum_macros.workspace = true
hex.workspace = true
humantime.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
chrono.workspace = true
itertools.workspace = true
workspace_hack.workspace = true

View File

@@ -1,129 +0,0 @@
use std::str::FromStr;
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
/// in [`attachment_service::http`]
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
use crate::{models::ShardParameters, shard::TenantShardId};
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeAvailability {
// Normal, happy state
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits in there.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
Draining,
}
impl FromStr for NodeSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
}
}
impl From<NodeSchedulingPolicy> for String {
fn from(value: NodeSchedulingPolicy) -> String {
use NodeSchedulingPolicy::*;
match value {
Active => "active",
Filling => "filling",
Pause => "pause",
Draining => "draining",
}
.to_string()
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}

View File

@@ -2,7 +2,6 @@ use postgres_ffi::BLCKSZ;
use std::ops::Range;
use crate::key::Key;
use itertools::Itertools;
///
/// Represents a set of Keys, in a compact form.
@@ -64,36 +63,9 @@ impl KeySpace {
KeyPartitioning { parts }
}
/// Merge another keyspace into the current one.
/// Note: the keyspaces must not ovelap (enforced via assertions)
pub fn merge(&mut self, other: &KeySpace) {
let all_ranges = self
.ranges
.iter()
.merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start);
let mut accum = KeySpaceAccum::new();
let mut prev: Option<&Range<Key>> = None;
for range in all_ranges {
if let Some(prev) = prev {
let overlap =
std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end);
assert!(
!overlap,
"Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}",
prev, range
);
}
accum.add_range(range.clone());
prev = Some(range);
}
self.ranges = accum.to_keyspace().ranges;
}
/// Remove all keys in `other` from `self`.
/// This can involve splitting or removing of existing ranges.
/// Update the keyspace such that it doesn't contain any range
/// that is overlapping with `other`. This can involve splitting or
/// removing of existing ranges.
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
let (self_start, self_end) = match (self.start(), self.end()) {
(Some(start), Some(end)) => (start, end),
@@ -248,7 +220,16 @@ impl KeySpaceAccum {
}
pub fn consume_keyspace(&mut self) -> KeySpace {
std::mem::take(self).to_keyspace()
if let Some(accum) = self.accum.take() {
self.ranges.push(accum);
}
let mut prev_accum = KeySpaceAccum::new();
std::mem::swap(self, &mut prev_accum);
KeySpace {
ranges: prev_accum.ranges,
}
}
pub fn size(&self) -> u64 {
@@ -298,16 +279,8 @@ impl KeySpaceRandomAccum {
}
KeySpace { ranges }
}
pub fn consume_keyspace(&mut self) -> KeySpace {
let mut prev_accum = KeySpaceRandomAccum::new();
std::mem::swap(self, &mut prev_accum);
prev_accum.to_keyspace()
}
}
#[inline(always)]
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;

View File

@@ -2,14 +2,13 @@
#![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp;
pub mod controller_api;
/// Public API types
pub mod control_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod reltag;
pub mod shard;
/// Public API types
pub mod upcall_api;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");

View File

@@ -1,7 +1,4 @@
pub mod partitioning;
pub mod utilization;
pub use utilization::PageserverUtilization;
use std::{
collections::HashMap,
@@ -183,7 +180,7 @@ pub enum TimelineState {
Broken { reason: String, backtrace: String },
}
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
pub new_timeline_id: TimelineId,
#[serde(default)]
@@ -217,14 +214,14 @@ impl ShardParameters {
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
pub fn is_unsharded(&self) -> bool {
self.count.is_unsharded()
self.count == ShardCount(0)
}
}
impl Default for ShardParameters {
fn default() -> Self {
Self {
count: ShardCount::new(0),
count: ShardCount(0),
stripe_size: Self::DEFAULT_STRIPE_SIZE,
}
}
@@ -272,8 +269,6 @@ pub struct TenantConfig {
pub compaction_target_size: Option<u64>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<CompactionAlgorithm>,
pub gc_horizon: Option<u64>,
pub gc_period: Option<String>,
pub image_creation_threshold: Option<usize>,
@@ -285,9 +280,9 @@ pub struct TenantConfig {
pub eviction_policy: Option<EvictionPolicy>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -295,7 +290,6 @@ pub struct TenantConfig {
pub enum EvictionPolicy {
NoEviction,
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
OnlyImitiate(EvictionPolicyLayerAccessThreshold),
}
impl EvictionPolicy {
@@ -303,18 +297,10 @@ impl EvictionPolicy {
match self {
EvictionPolicy::NoEviction => "NoEviction",
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum CompactionAlgorithm {
Legacy,
Tiered,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
@@ -323,39 +309,10 @@ pub struct EvictionPolicyLayerAccessThreshold {
pub threshold: Duration,
}
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ThrottleConfig {
pub task_kinds: Vec<String>, // TaskKind
pub initial: usize,
#[serde(with = "humantime_serde")]
pub refill_interval: Duration,
pub refill_amount: NonZeroUsize,
pub max: usize,
pub fair: bool,
}
impl ThrottleConfig {
pub fn disabled() -> Self {
Self {
task_kinds: vec![], // effectively disables the throttle
// other values don't matter with emtpy `task_kinds`.
initial: 0,
refill_interval: Duration::from_millis(1),
refill_amount: NonZeroUsize::new(1).unwrap(),
max: 1,
fair: true,
}
}
/// The requests per second allowed by the given config.
pub fn steady_rps(&self) -> f64 {
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
}
}
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub enum LocationConfigMode {
AttachedSingle,
AttachedMulti,
@@ -419,12 +376,6 @@ pub struct TenantLocationConfigRequest {
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantTimeTravelRequest {
pub shard_counts: Vec<ShardCount>,
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantShardLocation {

View File

@@ -1,70 +0,0 @@
use std::time::SystemTime;
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant.
///
/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth.
///
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
/// not handle full u64 values properly.
#[derive(serde::Serialize, Debug)]
pub struct PageserverUtilization {
/// Used disk space
#[serde(serialize_with = "ser_saturating_u63")]
pub disk_usage_bytes: u64,
/// Free disk space
#[serde(serialize_with = "ser_saturating_u63")]
pub free_space_bytes: u64,
/// Lower is better score for how good candidate for a next tenant would this pageserver be.
#[serde(serialize_with = "ser_saturating_u63")]
pub utilization_score: u64,
/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
#[serde(serialize_with = "ser_rfc3339_millis")]
pub captured_at: SystemTime,
}
fn ser_rfc3339_millis<S: serde::Serializer>(
ts: &SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
///
/// Instead of newtype, use this because a newtype would get require handling deserializing values
/// with the highest bit set which is properly parsed by serde formats, but would create a
/// conundrum on how to handle and again serialize such values at type level. It will be a few
/// years until we can use more than `i64::MAX` bytes on a disk.
fn ser_saturating_u63<S: serde::Serializer>(value: &u64, serializer: S) -> Result<S::Ok, S::Error> {
const MAX_FORMAT_INT64: u64 = i64::MAX as u64;
let value = (*value).min(MAX_FORMAT_INT64);
serializer.serialize_u64(value)
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use super::*;
#[test]
fn u64_max_is_serialized_as_u63_max() {
let doc = PageserverUtilization {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
utilization_score: u64::MAX,
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
};
let s = serde_json::to_string(&doc).unwrap();
let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
assert_eq!(s, expected);
}
}

View File

@@ -13,41 +13,10 @@ use utils::id::TenantId;
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(u8);
pub struct ShardCount(pub u8);
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known
/// as `TenantShardId::unsharded`.
///
/// This method returns the actual number of shards, i.e. if our internal value is
/// zero, we return 1 (unsharded tenants have 1 shard).
pub fn count(&self) -> u8 {
if self.0 > 0 {
self.0
} else {
1
}
}
/// The literal internal value: this is **not** the number of shards in the
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
/// [`Self::count`] if you want to know the cardinality of shards.
pub fn literal(&self) -> u8 {
self.0
}
pub fn is_unsharded(&self) -> bool {
self.0 == 0
}
/// `v` may be zero, or the number of shards in the tenant. `v` is what
/// [`Self::literal`] would return.
pub fn new(val: u8) -> Self {
Self(val)
}
}
impl ShardNumber {
@@ -117,7 +86,7 @@ impl TenantShardId {
}
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
@@ -502,12 +471,10 @@ impl ShardIdentity {
pub fn is_key_disposable(&self, key: &Key) -> bool {
if key_is_shard0(key) {
// Q: Why can't we dispose of shard0 content if we're not shard 0?
// A1: because the WAL ingestion logic currently ingests some shard 0
// content on all shards, even though it's only read on shard 0. If we
// dropped it, then subsequent WAL ingest to these keys would encounter
// an error.
// A2: because key_is_shard0 also covers relation size keys, which are written
// on all shards even though they're only maintained accurately on shard 0.
// A: because the WAL ingestion logic currently ingests some shard 0
// content on all shards, even though it's only read on shard 0. If we
// dropped it, then subsequent WAL ingest to these keys would encounter
// an error.
false
} else {
!self.is_key_local(key)

View File

@@ -3,7 +3,7 @@
#![allow(non_snake_case)]
// bindgen creates some unsafe code with no doc comments.
#![allow(clippy::missing_safety_doc)]
// noted at 1.63 that in many cases there's u32 -> u32 transmutes in bindgen code.
// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
#![allow(clippy::useless_transmute)]
// modules included with the postgres_ffi macro depend on the types of the specific version's
// types, and trigger a too eager lint.

View File

@@ -80,9 +80,6 @@ pub const XLOG_XACT_ABORT: u8 = 0x20;
pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
// From standbydefs.h
pub const XLOG_RUNNING_XACTS: u8 = 0x10;
// From srlu.h
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;

View File

@@ -119,6 +119,11 @@ pub fn generate_pg_control(
// Generate new pg_control needed for bootstrap
checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
//reset some fields we don't want to preserve
//TODO Check this.
//We may need to determine the value from twophase data.
checkpoint.oldestActiveXid = 0;
//save new values in pg_control
pg_control.checkPoint = 0;
pg_control.checkPointCopy = checkpoint;

View File

@@ -15,13 +15,11 @@ aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
bytes.workspace = true
camino.workspace = true
humantime.workspace = true
hyper = { workspace = true, features = ["stream"] }
futures.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
tokio-stream.workspace = true
tokio-util = { workspace = true, features = ["compat"] }
toml_edit.workspace = true
tracing.workspace = true

View File

@@ -22,15 +22,16 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl
use bytes::Bytes;
use futures::stream::Stream;
use futures_util::StreamExt;
use futures_util::TryStreamExt;
use http_types::{StatusCode, Url};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::s3_bucket::RequestKind;
use crate::TimeTravelError;
use crate::{
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
TimeTravelError, TimeoutOrCancel,
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
RemoteStorage, StorageMetadata,
};
pub struct AzureBlobStorage {
@@ -38,12 +39,10 @@ pub struct AzureBlobStorage {
prefix_in_container: Option<String>,
max_keys_per_list_response: Option<NonZeroU32>,
concurrency_limiter: ConcurrencyLimiter,
// Per-request timeout. Accessible for tests.
pub timeout: Duration,
}
impl AzureBlobStorage {
pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result<Self> {
pub fn new(azure_config: &AzureConfig) -> Result<Self> {
debug!(
"Creating azure remote storage for azure container {}",
azure_config.container_name
@@ -80,7 +79,6 @@ impl AzureBlobStorage {
prefix_in_container: azure_config.prefix_in_container.to_owned(),
max_keys_per_list_response,
concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
timeout,
})
}
@@ -123,11 +121,8 @@ impl AzureBlobStorage {
async fn download_for_builder(
&self,
builder: GetBlobBuilder,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
let kind = RequestKind::Get;
let _permit = self.permit(kind, cancel).await?;
let mut response = builder.into_stream();
let mut etag = None;
let mut last_modified = None;
@@ -135,70 +130,39 @@ impl AzureBlobStorage {
// TODO give proper streaming response instead of buffering into RAM
// https://github.com/neondatabase/neon/issues/5563
let download = async {
let response = builder
// convert to concrete Pageable
.into_stream()
// convert to TryStream
.into_stream()
.map_err(to_download_error);
// apply per request timeout
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
// flatten
let response = response.map(|res| match res {
Ok(res) => res,
Err(_elapsed) => Err(DownloadError::Timeout),
});
let mut response = std::pin::pin!(response);
let mut bufs = Vec::new();
while let Some(part) = response.next().await {
let part = part?;
let etag_str: &str = part.blob.properties.etag.as_ref();
if etag.is_none() {
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
}
if last_modified.is_none() {
last_modified = Some(part.blob.properties.last_modified.into());
}
if let Some(blob_meta) = part.blob.metadata {
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
}
let data = part
.data
.collect()
.await
.map_err(|e| DownloadError::Other(e.into()))?;
bufs.push(data);
let mut bufs = Vec::new();
while let Some(part) = response.next().await {
let part = part.map_err(to_download_error)?;
let etag_str: &str = part.blob.properties.etag.as_ref();
if etag.is_none() {
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
}
Ok(Download {
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
etag,
last_modified,
metadata: Some(StorageMetadata(metadata)),
})
};
tokio::select! {
bufs = download => bufs,
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
if last_modified.is_none() {
last_modified = Some(part.blob.properties.last_modified.into());
}
if let Some(blob_meta) = part.blob.metadata {
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
}
let data = part
.data
.collect()
.await
.map_err(|e| DownloadError::Other(e.into()))?;
bufs.push(data);
}
Ok(Download {
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
etag,
last_modified,
metadata: Some(StorageMetadata(metadata)),
})
}
async fn permit(
&self,
kind: RequestKind,
cancel: &CancellationToken,
) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
let acquire = self.concurrency_limiter.acquire(kind);
tokio::select! {
permit = acquire => Ok(permit.expect("never closed")),
_ = cancel.cancelled() => Err(Cancelled),
}
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
self.concurrency_limiter
.acquire(kind)
.await
.expect("semaphore is never closed")
}
}
@@ -228,87 +192,66 @@ impl RemoteStorage for AzureBlobStorage {
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> anyhow::Result<Listing, DownloadError> {
let _permit = self.permit(RequestKind::List, cancel).await?;
let op = async {
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| self.relative_path_to_name(p))
.or_else(|| self.prefix_in_container.clone())
.map(|mut p| {
// required to end with a separator
// otherwise request will return only the entry of a prefix
if matches!(mode, ListingMode::WithDelimiter)
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
}
p
});
let mut builder = self.client.list_blobs();
if let ListingMode::WithDelimiter = mode {
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}
if let Some(prefix) = list_prefix {
builder = builder.prefix(Cow::from(prefix.to_owned()));
}
if let Some(limit) = self.max_keys_per_list_response {
builder = builder.max_results(MaxResults::new(limit));
}
let response = builder.into_stream();
let response = response.into_stream().map_err(to_download_error);
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
let response = response.map(|res| match res {
Ok(res) => res,
Err(_elapsed) => Err(DownloadError::Timeout),
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| self.relative_path_to_name(p))
.or_else(|| self.prefix_in_container.clone())
.map(|mut p| {
// required to end with a separator
// otherwise request will return only the entry of a prefix
if matches!(mode, ListingMode::WithDelimiter)
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
}
p
});
let mut response = std::pin::pin!(response);
let mut builder = self.client.list_blobs();
let mut res = Listing::default();
if let ListingMode::WithDelimiter = mode {
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}
let mut max_keys = max_keys.map(|mk| mk.get());
while let Some(entry) = response.next().await {
let entry = entry?;
let prefix_iter = entry
.blobs
.prefixes()
.map(|prefix| self.name_to_relative_path(&prefix.name));
res.prefixes.extend(prefix_iter);
if let Some(prefix) = list_prefix {
builder = builder.prefix(Cow::from(prefix.to_owned()));
}
let blob_iter = entry
.blobs
.blobs()
.map(|k| self.name_to_relative_path(&k.name));
if let Some(limit) = self.max_keys_per_list_response {
builder = builder.max_results(MaxResults::new(limit));
}
for key in blob_iter {
res.keys.push(key);
let mut response = builder.into_stream();
let mut res = Listing::default();
// NonZeroU32 doesn't support subtraction apparently
let mut max_keys = max_keys.map(|mk| mk.get());
while let Some(l) = response.next().await {
let entry = l.map_err(to_download_error)?;
let prefix_iter = entry
.blobs
.prefixes()
.map(|prefix| self.name_to_relative_path(&prefix.name));
res.prefixes.extend(prefix_iter);
if let Some(mut mk) = max_keys {
assert!(mk > 0);
mk -= 1;
if mk == 0 {
return Ok(res); // limit reached
}
max_keys = Some(mk);
let blob_iter = entry
.blobs
.blobs()
.map(|k| self.name_to_relative_path(&k.name));
for key in blob_iter {
res.keys.push(key);
if let Some(mut mk) = max_keys {
assert!(mk > 0);
mk -= 1;
if mk == 0 {
return Ok(res); // limit reached
}
max_keys = Some(mk);
}
}
Ok(res)
};
tokio::select! {
res = op => res,
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
}
Ok(res)
}
async fn upload(
@@ -317,52 +260,35 @@ impl RemoteStorage for AzureBlobStorage {
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let _permit = self.permit(RequestKind::Put, cancel).await?;
let _permit = self.permit(RequestKind::Put).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
let op = async {
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
Box::pin(from);
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
Box::pin(from);
let from = NonSeekableStream::new(from, data_size_bytes);
let from = NonSeekableStream::new(from, data_size_bytes);
let body = azure_core::Body::SeekableStream(Box::new(from));
let body = azure_core::Body::SeekableStream(Box::new(from));
let mut builder = blob_client.put_block_blob(body);
let mut builder = blob_client.put_block_blob(body);
if let Some(metadata) = metadata {
builder = builder.metadata(to_azure_metadata(metadata));
}
let fut = builder.into_future();
let fut = tokio::time::timeout(self.timeout, fut);
match fut.await {
Ok(Ok(_response)) => Ok(()),
Ok(Err(azure)) => Err(azure.into()),
Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
}
};
tokio::select! {
res = op => res,
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
if let Some(metadata) = metadata {
builder = builder.metadata(to_azure_metadata(metadata));
}
let _response = builder.into_future().await?;
Ok(())
}
async fn download(
&self,
from: &RemotePath,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
let _permit = self.permit(RequestKind::Get).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
let builder = blob_client.get();
self.download_for_builder(builder, cancel).await
self.download_for_builder(builder).await
}
async fn download_byte_range(
@@ -370,8 +296,8 @@ impl RemoteStorage for AzureBlobStorage {
from: &RemotePath,
start_inclusive: u64,
end_exclusive: Option<u64>,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
let _permit = self.permit(RequestKind::Get).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
let mut builder = blob_client.get();
@@ -383,113 +309,82 @@ impl RemoteStorage for AzureBlobStorage {
};
builder = builder.range(range);
self.download_for_builder(builder, cancel).await
self.download_for_builder(builder).await
}
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
self.delete_objects(std::array::from_ref(path), cancel)
.await
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let _permit = self.permit(RequestKind::Delete).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
async fn delete_objects<'a>(
&self,
paths: &'a [RemotePath],
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let _permit = self.permit(RequestKind::Delete, cancel).await?;
let builder = blob_client.delete();
let op = async {
// TODO batch requests are also not supported by the SDK
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
for path in paths {
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
let request = blob_client.delete().into_future();
let res = tokio::time::timeout(self.timeout, request).await;
match res {
Ok(Ok(_response)) => continue,
Ok(Err(e)) => {
if let Some(http_err) = e.as_http_error() {
if http_err.status() == StatusCode::NotFound {
continue;
}
}
return Err(e.into());
match builder.into_future().await {
Ok(_response) => Ok(()),
Err(e) => {
if let Some(http_err) = e.as_http_error() {
if http_err.status() == StatusCode::NotFound {
return Ok(());
}
Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()),
}
Err(anyhow::Error::new(e))
}
Ok(())
};
tokio::select! {
res = op => res,
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
}
}
async fn copy(
&self,
from: &RemotePath,
to: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let _permit = self.permit(RequestKind::Copy, cancel).await?;
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
// Permit is already obtained by inner delete function
let timeout = tokio::time::sleep(self.timeout);
// TODO batch requests are also not supported by the SDK
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
for path in paths {
self.delete(path).await?;
}
Ok(())
}
let mut copy_status = None;
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
let _permit = self.permit(RequestKind::Copy).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
let op = async {
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
let source_url = format!(
"{}/{}",
self.client.url()?,
self.relative_path_to_name(from)
);
let builder = blob_client.copy(Url::from_str(&source_url)?);
let source_url = format!(
"{}/{}",
self.client.url()?,
self.relative_path_to_name(from)
);
let result = builder.into_future().await?;
let builder = blob_client.copy(Url::from_str(&source_url)?);
let copy = builder.into_future();
let result = copy.await?;
copy_status = Some(result.copy_status);
loop {
match copy_status.as_ref().expect("we always set it to Some") {
CopyStatus::Aborted => {
anyhow::bail!("Received abort for copy from {from} to {to}.");
}
CopyStatus::Failed => {
anyhow::bail!("Received failure response for copy from {from} to {to}.");
}
CopyStatus::Success => return Ok(()),
CopyStatus::Pending => (),
let mut copy_status = result.copy_status;
let start_time = Instant::now();
const MAX_WAIT_TIME: Duration = Duration::from_secs(60);
loop {
match copy_status {
CopyStatus::Aborted => {
anyhow::bail!("Received abort for copy from {from} to {to}.");
}
// The copy is taking longer. Waiting a second and then re-trying.
// TODO estimate time based on copy_progress and adjust time based on that
tokio::time::sleep(Duration::from_millis(1000)).await;
let properties = blob_client.get_properties().into_future().await?;
let Some(status) = properties.blob.properties.copy_status else {
tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
return Ok(());
};
copy_status = Some(status);
CopyStatus::Failed => {
anyhow::bail!("Received failure response for copy from {from} to {to}.");
}
CopyStatus::Success => return Ok(()),
CopyStatus::Pending => (),
}
};
tokio::select! {
res = op => res,
_ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
_ = timeout => {
let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
let e = e.context(format!("Timeout, last status: {copy_status:?}"));
Err(e)
},
// The copy is taking longer. Waiting a second and then re-trying.
// TODO estimate time based on copy_progress and adjust time based on that
tokio::time::sleep(Duration::from_millis(1000)).await;
let properties = blob_client.get_properties().into_future().await?;
let Some(status) = properties.blob.properties.copy_status else {
tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
return Ok(());
};
if start_time.elapsed() > MAX_WAIT_TIME {
anyhow::bail!("Copy from from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_pogress={:?}.",
MAX_WAIT_TIME.as_secs_f32(),
properties.blob.properties.copy_progress,
);
}
copy_status = status;
}
}

View File

@@ -1,200 +0,0 @@
/// Reasons for downloads or listings to fail.
#[derive(Debug)]
pub enum DownloadError {
/// Validation or other error happened due to user input.
BadInput(anyhow::Error),
/// The file was not found in the remote storage.
NotFound,
/// A cancellation token aborted the download, typically during
/// tenant detach or process shutdown.
Cancelled,
/// A timeout happened while executing the request. Possible reasons:
/// - stuck tcp connection
///
/// Concurrency control is not timed within timeout.
Timeout,
/// The file was found in the remote storage, but the download failed.
Other(anyhow::Error),
}
impl std::fmt::Display for DownloadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
DownloadError::BadInput(e) => {
write!(f, "Failed to download a remote file due to user input: {e}")
}
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
DownloadError::Timeout => write!(f, "timeout"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
}
}
}
impl std::error::Error for DownloadError {}
impl DownloadError {
/// Returns true if the error should not be retried with backoff
pub fn is_permanent(&self) -> bool {
use DownloadError::*;
match self {
BadInput(_) | NotFound | Cancelled => true,
Timeout | Other(_) => false,
}
}
}
impl From<std::io::Error> for DownloadError {
fn from(value: std::io::Error) -> Self {
let needs_unwrap = value.kind() == std::io::ErrorKind::Other
&& value
.get_ref()
.and_then(|x| x.downcast_ref::<DownloadError>())
.is_some();
if needs_unwrap {
*value
.into_inner()
.expect("just checked")
.downcast::<DownloadError>()
.expect("just checked")
} else {
DownloadError::Other(value.into())
}
}
}
#[derive(Debug)]
pub enum TimeTravelError {
/// Validation or other error happened due to user input.
BadInput(anyhow::Error),
/// The used remote storage does not have time travel recovery implemented
Unimplemented,
/// The number of versions/deletion markers is above our limit.
TooManyVersions,
/// A cancellation token aborted the process, typically during
/// request closure or process shutdown.
Cancelled,
/// Other errors
Other(anyhow::Error),
}
impl std::fmt::Display for TimeTravelError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
TimeTravelError::BadInput(e) => {
write!(
f,
"Failed to time travel recover a prefix due to user input: {e}"
)
}
TimeTravelError::Unimplemented => write!(
f,
"time travel recovery is not implemented for the current storage backend"
),
TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
TimeTravelError::TooManyVersions => {
write!(f, "Number of versions/delete markers above limit")
}
TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
}
}
}
impl std::error::Error for TimeTravelError {}
/// Plain cancelled error.
///
/// By design this type does not not implement `std::error::Error` so it cannot be put as the root
/// cause of `std::io::Error` or `anyhow::Error`. It should never need to be exposed out of this
/// crate.
///
/// It exists to implement permit acquiring in `{Download,TimeTravel}Error` and `anyhow::Error` returning
/// operations and ensuring that those get converted to proper versions with just `?`.
#[derive(Debug)]
pub(crate) struct Cancelled;
impl From<Cancelled> for anyhow::Error {
fn from(_: Cancelled) -> Self {
anyhow::Error::new(TimeoutOrCancel::Cancel)
}
}
impl From<Cancelled> for TimeTravelError {
fn from(_: Cancelled) -> Self {
TimeTravelError::Cancelled
}
}
impl From<Cancelled> for TimeoutOrCancel {
fn from(_: Cancelled) -> Self {
TimeoutOrCancel::Cancel
}
}
impl From<Cancelled> for DownloadError {
fn from(_: Cancelled) -> Self {
DownloadError::Cancelled
}
}
/// This type is used at as the root cause for timeouts and cancellations with `anyhow::Error` returning
/// RemoteStorage methods.
///
/// For use with `utils::backoff::retry` and `anyhow::Error` returning operations there is
/// `TimeoutOrCancel::caused_by_cancel` method to query "proper form" errors.
#[derive(Debug)]
pub enum TimeoutOrCancel {
Timeout,
Cancel,
}
impl std::fmt::Display for TimeoutOrCancel {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use TimeoutOrCancel::*;
match self {
Timeout => write!(f, "timeout"),
Cancel => write!(f, "cancel"),
}
}
}
impl std::error::Error for TimeoutOrCancel {}
impl TimeoutOrCancel {
/// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`].
pub fn caused_by_cancel(error: &anyhow::Error) -> bool {
error
.root_cause()
.downcast_ref::<Self>()
.is_some_and(Self::is_cancel)
}
pub fn is_cancel(&self) -> bool {
matches!(self, TimeoutOrCancel::Cancel)
}
pub fn is_timeout(&self) -> bool {
matches!(self, TimeoutOrCancel::Timeout)
}
}
/// This conversion is used when [`crate::support::DownloadStream`] notices a cancellation or
/// timeout to wrap it in an `std::io::Error`.
impl From<TimeoutOrCancel> for std::io::Error {
fn from(value: TimeoutOrCancel) -> Self {
let e = DownloadError::from(value);
std::io::Error::other(e)
}
}
impl From<TimeoutOrCancel> for DownloadError {
fn from(value: TimeoutOrCancel) -> Self {
use TimeoutOrCancel::*;
match value {
Timeout => DownloadError::Timeout,
Cancel => DownloadError::Cancelled,
}
}
}

View File

@@ -10,7 +10,6 @@
#![deny(clippy::undocumented_unsafe_blocks)]
mod azure_blob;
mod error;
mod local_fs;
mod s3_bucket;
mod simulate_failures;
@@ -22,7 +21,7 @@ use std::{
num::{NonZeroU32, NonZeroUsize},
pin::Pin,
sync::Arc,
time::{Duration, SystemTime},
time::SystemTime,
};
use anyhow::{bail, Context};
@@ -42,8 +41,6 @@ pub use self::{
};
use s3_bucket::RequestKind;
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
/// ~200 RPS for IAM services
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -161,10 +158,9 @@ pub trait RemoteStorage: Send + Sync + 'static {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
let result = self
.list(prefix, ListingMode::WithDelimiter, None, cancel)
.list(prefix, ListingMode::WithDelimiter, None)
.await?
.prefixes;
Ok(result)
@@ -186,10 +182,9 @@ pub trait RemoteStorage: Send + Sync + 'static {
&self,
prefix: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
let result = self
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
.list(prefix, ListingMode::NoDelimiter, max_keys)
.await?
.keys;
Ok(result)
@@ -200,13 +195,9 @@ pub trait RemoteStorage: Send + Sync + 'static {
prefix: Option<&RemotePath>,
_mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Listing, DownloadError>;
/// Streams the local file contents into remote into the remote storage entry.
///
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
/// set to `TimeoutOrCancel`.
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
@@ -215,61 +206,27 @@ pub trait RemoteStorage: Send + Sync + 'static {
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()>;
/// Streams the remote storage entry contents.
///
/// The returned download stream will obey initial timeout and cancellation signal by erroring
/// on whichever happens first. Only one of the reasons will fail the stream, which is usually
/// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
///
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
/// Returns the metadata, if any was stored with the file previously.
async fn download(
&self,
from: &RemotePath,
cancel: &CancellationToken,
) -> Result<Download, DownloadError>;
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError>;
/// Streams a given byte range of the remote storage entry contents.
///
/// The returned download stream will obey initial timeout and cancellation signal by erroring
/// on whichever happens first. Only one of the reasons will fail the stream, which is usually
/// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
///
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
/// Returns the metadata, if any was stored with the file previously.
async fn download_byte_range(
&self,
from: &RemotePath,
start_inclusive: u64,
end_exclusive: Option<u64>,
cancel: &CancellationToken,
) -> Result<Download, DownloadError>;
/// Delete a single path from remote storage.
///
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
/// set to `TimeoutOrCancel`. In such situation it is unknown if the deletion went through.
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()>;
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
/// Delete a multiple paths from remote storage.
///
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
/// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went
/// through.
async fn delete_objects<'a>(
&self,
paths: &'a [RemotePath],
cancel: &CancellationToken,
) -> anyhow::Result<()>;
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
/// Copy a remote object inside a bucket from one path to another.
async fn copy(
&self,
from: &RemotePath,
to: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()>;
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
/// Resets the content of everything with the given prefix to the given state
async fn time_travel_recover(
@@ -281,13 +238,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
) -> Result<(), TimeTravelError>;
}
/// DownloadStream is sensitive to the timeout and cancellation used with the original
/// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
/// with `tokio::io::copy_buf`.
// This has 'static because safekeepers do not use cancellation tokens (yet)
pub type DownloadStream =
Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>>;
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
pub struct Download {
pub download_stream: DownloadStream,
/// The last time the file was modified (`last-modified` HTTP header)
@@ -306,6 +257,86 @@ impl Debug for Download {
}
}
#[derive(Debug)]
pub enum DownloadError {
/// Validation or other error happened due to user input.
BadInput(anyhow::Error),
/// The file was not found in the remote storage.
NotFound,
/// A cancellation token aborted the download, typically during
/// tenant detach or process shutdown.
Cancelled,
/// The file was found in the remote storage, but the download failed.
Other(anyhow::Error),
}
impl std::fmt::Display for DownloadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
DownloadError::BadInput(e) => {
write!(f, "Failed to download a remote file due to user input: {e}")
}
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
}
}
}
impl std::error::Error for DownloadError {}
impl DownloadError {
/// Returns true if the error should not be retried with backoff
pub fn is_permanent(&self) -> bool {
use DownloadError::*;
match self {
BadInput(_) => true,
NotFound => true,
Cancelled => true,
Other(_) => false,
}
}
}
#[derive(Debug)]
pub enum TimeTravelError {
/// Validation or other error happened due to user input.
BadInput(anyhow::Error),
/// The used remote storage does not have time travel recovery implemented
Unimplemented,
/// The number of versions/deletion markers is above our limit.
TooManyVersions,
/// A cancellation token aborted the process, typically during
/// request closure or process shutdown.
Cancelled,
/// Other errors
Other(anyhow::Error),
}
impl std::fmt::Display for TimeTravelError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
TimeTravelError::BadInput(e) => {
write!(
f,
"Failed to time travel recover a prefix due to user input: {e}"
)
}
TimeTravelError::Unimplemented => write!(
f,
"time travel recovery is not implemented for the current storage backend"
),
TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
TimeTravelError::TooManyVersions => {
write!(f, "Number of versions/delete markers above limit")
}
TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
}
}
}
impl std::error::Error for TimeTravelError {}
/// Every storage, currently supported.
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
#[derive(Clone)]
@@ -323,13 +354,12 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> anyhow::Result<Listing, DownloadError> {
match self {
Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await,
Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await,
Self::AzureBlob(s) => s.list(prefix, mode, max_keys, cancel).await,
Self::Unreliable(s) => s.list(prefix, mode, max_keys, cancel).await,
Self::LocalFs(s) => s.list(prefix, mode, max_keys).await,
Self::AwsS3(s) => s.list(prefix, mode, max_keys).await,
Self::AzureBlob(s) => s.list(prefix, mode, max_keys).await,
Self::Unreliable(s) => s.list(prefix, mode, max_keys).await,
}
}
@@ -342,13 +372,12 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
&self,
folder: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
match self {
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
Self::LocalFs(s) => s.list_files(folder, max_keys).await,
Self::AwsS3(s) => s.list_files(folder, max_keys).await,
Self::AzureBlob(s) => s.list_files(folder, max_keys).await,
Self::Unreliable(s) => s.list_files(folder, max_keys).await,
}
}
@@ -358,43 +387,36 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
pub async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
match self {
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
Self::LocalFs(s) => s.list_prefixes(prefix).await,
Self::AwsS3(s) => s.list_prefixes(prefix).await,
Self::AzureBlob(s) => s.list_prefixes(prefix).await,
Self::Unreliable(s) => s.list_prefixes(prefix).await,
}
}
/// See [`RemoteStorage::upload`]
pub async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await,
Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
}
}
pub async fn download(
&self,
from: &RemotePath,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
pub async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
match self {
Self::LocalFs(s) => s.download(from, cancel).await,
Self::AwsS3(s) => s.download(from, cancel).await,
Self::AzureBlob(s) => s.download(from, cancel).await,
Self::Unreliable(s) => s.download(from, cancel).await,
Self::LocalFs(s) => s.download(from).await,
Self::AwsS3(s) => s.download(from).await,
Self::AzureBlob(s) => s.download(from).await,
Self::Unreliable(s) => s.download(from).await,
}
}
@@ -403,72 +425,54 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
from: &RemotePath,
start_inclusive: u64,
end_exclusive: Option<u64>,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
match self {
Self::LocalFs(s) => {
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
s.download_byte_range(from, start_inclusive, end_exclusive)
.await
}
Self::AwsS3(s) => {
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
s.download_byte_range(from, start_inclusive, end_exclusive)
.await
}
Self::AzureBlob(s) => {
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
s.download_byte_range(from, start_inclusive, end_exclusive)
.await
}
Self::Unreliable(s) => {
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
s.download_byte_range(from, start_inclusive, end_exclusive)
.await
}
}
}
/// See [`RemoteStorage::delete`]
pub async fn delete(
&self,
path: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.delete(path, cancel).await,
Self::AwsS3(s) => s.delete(path, cancel).await,
Self::AzureBlob(s) => s.delete(path, cancel).await,
Self::Unreliable(s) => s.delete(path, cancel).await,
Self::LocalFs(s) => s.delete(path).await,
Self::AwsS3(s) => s.delete(path).await,
Self::AzureBlob(s) => s.delete(path).await,
Self::Unreliable(s) => s.delete(path).await,
}
}
/// See [`RemoteStorage::delete_objects`]
pub async fn delete_objects(
&self,
paths: &[RemotePath],
cancel: &CancellationToken,
) -> anyhow::Result<()> {
pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.delete_objects(paths, cancel).await,
Self::AwsS3(s) => s.delete_objects(paths, cancel).await,
Self::AzureBlob(s) => s.delete_objects(paths, cancel).await,
Self::Unreliable(s) => s.delete_objects(paths, cancel).await,
Self::LocalFs(s) => s.delete_objects(paths).await,
Self::AwsS3(s) => s.delete_objects(paths).await,
Self::AzureBlob(s) => s.delete_objects(paths).await,
Self::Unreliable(s) => s.delete_objects(paths).await,
}
}
/// See [`RemoteStorage::copy`]
pub async fn copy_object(
&self,
from: &RemotePath,
to: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.copy(from, to, cancel).await,
Self::AwsS3(s) => s.copy(from, to, cancel).await,
Self::AzureBlob(s) => s.copy(from, to, cancel).await,
Self::Unreliable(s) => s.copy(from, to, cancel).await,
Self::LocalFs(s) => s.copy(from, to).await,
Self::AwsS3(s) => s.copy(from, to).await,
Self::AzureBlob(s) => s.copy(from, to).await,
Self::Unreliable(s) => s.copy(from, to).await,
}
}
/// See [`RemoteStorage::time_travel_recover`].
pub async fn time_travel_recover(
&self,
prefix: Option<&RemotePath>,
@@ -499,11 +503,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
impl GenericRemoteStorage {
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
let timeout = storage_config.timeout;
Ok(match &storage_config.storage {
RemoteStorageKind::LocalFs(path) => {
info!("Using fs root '{path}' as a remote storage");
Self::LocalFs(LocalFs::new(path.clone(), timeout)?)
RemoteStorageKind::LocalFs(root) => {
info!("Using fs root '{root}' as a remote storage");
Self::LocalFs(LocalFs::new(root.clone())?)
}
RemoteStorageKind::AwsS3(s3_config) => {
// The profile and access key id are only printed here for debugging purposes,
@@ -513,12 +516,12 @@ impl GenericRemoteStorage {
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
}
RemoteStorageKind::AzureContainer(azure_config) => {
info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?))
Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?))
}
})
}
@@ -527,15 +530,18 @@ impl GenericRemoteStorage {
Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
}
/// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
/// Takes storage object contents and its size and uploads to remote storage,
/// mapping `from_path` to the corresponding remote object id in the storage.
///
/// The storage object does not have to be present on the `from_path`,
/// this path is used for the remote object id conversion only.
pub async fn upload_storage_object(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
from_size_bytes: usize,
to: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
self.upload(from, from_size_bytes, to, None, cancel)
self.upload(from, from_size_bytes, to, None)
.await
.with_context(|| {
format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}")
@@ -548,11 +554,10 @@ impl GenericRemoteStorage {
&self,
byte_range: Option<(u64, Option<u64>)>,
from: &RemotePath,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
match byte_range {
Some((start, end)) => self.download_byte_range(from, start, end, cancel).await,
None => self.download(from, cancel).await,
Some((start, end)) => self.download_byte_range(from, start, end).await,
None => self.download(from).await,
}
}
}
@@ -567,9 +572,6 @@ pub struct StorageMetadata(HashMap<String, String>);
pub struct RemoteStorageConfig {
/// The storage connection configuration.
pub storage: RemoteStorageKind,
/// A common timeout enforced for all requests after concurrency limiter permit has been
/// acquired.
pub timeout: Duration,
}
/// A kind of a remote storage to connect to, with its connection configuration.
@@ -654,8 +656,6 @@ impl Debug for AzureConfig {
}
impl RemoteStorageConfig {
pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
let local_path = toml.get("local_path");
let bucket_name = toml.get("bucket_name");
@@ -685,27 +685,6 @@ impl RemoteStorageConfig {
.map(|endpoint| parse_toml_string("endpoint", endpoint))
.transpose()?;
let timeout = toml
.get("timeout")
.map(|timeout| {
timeout
.as_str()
.ok_or_else(|| anyhow::Error::msg("timeout was not a string"))
})
.transpose()
.and_then(|timeout| {
timeout
.map(humantime::parse_duration)
.transpose()
.map_err(anyhow::Error::new)
})
.context("parse timeout")?
.unwrap_or(Self::DEFAULT_TIMEOUT);
if timeout < Duration::from_secs(1) {
bail!("timeout was specified as {timeout:?} which is too low");
}
let storage = match (
local_path,
bucket_name,
@@ -767,7 +746,7 @@ impl RemoteStorageConfig {
}
};
Ok(Some(RemoteStorageConfig { storage, timeout }))
Ok(Some(RemoteStorageConfig { storage }))
}
}
@@ -863,24 +842,4 @@ mod tests {
let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
assert_eq!(err.to_string(), "Path \"/\" is not relative");
}
#[test]
fn parse_localfs_config_with_timeout() {
let input = "local_path = '.'
timeout = '5s'";
let toml = input.parse::<toml_edit::Document>().unwrap();
let config = RemoteStorageConfig::from_toml(toml.as_item())
.unwrap()
.expect("it exists");
assert_eq!(
config,
RemoteStorageConfig {
storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")),
timeout: Duration::from_secs(5)
}
);
}
}

View File

@@ -5,12 +5,7 @@
//! volume is mounted to the local FS.
use std::{
borrow::Cow,
future::Future,
io::ErrorKind,
num::NonZeroU32,
pin::Pin,
time::{Duration, SystemTime},
borrow::Cow, future::Future, io::ErrorKind, num::NonZeroU32, pin::Pin, time::SystemTime,
};
use anyhow::{bail, ensure, Context};
@@ -25,9 +20,7 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
use crate::{
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
};
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError};
use super::{RemoteStorage, StorageMetadata};
@@ -36,13 +29,12 @@ const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
#[derive(Debug, Clone)]
pub struct LocalFs {
storage_root: Utf8PathBuf,
timeout: Duration,
}
impl LocalFs {
/// Attempts to create local FS storage, along with its root directory.
/// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative).
pub fn new(mut storage_root: Utf8PathBuf, timeout: Duration) -> anyhow::Result<Self> {
pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result<Self> {
if !storage_root.exists() {
std::fs::create_dir_all(&storage_root).with_context(|| {
format!("Failed to create all directories in the given root path {storage_root:?}")
@@ -54,10 +46,7 @@ impl LocalFs {
})?;
}
Ok(Self {
storage_root,
timeout,
})
Ok(Self { storage_root })
}
// mirrors S3Bucket::s3_object_to_relative_path
@@ -168,14 +157,80 @@ impl LocalFs {
Ok(files)
}
}
async fn upload0(
impl RemoteStorage for LocalFs {
async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
) -> Result<Listing, DownloadError> {
let mut result = Listing::default();
if let ListingMode::NoDelimiter = mode {
let keys = self
.list_recursive(prefix)
.await
.map_err(DownloadError::Other)?;
result.keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
if let Some(max_keys) = max_keys {
result.keys.truncate(max_keys.get() as usize);
}
return Ok(result);
}
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?;
// filter out empty directories to mirror s3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}
let stripped = prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
);
if prefix.is_dir() {
result.prefixes.push(stripped);
} else {
result.keys.push(stripped);
}
}
Ok(result)
}
async fn upload(
&self,
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let target_file_path = to.with_base(&self.storage_root);
create_target_directory(&target_file_path).await?;
@@ -210,26 +265,9 @@ impl LocalFs {
let mut buffer_to_read = data.take(from_size_bytes);
// alternatively we could just write the bytes to a file, but local_fs is a testing utility
let copy = io::copy_buf(&mut buffer_to_read, &mut destination);
let bytes_read = tokio::select! {
biased;
_ = cancel.cancelled() => {
let file = destination.into_inner();
// wait for the inflight operation(s) to complete so that there could be a next
// attempt right away and our writes are not directed to their file.
file.into_std().await;
// TODO: leave the temp or not? leaving is probably less racy. enabled truncate at
// least.
fs::remove_file(temp_file_path).await.context("remove temp_file_path after cancellation or timeout")?;
return Err(TimeoutOrCancel::Cancel.into());
}
read = copy => read,
};
let bytes_read =
bytes_read.with_context(|| {
let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
.await
.with_context(|| {
format!(
"Failed to upload file (write temp) to the local storage at '{temp_file_path}'",
)
@@ -261,9 +299,6 @@ impl LocalFs {
})?;
if let Some(storage_metadata) = metadata {
// FIXME: we must not be using metadata much, since this would forget the old metadata
// for new writes? or perhaps metadata is sticky; could consider removing if it's never
// used.
let storage_metadata_path = storage_metadata_path(&target_file_path);
fs::write(
&storage_metadata_path,
@@ -280,131 +315,8 @@ impl LocalFs {
Ok(())
}
}
impl RemoteStorage for LocalFs {
async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Listing, DownloadError> {
let op = async {
let mut result = Listing::default();
if let ListingMode::NoDelimiter = mode {
let keys = self
.list_recursive(prefix)
.await
.map_err(DownloadError::Other)?;
result.keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
if let Some(max_keys) = max_keys {
result.keys.truncate(max_keys.get() as usize);
}
return Ok(result);
}
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?;
// filter out empty directories to mirror s3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}
let stripped = prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
);
if prefix.is_dir() {
result.prefixes.push(stripped);
} else {
result.keys.push(stripped);
}
}
Ok(result)
};
let timeout = async {
tokio::time::sleep(self.timeout).await;
Err(DownloadError::Timeout)
};
let cancelled = async {
cancel.cancelled().await;
Err(DownloadError::Cancelled)
};
tokio::select! {
res = op => res,
res = timeout => res,
res = cancelled => res,
}
}
async fn upload(
&self,
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let cancel = cancel.child_token();
let op = self.upload0(data, data_size_bytes, to, metadata, &cancel);
let mut op = std::pin::pin!(op);
// race the upload0 to the timeout; if it goes over, do a graceful shutdown
let (res, timeout) = tokio::select! {
res = &mut op => (res, false),
_ = tokio::time::sleep(self.timeout) => {
cancel.cancel();
(op.await, true)
}
};
match res {
Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => {
// we caused this cancel (or they happened simultaneously) -- swap it out to
// Timeout
Err(TimeoutOrCancel::Timeout.into())
}
res => res,
}
}
async fn download(
&self,
from: &RemotePath,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
let target_path = from.with_base(&self.storage_root);
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
let source = ReaderStream::new(
@@ -422,10 +334,6 @@ impl RemoteStorage for LocalFs {
.read_storage_metadata(&target_path)
.await
.map_err(DownloadError::Other)?;
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
Ok(Download {
metadata,
last_modified: None,
@@ -442,7 +350,6 @@ impl RemoteStorage for LocalFs {
from: &RemotePath,
start_inclusive: u64,
end_exclusive: Option<u64>,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
if let Some(end_exclusive) = end_exclusive {
if end_exclusive <= start_inclusive {
@@ -484,9 +391,6 @@ impl RemoteStorage for LocalFs {
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
let source = ReaderStream::new(source);
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
Ok(Download {
metadata,
last_modified: None,
@@ -498,7 +402,7 @@ impl RemoteStorage for LocalFs {
}
}
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let file_path = path.with_base(&self.storage_root);
match fs::remove_file(&file_path).await {
Ok(()) => Ok(()),
@@ -510,23 +414,14 @@ impl RemoteStorage for LocalFs {
}
}
async fn delete_objects<'a>(
&self,
paths: &'a [RemotePath],
cancel: &CancellationToken,
) -> anyhow::Result<()> {
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
for path in paths {
self.delete(path, cancel).await?
self.delete(path).await?
}
Ok(())
}
async fn copy(
&self,
from: &RemotePath,
to: &RemotePath,
_cancel: &CancellationToken,
) -> anyhow::Result<()> {
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
let from_path = from.with_base(&self.storage_root);
let to_path = to.with_base(&self.storage_root);
create_target_directory(&to_path).await?;
@@ -540,6 +435,7 @@ impl RemoteStorage for LocalFs {
Ok(())
}
#[allow(clippy::diverging_sub_expression)]
async fn time_travel_recover(
&self,
_prefix: Option<&RemotePath>,
@@ -633,9 +529,8 @@ mod fs_tests {
remote_storage_path: &RemotePath,
expected_metadata: Option<&StorageMetadata>,
) -> anyhow::Result<String> {
let cancel = CancellationToken::new();
let download = storage
.download(remote_storage_path, &cancel)
.download(remote_storage_path)
.await
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
ensure!(
@@ -650,16 +545,16 @@ mod fs_tests {
#[tokio::test]
async fn upload_file() -> anyhow::Result<()> {
let (storage, cancel) = create_storage()?;
let storage = create_storage()?;
let target_path_1 = upload_dummy_file(&storage, "upload_1", None, &cancel).await?;
let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
assert_eq!(
storage.list_all().await?,
vec![target_path_1.clone()],
"Should list a single file after first upload"
);
let target_path_2 = upload_dummy_file(&storage, "upload_2", None, &cancel).await?;
let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?;
assert_eq!(
list_files_sorted(&storage).await?,
vec![target_path_1.clone(), target_path_2.clone()],
@@ -671,7 +566,7 @@ mod fs_tests {
#[tokio::test]
async fn upload_file_negatives() -> anyhow::Result<()> {
let (storage, cancel) = create_storage()?;
let storage = create_storage()?;
let id = RemotePath::new(Utf8Path::new("dummy"))?;
let content = Bytes::from_static(b"12345");
@@ -680,34 +575,34 @@ mod fs_tests {
// Check that you get an error if the size parameter doesn't match the actual
// size of the stream.
storage
.upload(content(), 0, &id, None, &cancel)
.upload(content(), 0, &id, None)
.await
.expect_err("upload with zero size succeeded");
storage
.upload(content(), 4, &id, None, &cancel)
.upload(content(), 4, &id, None)
.await
.expect_err("upload with too short size succeeded");
storage
.upload(content(), 6, &id, None, &cancel)
.upload(content(), 6, &id, None)
.await
.expect_err("upload with too large size succeeded");
// Correct size is 5, this should succeed.
storage.upload(content(), 5, &id, None, &cancel).await?;
storage.upload(content(), 5, &id, None).await?;
Ok(())
}
fn create_storage() -> anyhow::Result<(LocalFs, CancellationToken)> {
fn create_storage() -> anyhow::Result<LocalFs> {
let storage_root = tempdir()?.path().to_path_buf();
LocalFs::new(storage_root, Duration::from_secs(120)).map(|s| (s, CancellationToken::new()))
LocalFs::new(storage_root)
}
#[tokio::test]
async fn download_file() -> anyhow::Result<()> {
let (storage, cancel) = create_storage()?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
let contents = read_and_check_metadata(&storage, &upload_target, None).await?;
assert_eq!(
@@ -717,7 +612,7 @@ mod fs_tests {
);
let non_existing_path = "somewhere/else";
match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await {
match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await {
Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
}
@@ -726,9 +621,9 @@ mod fs_tests {
#[tokio::test]
async fn download_file_range_positive() -> anyhow::Result<()> {
let (storage, cancel) = create_storage()?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
let full_range_download_contents =
read_and_check_metadata(&storage, &upload_target, None).await?;
@@ -742,12 +637,7 @@ mod fs_tests {
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
let first_part_download = storage
.download_byte_range(
&upload_target,
0,
Some(first_part_local.len() as u64),
&cancel,
)
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
.await?;
assert!(
first_part_download.metadata.is_none(),
@@ -765,7 +655,6 @@ mod fs_tests {
&upload_target,
first_part_local.len() as u64,
Some((first_part_local.len() + second_part_local.len()) as u64),
&cancel,
)
.await?;
assert!(
@@ -780,7 +669,7 @@ mod fs_tests {
);
let suffix_bytes = storage
.download_byte_range(&upload_target, 13, None, &cancel)
.download_byte_range(&upload_target, 13, None)
.await?
.download_stream;
let suffix_bytes = aggregate(suffix_bytes).await?;
@@ -788,7 +677,7 @@ mod fs_tests {
assert_eq!(upload_name, suffix);
let all_bytes = storage
.download_byte_range(&upload_target, 0, None, &cancel)
.download_byte_range(&upload_target, 0, None)
.await?
.download_stream;
let all_bytes = aggregate(all_bytes).await?;
@@ -800,9 +689,9 @@ mod fs_tests {
#[tokio::test]
async fn download_file_range_negative() -> anyhow::Result<()> {
let (storage, cancel) = create_storage()?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
let start = 1_000_000_000;
let end = start + 1;
@@ -811,7 +700,6 @@ mod fs_tests {
&upload_target,
start,
Some(end), // exclusive end
&cancel,
)
.await
{
@@ -828,7 +716,7 @@ mod fs_tests {
let end = 234;
assert!(start > end, "Should test an incorrect range");
match storage
.download_byte_range(&upload_target, start, Some(end), &cancel)
.download_byte_range(&upload_target, start, Some(end))
.await
{
Ok(_) => panic!("Should not allow downloading wrong ranges"),
@@ -845,15 +733,15 @@ mod fs_tests {
#[tokio::test]
async fn delete_file() -> anyhow::Result<()> {
let (storage, cancel) = create_storage()?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
storage.delete(&upload_target, &cancel).await?;
storage.delete(&upload_target).await?;
assert!(storage.list_all().await?.is_empty());
storage
.delete(&upload_target, &cancel)
.delete(&upload_target)
.await
.expect("Should allow deleting non-existing storage files");
@@ -862,14 +750,14 @@ mod fs_tests {
#[tokio::test]
async fn file_with_metadata() -> anyhow::Result<()> {
let (storage, cancel) = create_storage()?;
let storage = create_storage()?;
let upload_name = "upload_1";
let metadata = StorageMetadata(HashMap::from([
("one".to_string(), "1".to_string()),
("two".to_string(), "2".to_string()),
]));
let upload_target =
upload_dummy_file(&storage, upload_name, Some(metadata.clone()), &cancel).await?;
upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;
let full_range_download_contents =
read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?;
@@ -883,12 +771,7 @@ mod fs_tests {
let (first_part_local, _) = uploaded_bytes.split_at(3);
let partial_download_with_metadata = storage
.download_byte_range(
&upload_target,
0,
Some(first_part_local.len() as u64),
&cancel,
)
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
.await?;
let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
assert_eq!(
@@ -909,20 +792,16 @@ mod fs_tests {
#[tokio::test]
async fn list() -> anyhow::Result<()> {
// No delimiter: should recursively list everything
let (storage, cancel) = create_storage()?;
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
let storage = create_storage()?;
let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
let listing = storage
.list(None, ListingMode::NoDelimiter, None, &cancel)
.await?;
let listing = storage.list(None, ListingMode::NoDelimiter, None).await?;
assert!(listing.prefixes.is_empty());
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
// Delimiter: should only go one deep
let listing = storage
.list(None, ListingMode::WithDelimiter, None, &cancel)
.await?;
let listing = storage.list(None, ListingMode::WithDelimiter, None).await?;
assert_eq!(
listing.prefixes,
@@ -936,7 +815,6 @@ mod fs_tests {
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(
@@ -949,75 +827,10 @@ mod fs_tests {
Ok(())
}
#[tokio::test]
async fn overwrite_shorter_file() -> anyhow::Result<()> {
let (storage, cancel) = create_storage()?;
let path = RemotePath::new("does/not/matter/file".into())?;
let body = Bytes::from_static(b"long file contents is long");
{
let len = body.len();
let body =
futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
storage.upload(body, len, &path, None, &cancel).await?;
}
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
assert_eq!(body, read);
let shorter = Bytes::from_static(b"shorter body");
{
let len = shorter.len();
let body =
futures::stream::once(futures::future::ready(std::io::Result::Ok(shorter.clone())));
storage.upload(body, len, &path, None, &cancel).await?;
}
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
assert_eq!(shorter, read);
Ok(())
}
#[tokio::test]
async fn cancelled_upload_can_later_be_retried() -> anyhow::Result<()> {
let (storage, cancel) = create_storage()?;
let path = RemotePath::new("does/not/matter/file".into())?;
let body = Bytes::from_static(b"long file contents is long");
{
let len = body.len();
let body =
futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
let cancel = cancel.child_token();
cancel.cancel();
let e = storage
.upload(body, len, &path, None, &cancel)
.await
.unwrap_err();
assert!(TimeoutOrCancel::caused_by_cancel(&e));
}
{
let len = body.len();
let body =
futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
storage.upload(body, len, &path, None, &cancel).await?;
}
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
assert_eq!(body, read);
Ok(())
}
async fn upload_dummy_file(
storage: &LocalFs,
name: &str,
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<RemotePath> {
let from_path = storage
.storage_root
@@ -1039,9 +852,7 @@ mod fs_tests {
let file = tokio_util::io::ReaderStream::new(file);
storage
.upload(file, size, &relative_path, metadata, cancel)
.await?;
storage.upload(file, size, &relative_path, metadata).await?;
Ok(relative_path)
}

View File

@@ -11,7 +11,7 @@ use std::{
pin::Pin,
sync::Arc,
task::{Context, Poll},
time::{Duration, SystemTime},
time::SystemTime,
};
use anyhow::{anyhow, Context as _};
@@ -46,9 +46,9 @@ use utils::backoff;
use super::StorageMetadata;
use crate::{
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode,
RemotePath, RemoteStorage, S3Config, TimeTravelError, MAX_KEYS_PER_DELETE,
REMOTE_STORAGE_PREFIX_SEPARATOR,
};
pub(super) mod metrics;
@@ -63,8 +63,6 @@ pub struct S3Bucket {
prefix_in_bucket: Option<String>,
max_keys_per_list_response: Option<i32>,
concurrency_limiter: ConcurrencyLimiter,
// Per-request timeout. Accessible for tests.
pub timeout: Duration,
}
struct GetObjectRequest {
@@ -74,7 +72,7 @@ struct GetObjectRequest {
}
impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
tracing::debug!(
"Creating s3 remote storage for S3 bucket {}",
aws_config.bucket_name
@@ -154,7 +152,6 @@ impl S3Bucket {
max_keys_per_list_response: aws_config.max_keys_per_list_response,
prefix_in_bucket,
concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
timeout,
})
}
@@ -188,55 +185,40 @@ impl S3Bucket {
}
}
async fn permit(
&self,
kind: RequestKind,
cancel: &CancellationToken,
) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
let started_at = start_counting_cancelled_wait(kind);
let acquire = self.concurrency_limiter.acquire(kind);
let permit = tokio::select! {
permit = acquire => permit.expect("semaphore is never closed"),
_ = cancel.cancelled() => return Err(Cancelled),
};
let permit = self
.concurrency_limiter
.acquire(kind)
.await
.expect("semaphore is never closed");
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
Ok(permit)
permit
}
async fn owned_permit(
&self,
kind: RequestKind,
cancel: &CancellationToken,
) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
let started_at = start_counting_cancelled_wait(kind);
let acquire = self.concurrency_limiter.acquire_owned(kind);
let permit = tokio::select! {
permit = acquire => permit.expect("semaphore is never closed"),
_ = cancel.cancelled() => return Err(Cancelled),
};
let permit = self
.concurrency_limiter
.acquire_owned(kind)
.await
.expect("semaphore is never closed");
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
Ok(permit)
permit
}
async fn download_object(
&self,
request: GetObjectRequest,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
let kind = RequestKind::Get;
let permit = self.owned_permit(kind, cancel).await?;
let permit = self.owned_permit(kind).await;
let started_at = start_measuring_requests(kind);
@@ -246,13 +228,8 @@ impl S3Bucket {
.bucket(request.bucket)
.key(request.key)
.set_range(request.range)
.send();
let get_object = tokio::select! {
res = get_object => res,
_ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
_ = cancel.cancelled() => return Err(DownloadError::Cancelled),
};
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
@@ -282,10 +259,6 @@ impl S3Bucket {
}
};
// even if we would have no timeout left, continue anyways. the caller can decide to ignore
// the errors considering timeouts and cancellation.
let remaining = self.timeout.saturating_sub(started_at.elapsed());
let metadata = object_output.metadata().cloned().map(StorageMetadata);
let etag = object_output.e_tag;
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
@@ -295,9 +268,6 @@ impl S3Bucket {
let body = PermitCarrying::new(permit, body);
let body = TimedDownload::new(started_at, body);
let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone());
let body = crate::support::DownloadStream::new(cancel_or_timeout, body);
Ok(Download {
metadata,
etag,
@@ -308,44 +278,33 @@ impl S3Bucket {
async fn delete_oids(
&self,
_permit: &tokio::sync::SemaphorePermit<'_>,
kind: RequestKind,
delete_objects: &[ObjectIdentifier],
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Delete;
let mut cancel = std::pin::pin!(cancel.cancelled());
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
let started_at = start_measuring_requests(kind);
let req = self
let resp = self
.client
.delete_objects()
.bucket(self.bucket_name.clone())
.delete(
Delete::builder()
.set_objects(Some(chunk.to_vec()))
.build()
.context("build request")?,
.build()?,
)
.send();
let resp = tokio::select! {
resp = req => resp,
_ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()),
_ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
};
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
let resp = resp.context("request deletion")?;
let resp = resp?;
metrics::BUCKET_METRICS
.deleted_objects_total
.inc_by(chunk.len() as u64);
if let Some(errors) = resp.errors {
// Log a bounded number of the errors within the response:
// these requests can carry 1000 keys so logging each one
@@ -361,10 +320,9 @@ impl S3Bucket {
);
}
return Err(anyhow::anyhow!(
"Failed to delete {}/{} objects",
errors.len(),
chunk.len(),
return Err(anyhow::format_err!(
"Failed to delete {} objects",
errors.len()
));
}
}
@@ -452,7 +410,6 @@ impl RemoteStorage for S3Bucket {
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Listing, DownloadError> {
let kind = RequestKind::List;
// s3 sdk wants i32
@@ -474,11 +431,10 @@ impl RemoteStorage for S3Bucket {
p
});
let _permit = self.permit(kind, cancel).await?;
let mut continuation_token = None;
loop {
let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind);
// min of two Options, returning Some if one is value and another is
@@ -500,15 +456,9 @@ impl RemoteStorage for S3Bucket {
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}
let request = request.send();
let response = tokio::select! {
res = request => res,
_ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
_ = cancel.cancelled() => return Err(DownloadError::Cancelled),
};
let response = response
let response = request
.send()
.await
.context("Failed to list S3 prefixes")
.map_err(DownloadError::Other);
@@ -561,17 +511,16 @@ impl RemoteStorage for S3Bucket {
from_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Put;
let _permit = self.permit(kind, cancel).await?;
let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind);
let body = Body::wrap_stream(from);
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
let upload = self
let res = self
.client
.put_object()
.bucket(self.bucket_name.clone())
@@ -579,63 +528,8 @@ impl RemoteStorage for S3Bucket {
.set_metadata(metadata.map(|m| m.0))
.content_length(from_size_bytes.try_into()?)
.body(bytes_stream)
.send();
let upload = tokio::time::timeout(self.timeout, upload);
let res = tokio::select! {
res = upload => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
if let Ok(inner) = &res {
// do not incl. timeouts as errors in metrics but cancellations
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, inner, started_at);
}
match res {
Ok(Ok(_put)) => Ok(()),
Ok(Err(sdk)) => Err(sdk.into()),
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
}
}
async fn copy(
&self,
from: &RemotePath,
to: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Copy;
let _permit = self.permit(kind, cancel).await?;
let timeout = tokio::time::sleep(self.timeout);
let started_at = start_measuring_requests(kind);
// we need to specify bucket_name as a prefix
let copy_source = format!(
"{}/{}",
self.bucket_name,
self.relative_path_to_s3_object(from)
);
let op = self
.client
.copy_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
.copy_source(copy_source)
.send();
let res = tokio::select! {
res = op => res,
_ = timeout => return Err(TimeoutOrCancel::Timeout.into()),
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
@@ -647,21 +541,46 @@ impl RemoteStorage for S3Bucket {
Ok(())
}
async fn download(
&self,
from: &RemotePath,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
let kind = RequestKind::Copy;
let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind);
// we need to specify bucket_name as a prefix
let copy_source = format!(
"{}/{}",
self.bucket_name,
self.relative_path_to_s3_object(from)
);
let res = self
.client
.copy_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
.copy_source(copy_source)
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res?;
Ok(())
}
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
// if prefix is not none then download file `prefix/from`
// if prefix is none then download file `from`
self.download_object(
GetObjectRequest {
bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from),
range: None,
},
cancel,
)
self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from),
range: None,
})
.await
}
@@ -670,7 +589,6 @@ impl RemoteStorage for S3Bucket {
from: &RemotePath,
start_inclusive: u64,
end_exclusive: Option<u64>,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
// and needs both ends to be exclusive
@@ -680,39 +598,31 @@ impl RemoteStorage for S3Bucket {
None => format!("bytes={start_inclusive}-"),
});
self.download_object(
GetObjectRequest {
bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from),
range,
},
cancel,
)
self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from),
range,
})
.await
}
async fn delete_objects<'a>(
&self,
paths: &'a [RemotePath],
cancel: &CancellationToken,
) -> anyhow::Result<()> {
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
let kind = RequestKind::Delete;
let permit = self.permit(kind, cancel).await?;
let _guard = self.permit(kind).await;
let mut delete_objects = Vec::with_capacity(paths.len());
for path in paths {
let obj_id = ObjectIdentifier::builder()
.set_key(Some(self.relative_path_to_s3_object(path)))
.build()
.context("convert path to oid")?;
.build()?;
delete_objects.push(obj_id);
}
self.delete_oids(&permit, &delete_objects, cancel).await
self.delete_oids(kind, &delete_objects).await
}
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let paths = std::array::from_ref(path);
self.delete_objects(paths, cancel).await
self.delete_objects(paths).await
}
async fn time_travel_recover(
@@ -723,7 +633,7 @@ impl RemoteStorage for S3Bucket {
cancel: &CancellationToken,
) -> Result<(), TimeTravelError> {
let kind = RequestKind::TimeTravel;
let permit = self.permit(kind, cancel).await?;
let _guard = self.permit(kind).await;
let timestamp = DateTime::from(timestamp);
let done_if_after = DateTime::from(done_if_after);
@@ -737,7 +647,7 @@ impl RemoteStorage for S3Bucket {
let warn_threshold = 3;
let max_retries = 10;
let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
let is_permanent = |_e: &_| false;
let mut key_marker = None;
let mut version_id_marker = None;
@@ -746,19 +656,15 @@ impl RemoteStorage for S3Bucket {
loop {
let response = backoff::retry(
|| async {
let op = self
.client
self.client
.list_object_versions()
.bucket(self.bucket_name.clone())
.set_prefix(prefix.clone())
.set_key_marker(key_marker.clone())
.set_version_id_marker(version_id_marker.clone())
.send();
tokio::select! {
res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
_ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
}
.send()
.await
.map_err(|e| TimeTravelError::Other(e.into()))
},
is_permanent,
warn_threshold,
@@ -880,18 +786,14 @@ impl RemoteStorage for S3Bucket {
backoff::retry(
|| async {
let op = self
.client
self.client
.copy_object()
.bucket(self.bucket_name.clone())
.key(key)
.copy_source(&source_id)
.send();
tokio::select! {
res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
_ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
}
.send()
.await
.map_err(|e| TimeTravelError::Other(e.into()))
},
is_permanent,
warn_threshold,
@@ -922,18 +824,10 @@ impl RemoteStorage for S3Bucket {
let oid = ObjectIdentifier::builder()
.key(key.to_owned())
.build()
.map_err(|e| TimeTravelError::Other(e.into()))?;
self.delete_oids(&permit, &[oid], cancel)
.map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
self.delete_oids(kind, &[oid])
.await
.map_err(|e| {
// delete_oid0 will use TimeoutOrCancel
if TimeoutOrCancel::caused_by_cancel(&e) {
TimeTravelError::Cancelled
} else {
TimeTravelError::Other(e)
}
})?;
.map_err(TimeTravelError::Other)?;
}
}
}
@@ -1069,8 +963,7 @@ mod tests {
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: Some(5),
};
let storage =
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
let storage = S3Bucket::new(&config).expect("remote storage init");
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
let result = storage.relative_path_to_s3_object(test_path);
let expected = expected_outputs[prefix_idx][test_path_idx];

View File

@@ -90,16 +90,11 @@ impl UnreliableWrapper {
}
}
async fn delete_inner(
&self,
path: &RemotePath,
attempt: bool,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
if attempt {
self.attempt(RemoteOp::Delete(path.clone()))?;
}
self.inner.delete(path, cancel).await
self.inner.delete(path).await
}
}
@@ -110,22 +105,20 @@ impl RemoteStorage for UnreliableWrapper {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
.map_err(DownloadError::Other)?;
self.inner.list_prefixes(prefix, cancel).await
self.inner.list_prefixes(prefix).await
}
async fn list_files(
&self,
folder: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
.map_err(DownloadError::Other)?;
self.inner.list_files(folder, max_keys, cancel).await
self.inner.list_files(folder, max_keys).await
}
async fn list(
@@ -133,11 +126,10 @@ impl RemoteStorage for UnreliableWrapper {
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Listing, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
.map_err(DownloadError::Other)?;
self.inner.list(prefix, mode, max_keys, cancel).await
self.inner.list(prefix, mode, max_keys).await
}
async fn upload(
@@ -148,22 +140,15 @@ impl RemoteStorage for UnreliableWrapper {
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
self.attempt(RemoteOp::Upload(to.clone()))?;
self.inner
.upload(data, data_size_bytes, to, metadata, cancel)
.await
self.inner.upload(data, data_size_bytes, to, metadata).await
}
async fn download(
&self,
from: &RemotePath,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
self.attempt(RemoteOp::Download(from.clone()))
.map_err(DownloadError::Other)?;
self.inner.download(from, cancel).await
self.inner.download(from).await
}
async fn download_byte_range(
@@ -171,7 +156,6 @@ impl RemoteStorage for UnreliableWrapper {
from: &RemotePath,
start_inclusive: u64,
end_exclusive: Option<u64>,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
// Note: We treat any download_byte_range as an "attempt" of the same
// operation. We don't pay attention to the ranges. That's good enough
@@ -179,24 +163,20 @@ impl RemoteStorage for UnreliableWrapper {
self.attempt(RemoteOp::Download(from.clone()))
.map_err(DownloadError::Other)?;
self.inner
.download_byte_range(from, start_inclusive, end_exclusive, cancel)
.download_byte_range(from, start_inclusive, end_exclusive)
.await
}
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
self.delete_inner(path, true, cancel).await
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
self.delete_inner(path, true).await
}
async fn delete_objects<'a>(
&self,
paths: &'a [RemotePath],
cancel: &CancellationToken,
) -> anyhow::Result<()> {
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
let mut error_counter = 0;
for path in paths {
// Dont record attempt because it was already recorded above
if (self.delete_inner(path, false, cancel).await).is_err() {
if (self.delete_inner(path, false).await).is_err() {
error_counter += 1;
}
}
@@ -209,16 +189,11 @@ impl RemoteStorage for UnreliableWrapper {
Ok(())
}
async fn copy(
&self,
from: &RemotePath,
to: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
// copy is equivalent to download + upload
self.attempt(RemoteOp::Download(from.clone()))?;
self.attempt(RemoteOp::Upload(to.clone()))?;
self.inner.copy_object(from, to, cancel).await
self.inner.copy_object(from, to).await
}
async fn time_travel_recover(

View File

@@ -1,15 +1,9 @@
use std::{
future::Future,
pin::Pin,
task::{Context, Poll},
time::Duration,
};
use bytes::Bytes;
use futures_util::Stream;
use tokio_util::sync::CancellationToken;
use crate::TimeoutOrCancel;
pin_project_lite::pin_project! {
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
@@ -37,139 +31,3 @@ impl<S: Stream> Stream for PermitCarrying<S> {
self.inner.size_hint()
}
}
pin_project_lite::pin_project! {
pub(crate) struct DownloadStream<F, S> {
hit: bool,
#[pin]
cancellation: F,
#[pin]
inner: S,
}
}
impl<F, S> DownloadStream<F, S> {
pub(crate) fn new(cancellation: F, inner: S) -> Self {
Self {
cancellation,
hit: false,
inner,
}
}
}
/// See documentation on [`crate::DownloadStream`] on rationale why `std::io::Error` is used.
impl<E, F, S> Stream for DownloadStream<F, S>
where
std::io::Error: From<E>,
F: Future<Output = E>,
S: Stream<Item = std::io::Result<Bytes>>,
{
type Item = <S as Stream>::Item;
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
let this = self.project();
if !*this.hit {
if let Poll::Ready(e) = this.cancellation.poll(cx) {
*this.hit = true;
// most likely this will be a std::io::Error wrapping a DownloadError
let e = Err(std::io::Error::from(e));
return Poll::Ready(Some(e));
}
}
this.inner.poll_next(cx)
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
/// Fires only on the first cancel or timeout, not on both.
pub(crate) async fn cancel_or_timeout(
timeout: Duration,
cancel: CancellationToken,
) -> TimeoutOrCancel {
tokio::select! {
_ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout,
_ = cancel.cancelled() => TimeoutOrCancel::Cancel,
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::DownloadError;
use futures::stream::StreamExt;
#[tokio::test(start_paused = true)]
async fn cancelled_download_stream() {
let inner = futures::stream::pending();
let timeout = Duration::from_secs(120);
let cancel = CancellationToken::new();
let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
let mut stream = std::pin::pin!(stream);
let mut first = stream.next();
tokio::select! {
_ = &mut first => unreachable!("we haven't yet cancelled nor is timeout passed"),
_ = tokio::time::sleep(Duration::from_secs(1)) => {},
}
cancel.cancel();
let e = first.await.expect("there must be some").unwrap_err();
assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
let inner = e.get_ref().expect("inner should be set");
assert!(
inner
.downcast_ref::<DownloadError>()
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
"{inner:?}"
);
let e = DownloadError::from(e);
assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
tokio::select! {
_ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
_ = tokio::time::sleep(Duration::from_secs(121)) => {},
}
}
#[tokio::test(start_paused = true)]
async fn timeouted_download_stream() {
let inner = futures::stream::pending();
let timeout = Duration::from_secs(120);
let cancel = CancellationToken::new();
let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
let mut stream = std::pin::pin!(stream);
// because the stream uses 120s timeout and we are paused, we advance to 120s right away.
let first = stream.next();
let e = first.await.expect("there must be some").unwrap_err();
assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
let inner = e.get_ref().expect("inner should be set");
assert!(
inner
.downcast_ref::<DownloadError>()
.is_some_and(|e| matches!(e, DownloadError::Timeout)),
"{inner:?}"
);
let e = DownloadError::from(e);
assert!(matches!(e, DownloadError::Timeout), "{e:?}");
cancel.cancel();
tokio::select! {
_ = stream.next() => unreachable!("no cancellation ever happens because we already timed out"),
_ = tokio::time::sleep(Duration::from_secs(121)) => {},
}
}
}

View File

@@ -10,7 +10,6 @@ use futures::stream::Stream;
use once_cell::sync::OnceCell;
use remote_storage::{Download, GenericRemoteStorage, RemotePath};
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info};
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
@@ -59,12 +58,8 @@ pub(crate) async fn upload_simple_remote_data(
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
info!("Creating {upload_tasks_count} remote files");
let mut upload_tasks = JoinSet::new();
let cancel = CancellationToken::new();
for i in 1..upload_tasks_count + 1 {
let task_client = Arc::clone(client);
let cancel = cancel.clone();
upload_tasks.spawn(async move {
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
let blob_path = RemotePath::new(
@@ -74,9 +69,7 @@ pub(crate) async fn upload_simple_remote_data(
debug!("Creating remote item {i} at path {blob_path:?}");
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
task_client
.upload(data, len, &blob_path, None, &cancel)
.await?;
task_client.upload(data, len, &blob_path, None).await?;
Ok::<_, anyhow::Error>(blob_path)
});
@@ -114,15 +107,13 @@ pub(crate) async fn cleanup(
"Removing {} objects from the remote storage during cleanup",
objects_to_delete.len()
);
let cancel = CancellationToken::new();
let mut delete_tasks = JoinSet::new();
for object_to_delete in objects_to_delete {
let task_client = Arc::clone(client);
let cancel = cancel.clone();
delete_tasks.spawn(async move {
debug!("Deleting remote item at path {object_to_delete:?}");
task_client
.delete(&object_to_delete, &cancel)
.delete(&object_to_delete)
.await
.with_context(|| format!("{object_to_delete:?} removal"))
});
@@ -150,12 +141,8 @@ pub(crate) async fn upload_remote_data(
) -> ControlFlow<Uploads, Uploads> {
info!("Creating {upload_tasks_count} remote files");
let mut upload_tasks = JoinSet::new();
let cancel = CancellationToken::new();
for i in 1..upload_tasks_count + 1 {
let task_client = Arc::clone(client);
let cancel = cancel.clone();
upload_tasks.spawn(async move {
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
@@ -165,9 +152,7 @@ pub(crate) async fn upload_remote_data(
let (data, data_len) =
upload_stream(format!("remote blob data {i}").into_bytes().into());
task_client
.upload(data, data_len, &blob_path, None, &cancel)
.await?;
task_client.upload(data, data_len, &blob_path, None).await?;
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
});

View File

@@ -4,7 +4,6 @@ use remote_storage::RemotePath;
use std::sync::Arc;
use std::{collections::HashSet, num::NonZeroU32};
use test_context::test_context;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::common::{download_to_vec, upload_stream, wrap_stream};
@@ -46,15 +45,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
}
};
let cancel = CancellationToken::new();
let test_client = Arc::clone(&ctx.enabled.client);
let expected_remote_prefixes = ctx.remote_prefixes.clone();
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list_prefixes(None, &cancel)
.list_prefixes(None)
.await
.context("client list root prefixes failure")?
.into_iter()
@@ -65,7 +62,7 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
);
let nested_remote_prefixes = test_client
.list_prefixes(Some(&base_prefix), &cancel)
.list_prefixes(Some(&base_prefix))
.await
.context("client list nested prefixes failure")?
.into_iter()
@@ -102,12 +99,11 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
anyhow::bail!("S3 init failed: {e:?}")
}
};
let cancel = CancellationToken::new();
let test_client = Arc::clone(&ctx.enabled.client);
let base_prefix =
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list_files(None, None, &cancel)
.list_files(None, None)
.await
.context("client list root files failure")?
.into_iter()
@@ -121,13 +117,13 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
// Test that max_keys limit works. In total there are about 21 files (see
// upload_simple_remote_data call in test_real_s3.rs).
let limited_root_files = test_client
.list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
.list_files(None, Some(NonZeroU32::new(2).unwrap()))
.await
.context("client list root files failure")?;
assert_eq!(limited_root_files.len(), 2);
let nested_remote_files = test_client
.list_files(Some(&base_prefix), None, &cancel)
.list_files(Some(&base_prefix), None)
.await
.context("client list nested files failure")?
.into_iter()
@@ -154,17 +150,12 @@ async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Resu
MaybeEnabledStorage::Disabled => return Ok(()),
};
let cancel = CancellationToken::new();
let path = RemotePath::new(Utf8Path::new(
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
ctx.client
.delete(&path, &cancel)
.await
.expect("should succeed");
ctx.client.delete(&path).await.expect("should succeed");
Ok(())
}
@@ -177,8 +168,6 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
MaybeEnabledStorage::Disabled => return Ok(()),
};
let cancel = CancellationToken::new();
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
@@ -189,21 +178,21 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
.with_context(|| "RemotePath conversion")?;
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None, &cancel).await?;
ctx.client.upload(data, len, &path1, None).await?;
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
ctx.client.upload(data, len, &path2, None, &cancel).await?;
ctx.client.upload(data, len, &path2, None).await?;
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None, &cancel).await?;
ctx.client.upload(data, len, &path3, None).await?;
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
ctx.client.delete_objects(&[path1, path2]).await?;
let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
let prefixes = ctx.client.list_prefixes(None).await?;
assert_eq!(prefixes.len(), 1);
ctx.client.delete_objects(&[path3], &cancel).await?;
ctx.client.delete_objects(&[path3]).await?;
Ok(())
}
@@ -215,8 +204,6 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
return Ok(());
};
let cancel = CancellationToken::new();
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
@@ -224,56 +211,47 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None, &cancel).await?;
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
let dl = ctx.client.download(&path, &cancel).await?;
let dl = ctx.client.download(&path).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// Full range (end specified)
let dl = ctx
.client
.download_byte_range(&path, 0, Some(len as u64), &cancel)
.download_byte_range(&path, 0, Some(len as u64))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// partial range (end specified)
let dl = ctx
.client
.download_byte_range(&path, 4, Some(10), &cancel)
.await?;
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..10]);
// partial range (end beyond real end)
let dl = ctx
.client
.download_byte_range(&path, 8, Some(len as u64 * 100), &cancel)
.download_byte_range(&path, 8, Some(len as u64 * 100))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[8..]);
// Partial range (end unspecified)
let dl = ctx
.client
.download_byte_range(&path, 4, None, &cancel)
.await?;
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..]);
// Full range (end unspecified)
let dl = ctx
.client
.download_byte_range(&path, 0, None, &cancel)
.await?;
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete(&path, &cancel)
.delete(&path)
.await
.with_context(|| format!("{path:?} removal"))?;
@@ -287,8 +265,6 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
return Ok(());
};
let cancel = CancellationToken::new();
let path = RemotePath::new(Utf8Path::new(
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
))
@@ -302,18 +278,18 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None, &cancel).await?;
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
ctx.client.copy_object(&path, &path_dest, &cancel).await?;
ctx.client.copy_object(&path, &path_dest).await?;
let dl = ctx.client.download(&path_dest, &cancel).await?;
let dl = ctx.client.download(&path_dest).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete_objects(&[path.clone(), path_dest.clone()], &cancel)
.delete_objects(&[path.clone(), path_dest.clone()])
.await
.with_context(|| format!("{path:?} removal"))?;

View File

@@ -1,9 +1,9 @@
use std::collections::HashSet;
use std::env;
use std::num::NonZeroUsize;
use std::ops::ControlFlow;
use std::sync::Arc;
use std::time::UNIX_EPOCH;
use std::{collections::HashSet, time::Duration};
use anyhow::Context;
use remote_storage::{
@@ -39,17 +39,6 @@ impl EnabledAzure {
base_prefix: BASE_PREFIX,
}
}
#[allow(unused)] // this will be needed when moving the timeout integration tests back
fn configure_request_timeout(&mut self, timeout: Duration) {
match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") {
GenericRemoteStorage::AzureBlob(azure) => {
let azure = Arc::get_mut(azure).expect("inner Arc::get_mut");
azure.timeout = timeout;
}
_ => unreachable!(),
}
}
}
enum MaybeEnabledStorage {
@@ -224,7 +213,6 @@ fn create_azure_client(
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response,
}),
timeout: Duration::from_secs(120),
};
Ok(Arc::new(
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,

View File

@@ -1,6 +1,5 @@
use std::env;
use std::fmt::{Debug, Display};
use std::future::Future;
use std::num::NonZeroUsize;
use std::ops::ControlFlow;
use std::sync::Arc;
@@ -10,10 +9,9 @@ use std::{collections::HashSet, time::SystemTime};
use crate::common::{download_to_vec, upload_stream};
use anyhow::Context;
use camino::Utf8Path;
use futures_util::StreamExt;
use futures_util::Future;
use remote_storage::{
DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
S3Config,
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
};
use test_context::test_context;
use test_context::AsyncTestContext;
@@ -29,6 +27,7 @@ use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_re
use utils::backoff;
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test";
#[test_context(MaybeEnabledStorage)]
@@ -70,11 +69,8 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
ret
}
async fn list_files(
client: &Arc<GenericRemoteStorage>,
cancel: &CancellationToken,
) -> anyhow::Result<HashSet<RemotePath>> {
Ok(retry(|| client.list_files(None, None, cancel))
async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
Ok(retry(|| client.list_files(None, None))
.await
.context("list root files failure")?
.into_iter()
@@ -94,11 +90,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
retry(|| {
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None, &cancel)
ctx.client.upload(data, len, &path1, None)
})
.await?;
let t0_files = list_files(&ctx.client, &cancel).await?;
let t0_files = list_files(&ctx.client).await?;
let t0 = time_point().await;
println!("at t0: {t0_files:?}");
@@ -106,17 +102,17 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
retry(|| {
let (data, len) = upload_stream(old_data.as_bytes().into());
ctx.client.upload(data, len, &path2, None, &cancel)
ctx.client.upload(data, len, &path2, None)
})
.await?;
let t1_files = list_files(&ctx.client, &cancel).await?;
let t1_files = list_files(&ctx.client).await?;
let t1 = time_point().await;
println!("at t1: {t1_files:?}");
// A little check to ensure that our clock is not too far off from the S3 clock
{
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
let dl = retry(|| ctx.client.download(&path2)).await?;
let last_modified = dl.last_modified.unwrap();
let half_wt = WAIT_TIME.mul_f32(0.5);
let t0_hwt = t0 + half_wt;
@@ -129,7 +125,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
retry(|| {
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None, &cancel)
ctx.client.upload(data, len, &path3, None)
})
.await?;
@@ -137,12 +133,12 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
retry(|| {
let (data, len) = upload_stream(new_data.as_bytes().into());
ctx.client.upload(data, len, &path2, None, &cancel)
ctx.client.upload(data, len, &path2, None)
})
.await?;
retry(|| ctx.client.delete(&path1, &cancel)).await?;
let t2_files = list_files(&ctx.client, &cancel).await?;
retry(|| ctx.client.delete(&path1)).await?;
let t2_files = list_files(&ctx.client).await?;
let t2 = time_point().await;
println!("at t2: {t2_files:?}");
@@ -151,10 +147,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
ctx.client
.time_travel_recover(None, t2, t_final, &cancel)
.await?;
let t2_files_recovered = list_files(&ctx.client, &cancel).await?;
let t2_files_recovered = list_files(&ctx.client).await?;
println!("after recovery to t2: {t2_files_recovered:?}");
assert_eq!(t2_files, t2_files_recovered);
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
assert_eq!(path2_recovered_t2, new_data.as_bytes());
// after recovery to t1: path1 is back, path2 has the old content
@@ -162,10 +158,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
ctx.client
.time_travel_recover(None, t1, t_final, &cancel)
.await?;
let t1_files_recovered = list_files(&ctx.client, &cancel).await?;
let t1_files_recovered = list_files(&ctx.client).await?;
println!("after recovery to t1: {t1_files_recovered:?}");
assert_eq!(t1_files, t1_files_recovered);
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
assert_eq!(path2_recovered_t1, old_data.as_bytes());
// after recovery to t0: everything is gone except for path1
@@ -173,14 +169,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
ctx.client
.time_travel_recover(None, t0, t_final, &cancel)
.await?;
let t0_files_recovered = list_files(&ctx.client, &cancel).await?;
let t0_files_recovered = list_files(&ctx.client).await?;
println!("after recovery to t0: {t0_files_recovered:?}");
assert_eq!(t0_files, t0_files_recovered);
// cleanup
let paths = &[path1, path2, path3];
retry(|| ctx.client.delete_objects(paths, &cancel)).await?;
retry(|| ctx.client.delete_objects(paths)).await?;
Ok(())
}
@@ -201,16 +197,6 @@ impl EnabledS3 {
base_prefix: BASE_PREFIX,
}
}
fn configure_request_timeout(&mut self, timeout: Duration) {
match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") {
GenericRemoteStorage::AwsS3(s3) => {
let s3 = Arc::get_mut(s3).expect("inner Arc::get_mut");
s3.timeout = timeout;
}
_ => unreachable!(),
}
}
}
enum MaybeEnabledStorage {
@@ -384,169 +370,8 @@ fn create_s3_client(
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response,
}),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
Ok(Arc::new(
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
))
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) {
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
return;
};
let cancel = CancellationToken::new();
let path = RemotePath::new(Utf8Path::new(
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
))
.unwrap();
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
let timeout = std::time::Duration::from_secs(5);
ctx.configure_request_timeout(timeout);
let started_at = std::time::Instant::now();
let mut stream = ctx
.client
.download(&path, &cancel)
.await
.expect("download succeeds")
.download_stream;
if started_at.elapsed().mul_f32(0.9) >= timeout {
tracing::warn!(
elapsed_ms = started_at.elapsed().as_millis(),
"timeout might be too low, consumed most of it during headers"
);
}
let first = stream
.next()
.await
.expect("should have the first blob")
.expect("should have succeeded");
tracing::info!(len = first.len(), "downloaded first chunk");
assert!(
first.len() < len,
"uploaded file is too small, we downloaded all on first chunk"
);
tokio::time::sleep(timeout).await;
{
let started_at = std::time::Instant::now();
let next = stream
.next()
.await
.expect("stream should not have ended yet");
tracing::info!(
next.is_err = next.is_err(),
elapsed_ms = started_at.elapsed().as_millis(),
"received item after timeout"
);
let e = next.expect_err("expected an error, but got a chunk?");
let inner = e.get_ref().expect("std::io::Error::inner should be set");
assert!(
inner
.downcast_ref::<DownloadError>()
.is_some_and(|e| matches!(e, DownloadError::Timeout)),
"{inner:?}"
);
}
ctx.configure_request_timeout(RemoteStorageConfig::DEFAULT_TIMEOUT);
ctx.client.delete_objects(&[path], &cancel).await.unwrap()
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
return;
};
let cancel = CancellationToken::new();
let path = RemotePath::new(Utf8Path::new(
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
))
.unwrap();
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
{
let mut stream = ctx
.client
.download(&path, &cancel)
.await
.expect("download succeeds")
.download_stream;
let first = stream
.next()
.await
.expect("should have the first blob")
.expect("should have succeeded");
tracing::info!(len = first.len(), "downloaded first chunk");
assert!(
first.len() < len,
"uploaded file is too small, we downloaded all on first chunk"
);
cancel.cancel();
let next = stream.next().await.expect("stream should have more");
let e = next.expect_err("expected an error, but got a chunk?");
let inner = e.get_ref().expect("std::io::Error::inner should be set");
assert!(
inner
.downcast_ref::<DownloadError>()
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
"{inner:?}"
);
}
let cancel = CancellationToken::new();
ctx.client.delete_objects(&[path], &cancel).await.unwrap();
}
/// Upload a long enough file so that we cannot download it in single chunk
///
/// For s3 the first chunk seems to be less than 10kB, so this has a bit of a safety margin
async fn upload_large_enough_file(
client: &GenericRemoteStorage,
path: &RemotePath,
cancel: &CancellationToken,
) -> usize {
let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
let body = bytes::Bytes::from(vec![0u8; 1024]);
let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128));
let len = contents.clone().fold(0, |acc, next| acc + next.len());
let contents = futures::stream::iter(contents.map(std::io::Result::Ok));
client
.upload(contents, len, path, None, cancel)
.await
.expect("upload succeeds");
len
}

View File

@@ -25,7 +25,6 @@ hyper = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}
jsonwebtoken.workspace = true
leaky-bucket.workspace = true
nix.workspace = true
once_cell.workspace = true
pin-project-lite.workspace = true

View File

@@ -1,3 +1,5 @@
#![allow(unused)]
use criterion::{criterion_group, criterion_main, Criterion};
use utils::id;

View File

@@ -29,11 +29,6 @@ pub enum Scope {
// Should only be used e.g. for status check.
// Currently also used for connection from any pageserver to any safekeeper.
SafekeeperData,
// The scope used by pageservers in upcalls to storage controller and cloud control plane
#[serde(rename = "generations_api")]
GenerationsApi,
// Allows access to control plane managment API and some storage controller endpoints.
Admin,
}
/// JWT payload. See docs/authentication.md for the format

View File

@@ -1,7 +1,7 @@
use std::{
borrow::Cow,
fs::{self, File},
io::{self, Write},
io,
};
use camino::{Utf8Path, Utf8PathBuf};
@@ -161,48 +161,6 @@ pub async fn durable_rename(
Ok(())
}
/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`].
///
/// The file is first written to the specified `tmp_path`, and in a second
/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
/// and atomic rename guarantee that, if we crash at any point, there will never
/// be a partially written file at `final_path` (but maybe at `tmp_path`).
///
/// Callers are responsible for serializing calls of this function for a given `final_path`.
/// If they don't, there may be an error due to conflicting `tmp_path`, or there will
/// be no error and the content of `final_path` will be the "winner" caller's `content`.
/// I.e., the atomticity guarantees still hold.
pub fn overwrite(
final_path: &Utf8Path,
tmp_path: &Utf8Path,
content: &[u8],
) -> std::io::Result<()> {
let Some(final_path_parent) = final_path.parent() else {
return Err(std::io::Error::from_raw_os_error(
nix::errno::Errno::EINVAL as i32,
));
};
std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
let mut file = std::fs::OpenOptions::new()
.write(true)
// Use `create_new` so that, if we race with ourselves or something else,
// we bail out instead of causing damage.
.create_new(true)
.open(tmp_path)?;
file.write_all(content)?;
file.sync_all()?;
drop(file); // don't keep the fd open for longer than we have to
std::fs::rename(tmp_path, final_path)?;
let final_parent_dirfd = std::fs::OpenOptions::new()
.read(true)
.open(final_path_parent)?;
final_parent_dirfd.sync_all()?;
Ok(())
}
#[cfg(test)]
mod tests {

View File

@@ -12,7 +12,6 @@ testing = ["fail/failpoints"]
[dependencies]
anyhow.workspace = true
arc-swap.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
@@ -36,7 +35,6 @@ humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
leaky-bucket.workspace = true
md5.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
@@ -73,7 +71,6 @@ url.workspace = true
walkdir.workspace = true
metrics.workspace = true
pageserver_api.workspace = true
pageserver_compaction.workspace = true
postgres_connection.workspace = true
postgres_ffi.workspace = true
pq_proto.workspace = true
@@ -85,7 +82,7 @@ workspace_hack.workspace = true
reqwest.workspace = true
rpds.workspace = true
enum-map.workspace = true
enumset = { workspace = true, features = ["serde"]}
enumset.workspace = true
strum.workspace = true
strum_macros.workspace = true

View File

@@ -6,28 +6,14 @@
//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
//! logging what happens when a sequential scan is requested on a small table, then picking out two
//! suitable from logs.
//!
//!
//! Reference data (git blame to see commit) on an i3en.3xlarge
// ```text
//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
//! ``
use std::sync::Arc;
use std::sync::{Arc, Barrier};
use bytes::{Buf, Bytes};
use pageserver::{
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
};
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use utils::{id::TenantId, lsn::Lsn};
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -53,11 +39,11 @@ fn redo_scenarios(c: &mut Criterion) {
.build()
.unwrap();
tracing::info!("executing first");
rt.block_on(short().execute(&manager)).unwrap();
short().execute(rt.handle(), &manager).unwrap();
tracing::info!("first executed");
}
let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
let thread_counts = [1, 2, 4, 8, 16];
let mut group = c.benchmark_group("short");
group.sampling_mode(criterion::SamplingMode::Flat);
@@ -88,69 +74,114 @@ fn redo_scenarios(c: &mut Criterion) {
drop(group);
}
/// Sets up a multi-threaded tokio runtime with default worker thread count,
/// then, spawn `requesters` tasks that repeatedly:
/// - get input from `input_factor()`
/// - call `manager.request_redo()` with their input
///
/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
///
/// Using tokio's default worker thread count means the results will differ on machines
/// with different core countrs. We don't care about that, the performance will always
/// be different on different hardware. To compare performance of different software versions,
/// use the same hardware.
/// Sets up `threads` number of requesters to `request_redo`, with the given input.
fn add_multithreaded_walredo_requesters(
b: &mut criterion::Bencher,
nrequesters: usize,
threads: u32,
manager: &Arc<PostgresRedoManager>,
input_factory: fn() -> Request,
) {
assert_ne!(nrequesters, 0);
assert_ne!(threads, 0);
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.unwrap();
if threads == 1 {
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let handle = rt.handle();
b.iter_batched_ref(
|| Some(input_factory()),
|input| execute_all(input.take(), handle, manager),
criterion::BatchSize::PerIteration,
);
} else {
let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize);
let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx));
let mut requesters = JoinSet::new();
for _ in 0..nrequesters {
let _entered = rt.enter();
let manager = manager.clone();
let barrier = barrier.clone();
requesters.spawn(async move {
loop {
let input = input_factory();
barrier.wait().await;
let page = input.execute(&manager).await.unwrap();
assert_eq!(page.remaining(), 8192);
barrier.wait().await;
}
});
let barrier = Arc::new(Barrier::new(threads as usize + 1));
let jhs = (0..threads)
.map(|_| {
std::thread::spawn({
let manager = manager.clone();
let barrier = barrier.clone();
let work_rx = work_rx.clone();
move || {
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let handle = rt.handle();
loop {
// queue up and wait if we want to go another round
if work_rx.lock().unwrap().recv().is_err() {
break;
}
let input = Some(input_factory());
barrier.wait();
execute_all(input, handle, &manager).unwrap();
barrier.wait();
}
}
})
})
.collect::<Vec<_>>();
let _jhs = JoinOnDrop(jhs);
b.iter_batched(
|| {
for _ in 0..threads {
work_tx.send(()).unwrap()
}
},
|()| {
// start the work
barrier.wait();
// wait for work to complete
barrier.wait();
},
criterion::BatchSize::PerIteration,
);
drop(work_tx);
}
}
let do_one_iteration = || {
rt.block_on(async {
barrier.wait().await;
// wait for work to complete
barrier.wait().await;
})
};
struct JoinOnDrop(Vec<std::thread::JoinHandle<()>>);
b.iter_batched(
|| {
// warmup
do_one_iteration();
},
|()| {
// work loop
do_one_iteration();
},
criterion::BatchSize::PerIteration,
);
impl Drop for JoinOnDrop {
// it's not really needless because we want join all then check for panicks
#[allow(clippy::needless_collect)]
fn drop(&mut self) {
// first join all
let results = self.0.drain(..).map(|jh| jh.join()).collect::<Vec<_>>();
// then check the results; panicking here is not great, but it does get the message across
// to the user, and sets an exit value.
results.into_iter().try_for_each(|res| res).unwrap();
}
}
rt.block_on(requesters.shutdown());
fn execute_all<I>(
input: I,
handle: &tokio::runtime::Handle,
manager: &PostgresRedoManager,
) -> anyhow::Result<()>
where
I: IntoIterator<Item = Request>,
{
// just fire all requests as fast as possible
input.into_iter().try_for_each(|req| {
let page = req.execute(handle, manager)?;
assert_eq!(page.remaining(), 8192);
anyhow::Ok(())
})
}
criterion_group!(benches, redo_scenarios);
@@ -462,7 +493,11 @@ struct Request {
}
impl Request {
async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
fn execute(
self,
rt: &tokio::runtime::Handle,
manager: &PostgresRedoManager,
) -> anyhow::Result<Bytes> {
let Request {
key,
lsn,
@@ -471,8 +506,6 @@ impl Request {
pg_version,
} = self;
manager
.request_redo(key, lsn, base_img, records, pg_version)
.await
rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
}
}

View File

@@ -217,20 +217,6 @@ impl Client {
}
}
pub async fn tenant_time_travel_remote_storage(
&self,
tenant_shard_id: TenantShardId,
timestamp: &str,
done_if_after: &str,
) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}",
self.mgmt_api_endpoint
);
self.request(Method::PUT, &uri, ()).await?;
Ok(())
}
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;

View File

@@ -1,54 +0,0 @@
[package]
name = "pageserver_compaction"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
const_format.workspace = true
consumption_metrics.workspace = true
crossbeam-utils.workspace = true
either.workspace = true
flate2.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
itertools.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pin-project-lite.workspace = true
rand.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-io-timeout.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true
utils.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

View File

@@ -1,51 +0,0 @@
# TODO
- If the key space can be perfectly partitioned at some key, perform planning on each
partition separately. For example, if we are compacting a level with layers like this:
```
:
+--+ +----+ : +------+
| | | | : | |
+--+ +----+ : +------+
:
+-----+ +-+ : +--------+
| | | | : | |
+-----+ +-+ : +--------+
:
```
At the dotted line, there is a natural split in the key space, such that all
layers are either on the left or the right of it. We can compact the
partitions separately. We could choose to create image layers for one
partition but not the other one, for example.
- All the layers don't have to be exactly the same size, we can choose to cut a
layer short or stretch it a little larger than the target size, if it helps
the overall system. We can help perfect partitions (see previous bullet point)
to happen more frequently, by choosing the cut points wisely. For example, try
to cut layers at boundaries of underlying image layers. And "snap to grid",
i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.
- Avoid rewriting layers when we'd just create an identical layer to an input
layer.
- Parallelism. The code is already split up into planning and execution, so that
we first split up the compaction work into "Jobs", and then execute them.
It would be straightforward to execute multiple jobs in parallel.
- Materialize extra pages in delta layers during compaction. This would reduce
read amplification. There has been the idea of partial image layers. Materializing
extra pages in the delta layers achieve the same goal, without introducing a new
concept.
## Simulator
- Expand the simulator for more workloads
- Automate a test suite that runs the simluator with different workloads and
spits out a table of results
- Model read amplification
- More sanity checking. One idea is to keep a reference count of each
MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
a MockRecord that is newer than PITR horizon is completely dropped. That would
indicate that the record was lost.

View File

@@ -1,214 +0,0 @@
use clap::{Parser, Subcommand};
use pageserver_compaction::simulator::MockTimeline;
use rand::Rng;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use utils::project_git_version;
project_git_version!(GIT_VERSION);
#[derive(Parser)]
#[command(
version = GIT_VERSION,
about = "Neon Pageserver compaction simulator",
long_about = "A developer tool to visualize and test compaction"
)]
#[command(propagate_version = true)]
struct CliOpts {
#[command(subcommand)]
command: Commands,
}
#[derive(Subcommand)]
enum Commands {
RunSuite,
Simulate(SimulateCmd),
}
#[derive(Clone, clap::ValueEnum)]
enum Distribution {
Uniform,
HotCold,
}
/// Read and update pageserver metadata file
#[derive(Parser)]
struct SimulateCmd {
distribution: Distribution,
/// Number of records to digest
num_records: u64,
/// Record length
record_len: u64,
// Logical database size in MB
logical_size: u64,
}
async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
let mut executor = MockTimeline::new();
// Convert the logical size in MB into a key range.
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
//let key_range = u64::MIN..u64::MAX;
println!(
"starting simulation with key range {:016X}-{:016X}",
key_range.start, key_range.end
);
// helper function to print progress indicator
let print_progress = |i| -> anyhow::Result<()> {
if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 {
print!(
"\ringested {} / {} records, {} MiB / {} MiB...",
i + 1,
cmd.num_records,
(i + 1) * cmd.record_len / (1_000_000),
cmd.num_records * cmd.record_len / (1_000_000),
);
std::io::stdout().flush()?;
}
Ok(())
};
match cmd.distribution {
Distribution::Uniform => {
for i in 0..cmd.num_records {
executor.ingest_uniform(1, cmd.record_len, &key_range)?;
executor.compact_if_needed().await?;
print_progress(i)?;
}
}
Distribution::HotCold => {
let splitpoint = key_range.start + (key_range.end - key_range.start) / 10;
let hot_key_range = 0..splitpoint;
let cold_key_range = splitpoint..key_range.end;
for i in 0..cmd.num_records {
let chosen_range = if rand::thread_rng().gen_bool(0.9) {
&hot_key_range
} else {
&cold_key_range
};
executor.ingest_uniform(1, cmd.record_len, chosen_range)?;
executor.compact_if_needed().await?;
print_progress(i)?;
}
}
}
println!("done!");
executor.flush_l0();
executor.compact_if_needed().await?;
let stats = executor.stats()?;
// Print the stats to stdout, and also to a file
print!("{stats}");
std::fs::write(results_path.join("stats.txt"), stats)?;
let animation_path = results_path.join("compaction-animation.html");
executor.draw_history(std::fs::File::create(&animation_path)?)?;
println!(
"animation: file://{}",
animation_path.canonicalize()?.display()
);
Ok(())
}
async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> {
std::fs::create_dir(results_path)?;
set_log_file(File::create(results_path.join("log"))?);
let result = simulate(workload, results_path).await;
set_log_stdout();
result
}
async fn run_suite() -> anyhow::Result<()> {
let top_results_path = PathBuf::from(format!(
"compaction-suite-results.{}",
std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs()
));
std::fs::create_dir(&top_results_path)?;
let workload = SimulateCmd {
distribution: Distribution::Uniform,
// Generate 20 GB of WAL
record_len: 1_000,
num_records: 20_000_000,
// Logical size 5 GB
logical_size: 5_000,
};
run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?;
println!(
"All tests finished. Results in {}",
top_results_path.display()
);
Ok(())
}
use std::fs::File;
use std::io::Stdout;
use std::sync::Mutex;
use tracing_subscriber::fmt::writer::EitherWriter;
use tracing_subscriber::fmt::MakeWriter;
static LOG_FILE: OnceLock<Mutex<EitherWriter<File, Stdout>>> = OnceLock::new();
fn get_log_output() -> &'static Mutex<EitherWriter<File, Stdout>> {
LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout())))
}
fn set_log_file(f: File) {
*get_log_output().lock().unwrap() = EitherWriter::A(f);
}
fn set_log_stdout() {
*get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout());
}
fn init_logging() -> anyhow::Result<()> {
// We fall back to printing all spans at info-level or above if
// the RUST_LOG environment variable is not set.
let rust_log_env_filter = || {
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
};
// NB: the order of the with() calls does not matter.
// See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
use tracing_subscriber::prelude::*;
tracing_subscriber::registry()
.with({
let log_layer = tracing_subscriber::fmt::layer()
.with_target(false)
.with_ansi(false)
.with_writer(|| get_log_output().make_writer());
log_layer.with_filter(rust_log_env_filter())
})
.init();
Ok(())
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = CliOpts::parse();
init_logging()?;
match cli.command {
Commands::Simulate(cmd) => {
simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?;
}
Commands::RunSuite => {
run_suite().await?;
}
};
Ok(())
}

View File

@@ -1,866 +0,0 @@
//! # Tiered compaction algorithm.
//!
//! Read all the input delta files, and write a new set of delta files that
//! include all the input WAL records. See retile_deltas().
//!
//! In a "normal" LSM tree, you get to remove any values that are overwritten by
//! later values, but in our system, we keep all the history. So the reshuffling
//! doesn't remove any garbage, it just reshuffles the records to reduce read
//! amplification, i.e. the number of files that you need to access to find the
//! WAL records for a given key.
//!
//! If the new delta files would be very "narrow", i.e. each file would cover
//! only a narrow key range, then we create a new set of image files
//! instead. The current threshold is that if the estimated total size of the
//! image layers is smaller than the size of the deltas, then we create image
//! layers. That amounts to 2x storage amplification, and it means that the
//! distance of image layers in LSN dimension is roughly equal to the logical
//! database size. For example, if the logical database size is 10 GB, we would
//! generate new image layers every 10 GB of WAL.
use futures::StreamExt;
use tracing::{debug, info};
use std::collections::{HashSet, VecDeque};
use std::ops::Range;
use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
use crate::interface::*;
use utils::lsn::Lsn;
use crate::identify_levels::identify_level;
/// Main entry point to compaction.
///
/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on
/// everything below that point, that needs compaction. The cutoff LSN must
/// partition the layers so that there are no layers that span across that
/// LSN. To start compaction at the top of the tree, pass the end LSN of the
/// written last L0 layer.
pub async fn compact_tiered<E: CompactionJobExecutor>(
executor: &mut E,
end_lsn: Lsn,
target_file_size: u64,
fanout: u64,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
assert!(fanout >= 2);
// Start at L0
let mut current_level_no = 0;
let mut current_level_target_height = target_file_size;
loop {
// end LSN +1 to include possible image layers exactly at 'end_lsn'.
let all_layers = executor
.get_layers(
&(E::Key::MIN..E::Key::MAX),
&(Lsn(u64::MIN)..end_lsn + 1),
ctx,
)
.await?;
info!(
"Compacting L{}, total # of layers: {}",
current_level_no,
all_layers.len()
);
// Identify the range of LSNs that belong to this level. We assume that
// each file in this level span an LSN range up to 1.75x target file
// size. That should give us enough slop that if we created a slightly
// oversized L0 layer, e.g. because flushing the in-memory layer was
// delayed for some reason, we don't consider the oversized layer to
// belong to L1. But not too much slop, that we don't accidentally
// "skip" levels.
let max_height = (current_level_target_height as f64 * 1.75) as u64;
let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
break;
};
// Calculate the height of this level. If the # of tiers exceeds the
// fanout parameter, it's time to compact it.
let depth = level.depth();
info!(
"Level {} identified as LSN range {}-{}: depth {}",
current_level_no, level.lsn_range.start, level.lsn_range.end, depth
);
for l in &level.layers {
debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
}
if depth < fanout {
debug!(
level = current_level_no,
depth = depth,
fanout,
"too few deltas to compact"
);
break;
}
compact_level(
&level.lsn_range,
&level.layers,
executor,
target_file_size,
ctx,
)
.await?;
if target_file_size == u64::MAX {
break;
}
current_level_no += 1;
current_level_target_height = current_level_target_height.saturating_mul(fanout);
}
Ok(())
}
async fn compact_level<E: CompactionJobExecutor>(
lsn_range: &Range<Lsn>,
layers: &[E::Layer],
executor: &mut E,
target_file_size: u64,
ctx: &E::RequestContext,
) -> anyhow::Result<bool> {
let mut layer_fragments = Vec::new();
for l in layers {
layer_fragments.push(LayerFragment::new(l.clone()));
}
let mut state = LevelCompactionState {
target_file_size,
_lsn_range: lsn_range.clone(),
layers: layer_fragments,
jobs: Vec::new(),
job_queue: Vec::new(),
next_level: false,
executor,
};
let first_job = CompactionJob {
key_range: E::Key::MIN..E::Key::MAX,
lsn_range: lsn_range.clone(),
strategy: CompactionStrategy::Divide,
input_layers: state
.layers
.iter()
.enumerate()
.map(|i| LayerId(i.0))
.collect(),
completed: false,
};
state.jobs.push(first_job);
state.job_queue.push(JobId(0));
state.execute(ctx).await?;
info!(
"compaction completed! Need to process next level: {}",
state.next_level
);
Ok(state.next_level)
}
/// Blackboard that keeps track of the state of all the jobs and work remaining
struct LevelCompactionState<'a, E>
where
E: CompactionJobExecutor,
{
// parameters
target_file_size: u64,
_lsn_range: Range<Lsn>,
layers: Vec<LayerFragment<E>>,
// job queue
jobs: Vec<CompactionJob<E>>,
job_queue: Vec<JobId>,
/// If false, no need to compact levels below this
next_level: bool,
/// Interface to the outside world
executor: &'a mut E,
}
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct LayerId(usize);
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct JobId(usize);
struct PendingJobSet {
pending: HashSet<JobId>,
completed: HashSet<JobId>,
}
impl PendingJobSet {
fn new() -> Self {
PendingJobSet {
pending: HashSet::new(),
completed: HashSet::new(),
}
}
fn complete_job(&mut self, job_id: JobId) {
self.pending.remove(&job_id);
self.completed.insert(job_id);
}
fn all_completed(&self) -> bool {
self.pending.is_empty()
}
}
// When we decide to rewrite a set of layers, LayerFragment is used to keep
// track which new layers supersede an old layer. When all the stakeholder jobs
// have completed, this layer can be deleted.
struct LayerFragment<E>
where
E: CompactionJobExecutor,
{
layer: E::Layer,
// If we will write new layers to replace this one, this keeps track of the
// jobs that need to complete before this layer can be deleted. As the jobs
// complete, they are moved from 'pending' to 'completed' set. Once the
// 'pending' set becomes empty, the layer can be deleted.
//
// If None, this layer is not rewritten and must not be deleted.
deletable_after: Option<PendingJobSet>,
deleted: bool,
}
impl<E> LayerFragment<E>
where
E: CompactionJobExecutor,
{
fn new(layer: E::Layer) -> Self {
LayerFragment {
layer,
deletable_after: None,
deleted: false,
}
}
}
#[derive(PartialEq)]
enum CompactionStrategy {
Divide,
CreateDelta,
CreateImage,
}
#[allow(dead_code)] // Todo
struct CompactionJob<E: CompactionJobExecutor> {
key_range: Range<E::Key>,
lsn_range: Range<Lsn>,
strategy: CompactionStrategy,
input_layers: Vec<LayerId>,
completed: bool,
}
impl<'a, E> LevelCompactionState<'a, E>
where
E: CompactionJobExecutor,
{
/// Main loop of the executor.
///
/// In each iteration, we take the next job from the queue, and execute it.
/// The execution might add new jobs to the queue. Keep going until the
/// queue is empty.
///
/// Initially, the job queue consists of one Divide job over the whole
/// level. On first call, it is divided into smaller jobs.
async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
// TODO: this would be pretty straightforward to parallelize with FuturesUnordered
while let Some(next_job_id) = self.job_queue.pop() {
info!("executing job {}", next_job_id.0);
self.execute_job(next_job_id, ctx).await?;
}
// all done!
Ok(())
}
async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
match job.strategy {
CompactionStrategy::Divide => {
self.divide_job(job_id, ctx).await?;
Ok(())
}
CompactionStrategy::CreateDelta => {
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
let mut layer_ids: Vec<LayerId> = Vec::new();
for layer_id in &job.input_layers {
let layer = &self.layers[layer_id.0].layer;
if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
deltas.push(dl.clone());
layer_ids.push(*layer_id);
}
}
self.executor
.create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
.await?;
self.jobs[job_id.0].completed = true;
// did we complete any fragments?
for layer_id in layer_ids {
let l = &mut self.layers[layer_id.0];
if let Some(deletable_after) = l.deletable_after.as_mut() {
deletable_after.complete_job(job_id);
if deletable_after.all_completed() {
self.executor.delete_layer(&l.layer, ctx).await?;
l.deleted = true;
}
}
}
self.next_level = true;
Ok(())
}
CompactionStrategy::CreateImage => {
self.executor
.create_image(job.lsn_range.end, &job.key_range, ctx)
.await?;
self.jobs[job_id.0].completed = true;
// TODO: we could check if any layers < PITR horizon became deletable
Ok(())
}
}
}
fn push_job(&mut self, job: CompactionJob<E>) -> JobId {
let job_id = JobId(self.jobs.len());
self.jobs.push(job);
self.job_queue.push(job_id);
job_id
}
/// Take a partition of the key space, and decide how to compact it.
///
/// TODO: Currently, this is called exactly once for the level, and we
/// decide whether to create new image layers to cover the whole level, or
/// write a new set of delta. In the future, this should try to partition
/// the key space, and make the decision separately for each partition.
async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Check for dummy cases
if job.input_layers.is_empty() {
return Ok(());
}
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Would it be better to create images for this partition?
// Decide based on the average density of the level
let keyspace_size = keyspace_total_size(
&self
.executor
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?,
) * 8192;
let wal_size = job
.input_layers
.iter()
.filter(|layer_id| self.layers[layer_id.0].layer.is_delta())
.map(|layer_id| self.layers[layer_id.0].layer.file_size())
.sum::<u64>();
if keyspace_size < wal_size {
// seems worth it
info!(
"covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}",
keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size
);
self.cover_with_images(job_id, ctx).await
} else {
// do deltas
info!(
"coverage not worth it, keyspace_size {}, wal_size {}",
keyspace_size, wal_size
);
self.retile_deltas(job_id, ctx).await
}
}
// LSN
// ^
// |
// | ###|###|#####
// | +--+-----+--+ +--+-----+--+
// | | | | | | | | |
// | +--+--+--+--+ +--+--+--+--+
// | | | | | | |
// | +---+-+-+---+ ==> +---+-+-+---+
// | | | | | | | | |
// | +---+-+-++--+ +---+-+-++--+
// | | | | | | | | |
// | +-----+--+--+ +-----+--+--+
// |
// +--------------> key
//
async fn cover_with_images(
&mut self,
job_id: JobId,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// XXX: do we still need the "holes" stuff?
let mut new_jobs = Vec::new();
// Slide a window through the keyspace
let keyspace = self
.executor
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?;
let mut window = KeyspaceWindow::new(
E::Key::MIN..E::Key::MAX,
keyspace,
self.target_file_size / 8192,
);
while let Some(key_range) = window.choose_next_image() {
new_jobs.push(CompactionJob::<E> {
key_range,
lsn_range: job.lsn_range.clone(),
strategy: CompactionStrategy::CreateImage,
input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer?
completed: false,
});
}
for j in new_jobs.into_iter().rev() {
let _job_id = self.push_job(j);
// TODO: image layers don't let us delete anything. unless < PITR horizon
//let j = &self.jobs[job_id.0];
// for layer_id in j.input_layers.iter() {
// self.layers[layer_id.0].pending_stakeholders.insert(job_id);
//}
}
Ok(())
}
// Merge the contents of all the input delta layers into a new set
// of delta layers, based on the current partitioning.
//
// We split the new delta layers on the key dimension. We iterate through
// the key space, and for each key, check if including the next key to the
// current output layer we're building would cause the layer to become too
// large. If so, dump the current output layer and start new one. It's
// possible that there is a single key with so many page versions that
// storing all of them in a single layer file would be too large. In that
// case, we also split on the LSN dimension.
//
// LSN
// ^
// |
// | +-----------+ +--+--+--+--+
// | | | | | | | |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ ==> | | | | |
// | | | | | | | |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ +--+--+--+--+
// |
// +--------------> key
//
//
// If one key (X) has a lot of page versions:
//
// LSN
// ^
// | (X)
// | +-----------+ +--+--+--+--+
// | | | | | | | |
// | +-----------+ | | +--+ |
// | | | | | | | |
// | +-----------+ ==> | | | | |
// | | | | | +--+ |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ +--+--+--+--+
// |
// +--------------> key
//
// TODO: this actually divides the layers into fixed-size chunks, not
// based on the partitioning.
//
// TODO: we should also opportunistically materialize and
// garbage collect what we can.
async fn retile_deltas(
&mut self,
job_id: JobId,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Sweep the key space left to right, running an estimate of how much
// disk size and keyspace we have accumulated
//
// Once the disk size reaches the target threshold, stop and think.
// If we have accumulated only a narrow band of keyspace, create an
// image layer. Otherwise write a delta layer.
// FIXME: deal with the case of lots of values for same key
// FIXME: we are ignoring images here. Did we already divide the work
// so that we won't encounter them here?
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
for layer_id in &job.input_layers {
let l = &self.layers[layer_id.0];
if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
deltas.push(dl.clone());
}
}
// Open stream
let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
let mut new_jobs = Vec::new();
// Slide a window through the keyspace
let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
let mut all_in_window: bool = false;
let mut window = Window::new();
loop {
if all_in_window && window.elems.is_empty() {
// All done!
break;
}
if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
{
let batch_layers: Vec<LayerId> = job
.input_layers
.iter()
.filter(|layer_id| {
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
})
.cloned()
.collect();
assert!(!batch_layers.is_empty());
new_jobs.push(CompactionJob {
key_range,
lsn_range: job.lsn_range.clone(),
strategy: CompactionStrategy::CreateDelta,
input_layers: batch_layers,
completed: false,
});
} else {
assert!(!all_in_window);
if let Some(next_key) = key_accum.next().await.transpose()? {
window.feed(next_key.key, next_key.size);
} else {
all_in_window = true;
}
}
}
// All the input files are rewritten. Set up the tracking for when they can
// be deleted.
for layer_id in job.input_layers.iter() {
let l = &mut self.layers[layer_id.0];
assert!(l.deletable_after.is_none());
l.deletable_after = Some(PendingJobSet::new());
}
for j in new_jobs.into_iter().rev() {
let job_id = self.push_job(j);
let j = &self.jobs[job_id.0];
for layer_id in j.input_layers.iter() {
self.layers[layer_id.0]
.deletable_after
.as_mut()
.unwrap()
.pending
.insert(job_id);
}
}
Ok(())
}
}
// Sliding window through keyspace and values
// This is used by over_with_images to decide on good split points
struct KeyspaceWindow<K> {
head: KeyspaceWindowHead<K>,
start_pos: KeyspaceWindowPos<K>,
}
struct KeyspaceWindowHead<K> {
// overall key range to cover
key_range: Range<K>,
keyspace: Vec<Range<K>>,
target_keysize: u64,
}
#[derive(Clone)]
struct KeyspaceWindowPos<K> {
end_key: K,
keyspace_idx: usize,
accum_keysize: u64,
}
impl<K: CompactionKey> KeyspaceWindowPos<K> {
fn reached_end(&self, w: &KeyspaceWindowHead<K>) -> bool {
self.keyspace_idx == w.keyspace.len()
}
// Advance the cursor until it reaches 'target_keysize'.
fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
while self.accum_keysize < max_size && !self.reached_end(w) {
let curr_range = &w.keyspace[self.keyspace_idx];
if self.end_key < curr_range.start {
// skip over any unused space
self.end_key = curr_range.start;
}
// We're now within 'curr_range'. Can we advance past it completely?
let distance = K::key_range_size(&(self.end_key..curr_range.end));
if (self.accum_keysize + distance as u64) < max_size {
// oh yeah, it fits
self.end_key = curr_range.end;
self.keyspace_idx += 1;
self.accum_keysize += distance as u64;
} else {
// advance within the range
let skip_key = self.end_key.skip_some();
let distance = K::key_range_size(&(self.end_key..skip_key));
if (self.accum_keysize + distance as u64) < max_size {
self.end_key = skip_key;
self.accum_keysize += distance as u64;
} else {
self.end_key = self.end_key.next();
self.accum_keysize += 1;
}
}
}
}
}
impl<K> KeyspaceWindow<K>
where
K: CompactionKey,
{
fn new(key_range: Range<K>, keyspace: CompactionKeySpace<K>, target_keysize: u64) -> Self {
assert!(keyspace.first().unwrap().start >= key_range.start);
let start_key = key_range.start;
let start_pos = KeyspaceWindowPos::<K> {
end_key: start_key,
keyspace_idx: 0,
accum_keysize: 0,
};
Self {
head: KeyspaceWindowHead::<K> {
key_range,
keyspace,
target_keysize,
},
start_pos,
}
}
fn choose_next_image(&mut self) -> Option<Range<K>> {
if self.start_pos.keyspace_idx == self.head.keyspace.len() {
// we've reached the end
return None;
}
let mut next_pos = self.start_pos.clone();
next_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + self.head.target_keysize,
);
// See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
// 1.25x target size
let mut end_pos = next_pos.clone();
end_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
);
if end_pos.reached_end(&self.head) {
// gobble up any unused keyspace between the last used key and end of the range
assert!(end_pos.end_key <= self.head.key_range.end);
end_pos.end_key = self.head.key_range.end;
next_pos = end_pos;
}
let start_key = self.start_pos.end_key;
self.start_pos = next_pos;
Some(start_key..self.start_pos.end_key)
}
}
// Sliding window through keyspace and values
//
// This is used to decide what layer to write next, from the beginning of the window.
//
// Candidates:
//
// 1. Create an image layer, snapping to previous images
// 2. Create a delta layer, snapping to previous images
// 3. Create an image layer, snapping to
//
//
// Take previous partitioning, based on the image layers below.
//
// Candidate is at the front:
//
// Consider stretching an image layer to next divider? If it's close enough,
// that's the image candidate
//
// If it's too far, consider splitting at a reasonable point
//
// Is the image candidate smaller than the equivalent delta? If so,
// split off the image. Otherwise, split off one delta.
// Try to snap off the delta at a reasonable point
struct WindowElement<K> {
start_key: K, // inclusive
last_key: K, // inclusive
accum_size: u64,
}
struct Window<K> {
elems: VecDeque<WindowElement<K>>,
// last key that was split off, inclusive
splitoff_key: Option<K>,
splitoff_size: u64,
}
impl<K> Window<K>
where
K: CompactionKey,
{
fn new() -> Self {
Self {
elems: VecDeque::new(),
splitoff_key: None,
splitoff_size: 0,
}
}
fn feed(&mut self, key: K, size: u64) {
let last_size;
if let Some(last) = self.elems.back_mut() {
assert!(last.last_key <= key);
if key == last.last_key {
last.accum_size += size;
return;
}
last_size = last.accum_size;
} else {
last_size = 0;
}
// This is a new key.
let elem = WindowElement {
start_key: key,
last_key: key,
accum_size: last_size + size,
};
self.elems.push_back(elem);
}
fn remain_size(&self) -> u64 {
self.elems.back().unwrap().accum_size - self.splitoff_size
}
fn peek_size(&self) -> u64 {
self.elems.front().unwrap().accum_size - self.splitoff_size
}
fn commit_upto(&mut self, mut upto: usize) {
while upto > 1 {
let popped = self.elems.pop_front().unwrap();
self.elems.front_mut().unwrap().start_key = popped.start_key;
upto -= 1;
}
}
fn find_size_split(&self, target_size: u64) -> usize {
self.elems
.partition_point(|elem| elem.accum_size - self.splitoff_size < target_size)
}
fn pop(&mut self) {
let first = self.elems.pop_front().unwrap();
self.splitoff_size = first.accum_size;
self.splitoff_key = Some(first.last_key);
}
// the difference between delta and image is that an image covers
// any unused keyspace before and after, while a delta tries to
// minimize that. TODO: difference not implemented
fn pop_delta(&mut self) -> Range<K> {
let first = self.elems.front().unwrap();
let key_range = first.start_key..first.last_key.next();
self.pop();
key_range
}
// Prerequisite: we have enough input in the window
//
// On return None, the caller should feed more data and call again
fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option<Range<K>> {
if has_more && self.elems.is_empty() {
// Starting up
return None;
}
// If we still have an undersized candidate, just keep going
while self.peek_size() < target_size {
if self.elems.len() > 1 {
self.commit_upto(2);
} else if has_more {
return None;
} else {
break;
}
}
// Ensure we have enough input in the window to make a good decision
if has_more && self.remain_size() < target_size * 5 / 4 {
return None;
}
// The candidate on the front is now large enough, for a delta.
// And we have enough data in the window to decide.
// If we're willing to stretch it up to 1.25 target size, could we
// gobble up the rest of the work? This avoids creating very small
// "tail" layers at the end of the keyspace
if !has_more && self.remain_size() < target_size * 5 / 3 {
self.commit_upto(self.elems.len());
} else {
let delta_split_at = self.find_size_split(target_size);
self.commit_upto(delta_split_at);
// If it's still not large enough, request the caller to fill the window
if self.elems.len() == 1 && has_more {
return None;
}
}
Some(self.pop_delta())
}
}

View File

@@ -1,243 +0,0 @@
//! This file contains generic utility functions over the interface types,
//! which could be handy for any compaction implementation.
use crate::interface::*;
use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pin_project_lite::pin_project;
use std::cmp::Ord;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::future::Future;
use std::ops::{DerefMut, Range};
use std::pin::Pin;
use std::task::{ready, Poll};
pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
where
K: CompactionKey,
{
keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
}
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
!(a.end <= b.start || b.end <= a.start)
}
pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
let x = std::mem::take(a);
let mut all_ranges_iter = [x.into_iter(), b.into_iter()]
.into_iter()
.kmerge_by(|a, b| a.start < b.start);
let mut ranges = Vec::new();
if let Some(first) = all_ranges_iter.next() {
let (mut start, mut end) = (first.start, first.end);
for r in all_ranges_iter {
assert!(r.start >= start);
if r.start > end {
ranges.push(start..end);
start = r.start;
end = r.end;
} else if r.end > end {
end = r.end;
}
}
ranges.push(start..end);
}
*a = ranges
}
pub fn intersect_keyspace<K: Ord + Clone + Copy>(
a: &CompactionKeySpace<K>,
r: &Range<K>,
) -> CompactionKeySpace<K> {
let mut ranges: Vec<Range<K>> = Vec::new();
for x in a.iter() {
if x.end <= r.start {
continue;
}
if x.start >= r.end {
break;
}
ranges.push(x.clone())
}
// trim the ends
if let Some(first) = ranges.first_mut() {
first.start = std::cmp::max(first.start, r.start);
}
if let Some(last) = ranges.last_mut() {
last.end = std::cmp::min(last.end, r.end);
}
ranges
}
/// Create a stream that iterates through all DeltaEntrys among all input
/// layers, in key-lsn order.
///
/// This is public because the create_delta() implementation likely wants to use this too
/// TODO: move to a more shared place
pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
layers: &'a [E::DeltaLayer],
ctx: &'a E::RequestContext,
) -> MergeDeltaKeys<'a, E> {
// Use a binary heap to merge the layers. Each input layer is initially
// represented by a LazyLoadLayer::Unloaded element, which uses the start of
// the layer's key range as the key. The first time a layer reaches the top
// of the heap, all the keys of the layer are loaded into a sorted vector.
//
// This helps to keep the memory usage reasonable: we only need to hold in
// memory the DeltaEntrys of the layers that overlap with the "current" key.
let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
for l in layers {
heap.push(LazyLoadLayer::Unloaded(l));
}
MergeDeltaKeys {
heap,
ctx,
load_future: None,
}
}
enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
Unloaded(&'a E::DeltaLayer),
}
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
fn key(&self) -> E::Key {
match self {
Self::Loaded(entries) => entries.front().unwrap().key(),
Self::Unloaded(dl) => dl.key_range().start,
}
}
}
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
// reverse order so that we get a min-heap
other.key().cmp(&self.key())
}
}
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
fn eq(&self, other: &Self) -> bool {
self.key().eq(&other.key())
}
}
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
// Stream returned by `merge_delta_keys`
pin_project! {
#[allow(clippy::type_complexity)]
pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
heap: BinaryHeap<LazyLoadLayer<'a, E>>,
#[pin]
load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
ctx: &'a E::RequestContext,
}
}
impl<'a, E> Stream for MergeDeltaKeys<'a, E>
where
E: CompactionJobExecutor + 'a,
{
type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
fn poll_next(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
let mut this = self.project();
loop {
if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
// We are waiting for loading the keys to finish
match ready!(load_future.as_mut().poll(cx)) {
Ok(entries) => {
this.load_future.set(None);
*this.heap.peek_mut().unwrap() =
LazyLoadLayer::Loaded(VecDeque::from(entries));
}
Err(e) => {
return Poll::Ready(Some(Err(e)));
}
}
}
// If the topmost layer in the heap hasn't been loaded yet, start
// loading it. Otherwise return the next entry from it and update
// the layer's position in the heap (this decreaseKey operation is
// performed implicitly when `top` is dropped).
if let Some(mut top) = this.heap.peek_mut() {
match top.deref_mut() {
LazyLoadLayer::Unloaded(ref mut l) => {
let fut = l.load_keys(this.ctx);
this.load_future.set(Some(fut));
continue;
}
LazyLoadLayer::Loaded(ref mut entries) => {
let result = entries.pop_front().unwrap();
if entries.is_empty() {
std::collections::binary_heap::PeekMut::pop(top);
}
return Poll::Ready(Some(Ok(result)));
}
}
} else {
return Poll::Ready(None);
}
}
}
}
// Accumulate values at key boundaries
pub struct KeySize<K> {
pub key: K,
pub num_values: u64,
pub size: u64,
}
pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
where
K: Eq,
I: Stream<Item = Result<D, E>>,
D: CompactionDeltaEntry<'a, K>,
{
async_stream::try_stream! {
// Initialize the state from the first value
let mut input = std::pin::pin!(input);
if let Some(first) = input.next().await {
let first = first?;
let mut accum: KeySize<K> = KeySize {
key: first.key(),
num_values: 1,
size: first.size(),
};
while let Some(this) = input.next().await {
let this = this?;
if this.key() == accum.key {
accum.size += this.size();
accum.num_values += 1;
} else {
yield accum;
accum = KeySize {
key: this.key(),
num_values: 1,
size: this.size(),
};
}
}
yield accum;
}
}
}

View File

@@ -1,376 +0,0 @@
//! An LSM tree consists of multiple levels, each exponential larger than the
//! previous level. And each level consists of be multiple "tiers". With tiered
//! compaction, a level is compacted when it has accumulated more than N tiers,
//! forming one tier on the next level.
//!
//! In the pageserver, we don't explicitly track the levels and tiers. Instead,
//! we identify them by looking at the shapes of the layers. It's an easy task
//! for a human, but it's not straightforward to come up with the exact
//! rules. Especially if there are cases like interrupted, half-finished
//! compactions, or highly skewed data distributions that have let us "skip"
//! some levels. It's not critical to classify all cases correctly; at worst we
//! delay some compaction work, and suffer from more read amplification, or we
//! perform some unnecessary compaction work.
//!
//! `identify_level` performs that shape-matching.
//!
//! It returns a Level struct, which has `depth()` function to count the number
//! of "tiers" in the level. The tier count is the max depth of stacked layers
//! within the level. That's a good measure, because the point of compacting is
//! to reduce read amplification, and the depth is what determines that.
//!
//! One interesting effect of this is that if we generate very small delta
//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
//! because they reach the target size, the L0 compaction will combine them to
//! one larger file. But if the combined file is still smaller than the target
//! file size, the file will still be considered to be part of L0 at the next
//! iteration.
use anyhow::bail;
use std::collections::BTreeSet;
use std::ops::Range;
use utils::lsn::Lsn;
use crate::interface::*;
use tracing::{info, trace};
pub struct Level<L> {
pub lsn_range: Range<Lsn>,
pub layers: Vec<L>,
}
/// Identify an LSN > `end_lsn` that partitions the LSN space, so that there are
/// no layers that cross the boundary LSN.
///
/// A further restriction is that all layers in the returned partition cover at
/// most 'lsn_max_size' LSN bytes.
pub async fn identify_level<K, L>(
all_layers: Vec<L>,
end_lsn: Lsn,
lsn_max_size: u64,
) -> anyhow::Result<Option<Level<L>>>
where
K: CompactionKey,
L: CompactionLayer<K> + Clone,
{
// filter out layers that are above the `end_lsn`, they are completely irrelevant.
let mut layers = Vec::new();
for l in all_layers {
if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
// shouldn't happen. Indicates that the caller passed a bogus
// end_lsn.
bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
}
// include image layers sitting exacty at `end_lsn`.
let is_image = !l.is_delta();
if (is_image && l.lsn_range().start > end_lsn)
|| (!is_image && l.lsn_range().start >= end_lsn)
{
continue;
}
layers.push(l);
}
// All the remaining layers either belong to this level, or are below it.
info!(
"identify level at {}, size {}, num layers below: {}",
end_lsn,
lsn_max_size,
layers.len()
);
if layers.is_empty() {
return Ok(None);
}
// Walk the ranges in LSN order.
//
// ----- end_lsn
// |
// |
// v
//
layers.sort_by_key(|l| l.lsn_range().end);
let mut candidate_start_lsn = end_lsn;
let mut candidate_layers: Vec<L> = Vec::new();
let mut current_best_start_lsn = end_lsn;
let mut current_best_layers: Vec<L> = Vec::new();
let mut iter = layers.into_iter();
loop {
let Some(l) = iter.next_back() else {
// Reached end. Accept the last candidate
current_best_start_lsn = candidate_start_lsn;
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
break;
};
trace!(
"inspecting {} for candidate {}, current best {}",
l.short_id(),
candidate_start_lsn,
current_best_start_lsn
);
let r = l.lsn_range();
// Image layers don't restrict our choice of cutoff LSN
if l.is_delta() {
// Is this candidate workable? In other words, are there any
// delta layers that span across this LSN
//
// Valid: Not valid:
// + +
// | | +
// + <- candidate + | <- candidate
// + +
// |
// +
if r.end <= candidate_start_lsn {
// Hooray, there are no crossing LSNs. And we have visited
// through all the layers within candidate..end_lsn. The
// current candidate can be accepted.
current_best_start_lsn = r.end;
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
candidate_start_lsn = r.start;
}
// Is it small enough to be considered part of this level?
if r.end.0 - r.start.0 > lsn_max_size {
// Too large, this layer belongs to next level. Stop.
trace!(
"too large {}, size {} vs {}",
l.short_id(),
r.end.0 - r.start.0,
lsn_max_size
);
break;
}
// If this crosses the candidate lsn, push it down.
if r.start < candidate_start_lsn {
trace!(
"layer {} prevents from stopping at {}",
l.short_id(),
candidate_start_lsn
);
candidate_start_lsn = r.start;
}
}
// Include this layer in our candidate
candidate_layers.push(l);
}
Ok(if current_best_start_lsn == end_lsn {
// empty level
None
} else {
Some(Level {
lsn_range: current_best_start_lsn..end_lsn,
layers: current_best_layers,
})
})
}
// helper struct used in depth()
struct Event<K> {
key: K,
layer_idx: usize,
start: bool,
}
impl<L> Level<L> {
/// Count the number of deltas stacked on each other.
pub fn depth<K>(&self) -> u64
where
K: CompactionKey,
L: CompactionLayer<K>,
{
let mut events: Vec<Event<K>> = Vec::new();
for (idx, l) in self.layers.iter().enumerate() {
events.push(Event {
key: l.key_range().start,
layer_idx: idx,
start: true,
});
events.push(Event {
key: l.key_range().end,
layer_idx: idx,
start: false,
});
}
events.sort_by_key(|e| (e.key, e.start));
// Sweep the key space left to right. Stop at each distinct key, and
// count the number of deltas on top of the highest image at that key.
//
// This is a little enefficient, as we walk through the active_set on
// every key. We could increment/decrement a counter on each step
// instead, but that'd require a bit more complex bookkeeping.
let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
let mut max_depth = 0;
let mut events_iter = events.iter().peekable();
while let Some(e) = events_iter.next() {
let l = &self.layers[e.layer_idx];
let is_image = !l.is_delta();
// update the active set
if e.start {
active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
} else {
active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
}
// recalculate depth if this was the last event at this point
let more_events_at_this_key = events_iter
.peek()
.map_or(false, |next_e| next_e.key == e.key);
if !more_events_at_this_key {
let mut active_depth = 0;
for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
if *is_image {
break;
}
active_depth += 1;
}
if active_depth > max_depth {
max_depth = active_depth;
}
}
}
max_depth
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
use std::sync::{Arc, Mutex};
fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
MockLayer::Delta(Arc::new(MockDeltaLayer {
key_range,
lsn_range,
// identify_level() doesn't pay attention to the rest of the fields
file_size: 0,
deleted: Mutex::new(false),
records: vec![],
}))
}
fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
MockLayer::Image(Arc::new(MockImageLayer {
key_range,
lsn_range: lsn..(lsn + 1),
// identify_level() doesn't pay attention to the rest of the fields
file_size: 0,
deleted: Mutex::new(false),
}))
}
#[tokio::test]
async fn test_identify_level() -> anyhow::Result<()> {
let layers = vec![
delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
];
// All layers fit in the max file size
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.depth(), 6);
// Same LSN with smaller max file size. The second layer from the top is larger
// and belongs to next level.
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 1);
// Call with a smaller LSN
let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 2);
// Call with an LSN that doesn't partition the space
let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
assert!(result.is_err());
Ok(())
}
#[tokio::test]
async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
// The files LSN ranges overlap, so even though there are more files that
// fit under the file size, they are not included in the level because they
// overlap so that we'd need to include the oldest file, too, which is
// larger
let layers = vec![
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 1);
Ok(())
}
#[tokio::test]
async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
// The key ranges don't overlap, so depth is only 1.
let layers = vec![
delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 3);
assert_eq!(level.depth(), 1);
// Staggered. The 1st and 3rd layer don't overlap with each other.
let layers = vec![
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 3);
assert_eq!(level.depth(), 2);
Ok(())
}
#[tokio::test]
async fn test_depth_images() -> anyhow::Result<()> {
let layers: Vec<MockLayer> = vec![
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
// This covers the same key range as the 2nd delta layer. The depth
// in that key range is therefore 0.
image(1500..2500, Lsn(0x9000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 4);
assert_eq!(level.depth(), 1);
Ok(())
}
}

View File

@@ -1,167 +0,0 @@
//! This is what the compaction implementation needs to know about
//! layers, keyspace etc.
//!
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
use async_trait::async_trait;
use pageserver_api::{key::Key, keyspace::key_range_size};
use std::ops::Range;
use utils::lsn::Lsn;
/// Public interface. This is the main thing that the implementor needs to provide
#[async_trait]
pub trait CompactionJobExecutor {
// Type system.
//
// We assume that there are two kinds of layers, deltas and images. The
// compaction doesn't distinguish whether they are stored locally or
// remotely.
//
// The keyspace is defined by CompactionKey trait.
//
type Key: CompactionKey;
type Layer: CompactionLayer<Self::Key> + Clone;
type DeltaLayer: CompactionDeltaLayer<Self> + Clone;
type ImageLayer: CompactionImageLayer<Self> + Clone;
// This is passed through to all the interface functions. The compaction
// implementation doesn't do anything with it, but it might be useful for
// the interface implementation.
type RequestContext: CompactionRequestContext;
// ----
// Functions that the planner uses to support its decisions
// ----
/// Return all layers that overlap the given bounding box.
async fn get_layers(
&mut self,
key_range: &Range<Self::Key>,
lsn_range: &Range<Lsn>,
ctx: &Self::RequestContext,
) -> anyhow::Result<Vec<Self::Layer>>;
async fn get_keyspace(
&mut self,
key_range: &Range<Self::Key>,
lsn: Lsn,
ctx: &Self::RequestContext,
) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
/// NB: This is a pretty expensive operation. In the real pageserver
/// implementation, it downloads the layer, and keeps it resident
/// until the DeltaLayer is dropped.
async fn downcast_delta_layer(
&self,
layer: &Self::Layer,
) -> anyhow::Result<Option<Self::DeltaLayer>>;
// ----
// Functions to execute the plan
// ----
/// Create a new image layer, materializing all the values in the key range,
/// at given 'lsn'.
async fn create_image(
&mut self,
lsn: Lsn,
key_range: &Range<Self::Key>,
ctx: &Self::RequestContext,
) -> anyhow::Result<()>;
/// Create a new delta layer, containing all the values from 'input_layers'
/// in the given key and LSN range.
async fn create_delta(
&mut self,
lsn_range: &Range<Lsn>,
key_range: &Range<Self::Key>,
input_layers: &[Self::DeltaLayer],
ctx: &Self::RequestContext,
) -> anyhow::Result<()>;
/// Delete a layer. The compaction implementation will call this only after
/// all the create_image() or create_delta() calls that deletion of this
/// layer depends on have finished. But if the implementor has extra lazy
/// background tasks, like uploading the index json file to remote storage,
/// it is the implementation's responsibility to track those.
async fn delete_layer(
&mut self,
layer: &Self::Layer,
ctx: &Self::RequestContext,
) -> anyhow::Result<()>;
}
pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
const MIN: Self;
const MAX: Self;
/// Calculate distance between key_range.start and key_range.end.
///
/// This returns u32, for compatibility with Repository::key. If the
/// distance is larger, return u32::MAX.
fn key_range_size(key_range: &Range<Self>) -> u32;
// return "self + 1"
fn next(&self) -> Self;
// return "self + <some decent amount to skip>". The amount to skip
// is left to the implementation.
// FIXME: why not just "add(u32)" ? This is hard to use
fn skip_some(&self) -> Self;
}
impl CompactionKey for Key {
const MIN: Self = Self::MIN;
const MAX: Self = Self::MAX;
fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
key_range_size(r)
}
fn next(&self) -> Key {
(self as &Key).next()
}
fn skip_some(&self) -> Key {
self.add(128)
}
}
/// Contiguous ranges of keys that belong to the key space. In key order, and
/// with no overlap.
pub type CompactionKeySpace<K> = Vec<Range<K>>;
/// Functions needed from all layers.
pub trait CompactionLayer<K: CompactionKey + ?Sized> {
fn key_range(&self) -> &Range<K>;
fn lsn_range(&self) -> &Range<Lsn>;
fn file_size(&self) -> u64;
/// For debugging, short human-readable representation of the layer. E.g. filename.
fn short_id(&self) -> String;
fn is_delta(&self) -> bool;
}
#[async_trait]
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
where
Self: 'a;
/// Return all keys in this delta layer.
async fn load_keys<'a>(
&self,
ctx: &E::RequestContext,
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
}
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
pub trait CompactionDeltaEntry<'a, K> {
fn key(&self) -> K;
fn lsn(&self) -> Lsn;
fn size(&self) -> u64;
}
pub trait CompactionRequestContext {}

View File

@@ -1,12 +0,0 @@
// The main module implementing the compaction algorithm
pub mod compact_tiered;
pub(crate) mod identify_levels;
// Traits that the caller of the compaction needs to implement
pub mod interface;
// Utility functions, useful for the implementation
pub mod helpers;
// A simulator with mock implementations of 'interface'
pub mod simulator;

View File

@@ -1,613 +0,0 @@
mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use async_trait::async_trait;
use futures::StreamExt;
use rand::Rng;
use tracing::info;
use utils::lsn::Lsn;
use std::fmt::Write;
use std::ops::Range;
use std::sync::Arc;
use std::sync::Mutex;
use crate::helpers::{merge_delta_keys, overlaps_with};
use crate::interface;
use crate::interface::CompactionLayer;
//
// Implementation for the CompactionExecutor interface
//
pub struct MockTimeline {
// Parameters for the compaction algorithm
pub target_file_size: u64,
tiers_per_level: u64,
num_l0_flushes: u64,
last_compact_at_flush: u64,
last_flush_lsn: Lsn,
// In-memory layer
records: Vec<MockRecord>,
total_len: u64,
start_lsn: Lsn,
end_lsn: Lsn,
// Current keyspace at `end_lsn`. This is updated on every ingested record.
keyspace: KeySpace,
// historic keyspaces
old_keyspaces: Vec<(Lsn, KeySpace)>,
// "on-disk" layers
pub live_layers: Vec<MockLayer>,
num_deleted_layers: u64,
// Statistics
wal_ingested: u64,
bytes_written: u64,
bytes_deleted: u64,
layers_created: u64,
layers_deleted: u64,
// All the events - creation and deletion of files - are collected
// in 'history'. It is used to draw the SVG animation at the end.
time: u64,
history: Vec<draw::LayerTraceEvent>,
}
type KeySpace = interface::CompactionKeySpace<Key>;
pub struct MockRequestContext {}
impl interface::CompactionRequestContext for MockRequestContext {}
pub type Key = u64;
impl interface::CompactionKey for Key {
const MIN: Self = u64::MIN;
const MAX: Self = u64::MAX;
fn key_range_size(key_range: &Range<Self>) -> u32 {
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
}
fn next(&self) -> Self {
self + 1
}
fn skip_some(&self) -> Self {
// round up to next xx
self + 100
}
}
#[derive(Clone)]
pub struct MockRecord {
lsn: Lsn,
key: Key,
len: u64,
}
impl interface::CompactionDeltaEntry<'_, Key> for MockRecord {
fn key(&self) -> Key {
self.key
}
fn lsn(&self) -> Lsn {
self.lsn
}
fn size(&self) -> u64 {
self.len
}
}
pub struct MockDeltaLayer {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub file_size: u64,
pub deleted: Mutex<bool>,
pub records: Vec<MockRecord>,
}
impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
fn key_range(&self) -> &Range<Key> {
&self.key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.lsn_range
}
fn file_size(&self) -> u64 {
self.file_size
}
fn short_id(&self) -> String {
format!(
"{:016X}-{:016X}__{:08X}-{:08X}",
self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0
)
}
fn is_delta(&self) -> bool {
true
}
}
#[async_trait]
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
type DeltaEntry<'a> = MockRecord;
async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result<Vec<MockRecord>> {
Ok(self.records.clone())
}
}
pub struct MockImageLayer {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub file_size: u64,
pub deleted: Mutex<bool>,
}
impl interface::CompactionImageLayer<MockTimeline> for Arc<MockImageLayer> {}
impl interface::CompactionLayer<Key> for Arc<MockImageLayer> {
fn key_range(&self) -> &Range<Key> {
&self.key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.lsn_range
}
fn file_size(&self) -> u64 {
self.file_size
}
fn short_id(&self) -> String {
format!(
"{:016X}-{:016X}__{:08X}",
self.key_range.start, self.key_range.end, self.lsn_range.start.0,
)
}
fn is_delta(&self) -> bool {
false
}
}
impl MockTimeline {
pub fn new() -> Self {
MockTimeline {
target_file_size: 256 * 1024 * 1024,
tiers_per_level: 4,
num_l0_flushes: 0,
last_compact_at_flush: 0,
last_flush_lsn: Lsn(0),
records: Vec::new(),
total_len: 0,
start_lsn: Lsn(1000),
end_lsn: Lsn(1000),
keyspace: KeySpace::new(),
old_keyspaces: vec![],
live_layers: vec![],
num_deleted_layers: 0,
wal_ingested: 0,
bytes_written: 0,
bytes_deleted: 0,
layers_created: 0,
layers_deleted: 0,
time: 0,
history: Vec::new(),
}
}
pub async fn compact(&mut self) -> anyhow::Result<()> {
let ctx = MockRequestContext {};
crate::compact_tiered::compact_tiered(
self,
self.last_flush_lsn,
self.target_file_size,
self.tiers_per_level,
&ctx,
)
.await?;
Ok(())
}
// Ingest one record to the timeline
pub fn ingest_record(&mut self, key: Key, len: u64) {
self.records.push(MockRecord {
lsn: self.end_lsn,
key,
len,
});
self.total_len += len;
self.end_lsn += len;
if self.total_len > self.target_file_size {
self.flush_l0();
}
}
pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> {
if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level {
self.compact().await?;
self.last_compact_at_flush = self.num_l0_flushes;
}
Ok(())
}
pub fn flush_l0(&mut self) {
if self.records.is_empty() {
return;
}
let mut records = std::mem::take(&mut self.records);
records.sort_by_key(|rec| rec.key);
let lsn_range = self.start_lsn..self.end_lsn;
let new_layer = Arc::new(MockDeltaLayer {
key_range: Key::MIN..Key::MAX,
lsn_range: lsn_range.clone(),
file_size: self.total_len,
records,
deleted: Mutex::new(false),
});
info!("flushed L0 layer {}", new_layer.short_id());
self.live_layers.push(MockLayer::from(&new_layer));
// reset L0
self.start_lsn = self.end_lsn;
self.total_len = 0;
self.records = Vec::new();
self.layers_created += 1;
self.bytes_written += new_layer.file_size;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::Flush,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
self.num_l0_flushes += 1;
self.last_flush_lsn = self.end_lsn;
}
// Ingest `num_records' records to the timeline, with random keys
// uniformly distributed in `key_range`
pub fn ingest_uniform(
&mut self,
num_records: u64,
len: u64,
key_range: &Range<Key>,
) -> anyhow::Result<()> {
crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]);
let mut rng = rand::thread_rng();
for _ in 0..num_records {
self.ingest_record(rng.gen_range(key_range.clone()), len);
self.wal_ingested += len;
}
Ok(())
}
pub fn stats(&self) -> anyhow::Result<String> {
let mut s = String::new();
writeln!(s, "STATISTICS:")?;
writeln!(
s,
"WAL ingested: {:>10} MB",
self.wal_ingested / (1024 * 1024)
)?;
writeln!(
s,
"size created: {:>10} MB",
self.bytes_written / (1024 * 1024)
)?;
writeln!(
s,
"size deleted: {:>10} MB",
self.bytes_deleted / (1024 * 1024)
)?;
writeln!(s, "files created: {:>10}", self.layers_created)?;
writeln!(s, "files deleted: {:>10}", self.layers_deleted)?;
writeln!(
s,
"write amp: {:>10.2}",
self.bytes_written as f64 / self.wal_ingested as f64
)?;
writeln!(
s,
"storage amp: {:>10.2}",
(self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64
)?;
Ok(s)
}
pub fn draw_history<W: std::io::Write>(&self, output: W) -> anyhow::Result<()> {
draw::draw_history(&self.history, output)
}
}
impl Default for MockTimeline {
fn default() -> Self {
Self::new()
}
}
#[derive(Clone)]
pub enum MockLayer {
Delta(Arc<MockDeltaLayer>),
Image(Arc<MockImageLayer>),
}
impl interface::CompactionLayer<Key> for MockLayer {
fn key_range(&self) -> &Range<Key> {
match self {
MockLayer::Delta(this) => this.key_range(),
MockLayer::Image(this) => this.key_range(),
}
}
fn lsn_range(&self) -> &Range<Lsn> {
match self {
MockLayer::Delta(this) => this.lsn_range(),
MockLayer::Image(this) => this.lsn_range(),
}
}
fn file_size(&self) -> u64 {
match self {
MockLayer::Delta(this) => this.file_size(),
MockLayer::Image(this) => this.file_size(),
}
}
fn short_id(&self) -> String {
match self {
MockLayer::Delta(this) => this.short_id(),
MockLayer::Image(this) => this.short_id(),
}
}
fn is_delta(&self) -> bool {
match self {
MockLayer::Delta(_) => true,
MockLayer::Image(_) => false,
}
}
}
impl MockLayer {
fn is_deleted(&self) -> bool {
let guard = match self {
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
MockLayer::Image(this) => this.deleted.lock().unwrap(),
};
*guard
}
fn mark_deleted(&self) {
let mut deleted_guard = match self {
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
MockLayer::Image(this) => this.deleted.lock().unwrap(),
};
assert!(!*deleted_guard, "layer already deleted");
*deleted_guard = true;
}
}
impl From<&Arc<MockDeltaLayer>> for MockLayer {
fn from(l: &Arc<MockDeltaLayer>) -> Self {
MockLayer::Delta(l.clone())
}
}
impl From<&Arc<MockImageLayer>> for MockLayer {
fn from(l: &Arc<MockImageLayer>) -> Self {
MockLayer::Image(l.clone())
}
}
#[async_trait]
impl interface::CompactionJobExecutor for MockTimeline {
type Key = Key;
type Layer = MockLayer;
type DeltaLayer = Arc<MockDeltaLayer>;
type ImageLayer = Arc<MockImageLayer>;
type RequestContext = MockRequestContext;
async fn get_layers(
&mut self,
key_range: &Range<Self::Key>,
lsn_range: &Range<Lsn>,
_ctx: &Self::RequestContext,
) -> anyhow::Result<Vec<Self::Layer>> {
// Clear any deleted layers from our vec
self.live_layers.retain(|l| !l.is_deleted());
let layers: Vec<MockLayer> = self
.live_layers
.iter()
.filter(|l| {
overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range)
})
.cloned()
.collect();
Ok(layers)
}
async fn get_keyspace(
&mut self,
key_range: &Range<Self::Key>,
_lsn: Lsn,
_ctx: &Self::RequestContext,
) -> anyhow::Result<interface::CompactionKeySpace<Key>> {
// find it in the levels
if self.old_keyspaces.is_empty() {
Ok(crate::helpers::intersect_keyspace(
&self.keyspace,
key_range,
))
} else {
// not implemented
// The mock implementation only allows requesting the
// keyspace at the level's end LSN. That's all that the
// current implementation needs.
panic!("keyspace not available for requested lsn");
}
}
async fn downcast_delta_layer(
&self,
layer: &MockLayer,
) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
Ok(match layer {
MockLayer::Delta(l) => Some(l.clone()),
MockLayer::Image(_) => None,
})
}
async fn create_image(
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let keyspace = self.get_keyspace(key_range, lsn, ctx).await?;
let mut accum_size: u64 = 0;
for r in keyspace {
accum_size += r.end - r.start;
}
let new_layer = Arc::new(MockImageLayer {
key_range: key_range.clone(),
lsn_range: lsn..lsn,
file_size: accum_size * 8192,
deleted: Mutex::new(false),
});
info!(
"created image layer, size {}: {}",
new_layer.file_size,
new_layer.short_id()
);
self.live_layers.push(MockLayer::Image(new_layer.clone()));
// update stats
self.bytes_written += new_layer.file_size;
self.layers_created += 1;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::CreateImage,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
Ok(())
}
async fn create_delta(
&mut self,
lsn_range: &Range<Lsn>,
key_range: &Range<Key>,
input_layers: &[Arc<MockDeltaLayer>],
ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let mut key_value_stream =
std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
let mut records: Vec<MockRecord> = Vec::new();
let mut total_len = 2;
while let Some(delta_entry) = key_value_stream.next().await {
let delta_entry: MockRecord = delta_entry?;
if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
total_len += delta_entry.len;
records.push(delta_entry);
}
}
let total_records = records.len();
let new_layer = Arc::new(MockDeltaLayer {
key_range: key_range.clone(),
lsn_range: lsn_range.clone(),
file_size: total_len,
records,
deleted: Mutex::new(false),
});
info!(
"created delta layer, recs {}, size {}: {}",
total_records,
total_len,
new_layer.short_id()
);
self.live_layers.push(MockLayer::Delta(new_layer.clone()));
// update stats
self.bytes_written += total_len;
self.layers_created += 1;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::CreateDelta,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
Ok(())
}
async fn delete_layer(
&mut self,
layer: &Self::Layer,
_ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let layer = std::pin::pin!(layer);
info!("deleting layer: {}", layer.short_id());
self.num_deleted_layers += 1;
self.bytes_deleted += layer.file_size();
layer.mark_deleted();
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::Delete,
file: LayerTraceFile {
filename: layer.short_id(),
key_range: layer.key_range().clone(),
lsn_range: layer.lsn_range().clone(),
},
});
Ok(())
}
}

View File

@@ -1,411 +0,0 @@
use super::Key;
use anyhow::Result;
use std::cmp::Ordering;
use std::{
collections::{BTreeMap, BTreeSet, HashSet},
fmt::Write,
ops::Range,
};
use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
use utils::lsn::Lsn;
// Map values to their compressed coordinate - the index the value
// would have in a sorted and deduplicated list of all values.
struct CoordinateMap<T: Ord + Copy> {
map: BTreeMap<T, usize>,
stretch: f32,
}
impl<T: Ord + Copy> CoordinateMap<T> {
fn new(coords: Vec<T>, stretch: f32) -> Self {
let set: BTreeSet<T> = coords.into_iter().collect();
let mut map: BTreeMap<T, usize> = BTreeMap::new();
for (i, e) in set.iter().enumerate() {
map.insert(*e, i);
}
Self { map, stretch }
}
// This assumes that the map contains an exact point for this.
// Use map_inexact for values inbetween
fn map(&self, val: T) -> f32 {
*self.map.get(&val).unwrap() as f32 * self.stretch
}
// the value is still assumed to be within the min/max bounds
// (this is currently unused)
fn _map_inexact(&self, val: T) -> f32 {
let prev = *self.map.range(..=val).next().unwrap().1;
let next = *self.map.range(val..).next().unwrap().1;
// interpolate
(prev as f32 + (next - prev) as f32) * self.stretch
}
fn max(&self) -> f32 {
self.map.len() as f32 * self.stretch
}
}
#[derive(PartialEq, Hash, Eq)]
pub enum LayerTraceOp {
Flush,
CreateDelta,
CreateImage,
Delete,
}
impl std::fmt::Display for LayerTraceOp {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
let op_str = match self {
LayerTraceOp::Flush => "flush",
LayerTraceOp::CreateDelta => "create_delta",
LayerTraceOp::CreateImage => "create_image",
LayerTraceOp::Delete => "delete",
};
f.write_str(op_str)
}
}
#[derive(PartialEq, Hash, Eq, Clone)]
pub struct LayerTraceFile {
pub filename: String,
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
}
impl LayerTraceFile {
fn is_image(&self) -> bool {
self.lsn_range.end == self.lsn_range.start
}
}
pub struct LayerTraceEvent {
pub time_rel: u64,
pub op: LayerTraceOp,
pub file: LayerTraceFile,
}
pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output: W) -> Result<()> {
let mut files: Vec<LayerTraceFile> = Vec::new();
for event in history {
files.push(event.file.clone());
}
let last_time_rel = history.last().unwrap().time_rel;
// Collect all coordinates
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
for f in files.iter() {
keys.push(f.key_range.start);
keys.push(f.key_range.end);
lsns.push(f.lsn_range.start);
lsns.push(f.lsn_range.end);
}
// Analyze
let key_map = CoordinateMap::new(keys, 2.0);
// Stretch out vertically for better visibility
let lsn_map = CoordinateMap::new(lsns, 3.0);
let mut svg = String::new();
// Draw
writeln!(
svg,
"{}",
BeginSvg {
w: key_map.max(),
h: lsn_map.max(),
}
)?;
let lsn_max = lsn_map.max();
// Sort the files by LSN, but so that image layers go after all delta layers
// The SVG is painted in the order the elements appear, and we want to draw
// image layers on top of the delta layers if they overlap
//
// (This could also be implemented via z coordinates: image layers get one z
// coord, delta layers get another z coord.)
let mut files_sorted: Vec<LayerTraceFile> = files.into_iter().collect();
files_sorted.sort_by(|a, b| {
if a.is_image() && !b.is_image() {
Ordering::Greater
} else if !a.is_image() && b.is_image() {
Ordering::Less
} else {
a.lsn_range.end.cmp(&b.lsn_range.end)
}
});
writeln!(svg, "<!-- layers -->")?;
let mut files_seen = HashSet::new();
for f in files_sorted {
if files_seen.contains(&f) {
continue;
}
let key_start = key_map.map(f.key_range.start);
let key_end = key_map.map(f.key_range.end);
let key_diff = key_end - key_start;
if key_start >= key_end {
panic!("Invalid key range {}-{}", key_start, key_end);
}
let lsn_start = lsn_map.map(f.lsn_range.start);
let lsn_end = lsn_map.map(f.lsn_range.end);
// Fill in and thicken rectangle if it's an
// image layer so that we can see it.
let mut style = Style::default();
style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
let y_start = lsn_max - lsn_start;
let y_end = lsn_max - lsn_end;
let x_margin = 0.25;
let y_margin = 0.5;
match f.lsn_range.start.cmp(&f.lsn_range.end) {
Ordering::Less => {
write!(
svg,
r#" <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
f.filename,
key_start + x_margin,
y_end + y_margin,
key_diff - x_margin * 2.0,
y_start - y_end - y_margin * 2.0,
1.0, // border_radius,
style,
)?;
write!(svg, "<title>{}</title>", f.filename)?;
writeln!(svg, "</rect>")?;
}
Ordering::Equal => {
//lsn_diff = 0.3;
//lsn_offset = -lsn_diff / 2.0;
//margin = 0.05;
style.fill = Fill::Color(rgb(0x80, 0, 0x80));
style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
write!(
svg,
r#" <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
f.filename,
key_start + x_margin,
y_end,
key_end - x_margin,
y_end,
style,
)?;
write!(
svg,
"<title>{}<br>{} - {}</title>",
f.filename, lsn_end, y_end
)?;
writeln!(svg, "</line>")?;
}
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
}
files_seen.insert(f);
}
let mut record_style = Style::default();
record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
record_style.stroke = Stroke::None;
writeln!(svg, "{}", EndSvg)?;
let mut layer_events_str = String::new();
let mut first = true;
for e in history {
if !first {
writeln!(layer_events_str, ",")?;
}
write!(
layer_events_str,
r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
e.time_rel, e.file.filename, e.op
)?;
first = false;
}
writeln!(layer_events_str)?;
writeln!(
output,
r#"<!DOCTYPE html>
<html>
<head>
<style>
/* Keep the slider pinned at top */
.topbar {{
display: block;
overflow: hidden;
background-color: lightgrey;
position: fixed;
top: 0;
width: 100%;
/* width: 500px; */
}}
.slidercontainer {{
float: left;
width: 50%;
margin-right: 200px;
}}
.slider {{
float: left;
width: 100%;
}}
.legend {{
width: 200px;
float: right;
}}
/* Main content */
.main {{
margin-top: 50px; /* Add a top margin to avoid content overlay */
}}
</style>
</head>
<body onload="init()">
<script type="text/javascript">
var layer_events = [{layer_events_str}]
let ticker;
function init() {{
for (let i = 0; i < layer_events.length; i++) {{
var layer = document.getElementById("layer_" + layer_events[i].filename);
layer.style.visibility = "hidden";
}}
last_layer_event = -1;
moveSlider(last_slider_pos)
}}
function startAnimation() {{
ticker = setInterval(animateStep, 100);
}}
function stopAnimation() {{
clearInterval(ticker);
}}
function animateStep() {{
if (last_layer_event < layer_events.length - 1) {{
var slider = document.getElementById("time-slider");
let prevPos = slider.value
let nextEvent = last_layer_event + 1
while (nextEvent <= layer_events.length - 1) {{
if (layer_events[nextEvent].time_rel > prevPos) {{
break;
}}
nextEvent += 1;
}}
let nextPos = layer_events[nextEvent].time_rel
slider.value = nextPos
moveSlider(nextPos)
}}
}}
function redoLayerEvent(n, dir) {{
var layer = document.getElementById("layer_" + layer_events[n].filename);
switch (layer_events[n].op) {{
case "flush":
layer.style.visibility = "visible";
break;
case "create_delta":
layer.style.visibility = "visible";
break;
case "create_image":
layer.style.visibility = "visible";
break;
case "delete":
layer.style.visibility = "hidden";
break;
}}
}}
function undoLayerEvent(n) {{
var layer = document.getElementById("layer_" + layer_events[n].filename);
switch (layer_events[n].op) {{
case "flush":
layer.style.visibility = "hidden";
break;
case "create_delta":
layer.style.visibility = "hidden";
break;
case "create_image":
layer.style.visibility = "hidden";
break;
case "delete":
layer.style.visibility = "visible";
break;
}}
}}
var last_slider_pos = 0
var last_layer_event = 0
var moveSlider = function(new_pos) {{
if (new_pos > last_slider_pos) {{
while (last_layer_event < layer_events.length - 1) {{
if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
break;
}}
last_layer_event += 1;
redoLayerEvent(last_layer_event)
}}
}}
if (new_pos < last_slider_pos) {{
while (last_layer_event >= 0) {{
if (layer_events[last_layer_event].time_rel <= new_pos) {{
break;
}}
undoLayerEvent(last_layer_event)
last_layer_event -= 1;
}}
}}
last_slider_pos = new_pos;
document.getElementById("debug_pos").textContent=new_pos;
if (last_layer_event >= 0) {{
document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
}} else {{
document.getElementById("debug_layer_event").textContent="begin";
}}
}}
</script>
<div class="topbar">
<div class="slidercontainer">
<label for="time-slider">TIME</label>:
<input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
pos: <span id="debug_pos"></span><br>
event: <span id="debug_layer_event"></span><br>
gc: <span id="debug_gc_event"></span><br>
</div>
<button onclick="startAnimation()">Play</button>
<button onclick="stopAnimation()">Stop</button>
<svg class="legend">
<rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
<line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
<line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
</svg>
</div>
<div class="main">
{svg}
</div>
</body>
</html>
"#
)?;
Ok(())
}

View File

@@ -1,35 +0,0 @@
use pageserver_compaction::interface::CompactionLayer;
use pageserver_compaction::simulator::MockTimeline;
/// Test the extreme case that there are so many updates for a single key that
/// even if we produce an extremely narrow delta layer, spanning just that one
/// key, we still too many records to fit in the target file size. We need to
/// split in the LSN dimension too in that case.
///
/// TODO: The code to avoid this problem has not been implemented yet! So the
/// assertion currently fails, but we need to make it not fail.
#[ignore]
#[tokio::test]
async fn test_many_updates_for_single_key() {
let mut executor = MockTimeline::new();
executor.target_file_size = 10_000_000; // 10 MB
// Ingest 100 MB of updates to a single key.
for _ in 1..1000 {
executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
executor.compact().await.unwrap();
}
// Check that all the layers are smaller than the target size (with some slop)
for l in executor.live_layers.iter() {
println!("layer {}: {}", l.short_id(), l.file_size());
}
for l in executor.live_layers.iter() {
assert!(l.file_size() < executor.target_file_size * 2);
// sanity check that none of the delta layers are stupidly small either
if l.is_delta() {
assert!(l.file_size() > executor.target_file_size / 2);
}
}
}

View File

@@ -12,7 +12,7 @@ use std::collections::BinaryHeap;
use std::ops::Range;
use std::{fs, str};
use pageserver::page_cache::{self, PAGE_SZ};
use pageserver::page_cache::PAGE_SZ;
use pageserver::repository::{Key, KEY_SIZE};
use pageserver::tenant::block_io::FileBlockReader;
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
@@ -100,15 +100,13 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
let file = VirtualFile::open(path).await?;
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader.read_blk(0, ctx).await?;
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
actual_summary.index_root_blk,
block_reader,
file,
);
// min-heap (reserve space for one more element added before eviction)
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);

View File

@@ -61,15 +61,13 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
page_cache::init(100);
let file = VirtualFile::open(path).await?;
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader.read_blk(0, ctx).await?;
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
actual_summary.index_root_blk,
&block_reader,
&file,
);
// TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
let mut all = vec![];
@@ -85,7 +83,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
ctx,
)
.await?;
let cursor = BlockCursor::new_fileblockreader(&block_reader);
let cursor = BlockCursor::new_fileblockreader(&file);
for (k, v) in all {
let value = cursor.read_blob(v.pos(), ctx).await?;
println!("key:{} value_len:{}", k, value.len());

Some files were not shown because too many files have changed in this diff Show More