Compare commits


8 Commits

Author        SHA1        Message                                           Date
Arpad Müller  7bf395df76  time.sleep                                        2024-01-23 02:21:49 +01:00
Arpad Müller  24bc6ddec4  Just add this wait to prevent a race              2024-01-23 01:58:57 +01:00
Arpad Müller  f49fe734d1  Allow this for a test                             2024-01-23 00:28:22 +01:00
Arpad Müller  872e645f7d  Disable cancellation support in initdb            2024-01-22 23:47:08 +01:00
Arpad Müller  648fe7c92d  Add it to the allowed errors                      2024-01-22 18:26:47 +01:00
Arpad Müller  21045477a3  Allow this msg                                    2024-01-22 15:20:18 +01:00
Arpad Müller  125f24ca49  exit initdb via kill and then await it            2024-01-22 15:19:13 +01:00
Arpad Müller  443d4ce868  Duplicate the test to try to reproduce the issue  2024-01-22 15:19:13 +01:00
391 changed files with 10385 additions and 30599 deletions

View File

@@ -1,27 +1,27 @@
*
# Files
!Cargo.lock
!Cargo.toml
!Makefile
!rust-toolchain.toml
!scripts/combine_control_files.py
!scripts/ninstall.sh
!vm-cgconfig.conf
!Cargo.toml
!Cargo.lock
!Makefile
# Directories
!.cargo/
!.config/
!compute_tools/
!control_plane/
!compute_tools/
!libs/
!neon_local/
!pageserver/
!pgxn/
!proxy/
!s3_scrubber/
!safekeeper/
!s3_scrubber/
!storage_broker/
!trace/
!vendor/postgres-*/
!vendor/postgres-v14/
!vendor/postgres-v15/
!vendor/postgres-v16/
!workspace_hack/
!neon_local/
!scripts/ninstall.sh
!scripts/combine_control_files.py
!vm-cgconfig.conf

View File

@@ -4,8 +4,6 @@ self-hosted-runner:
- dev
- gen3
- large
# Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
- macos-14
- small
- us-east-2
config-variables:

View File

@@ -59,7 +59,7 @@ runs:
BUCKET: neon-github-public-dev
# TODO: We can replace with a special docker image with Java and Allure pre-installed
- uses: actions/setup-java@v4
- uses: actions/setup-java@v3
with:
distribution: 'temurin'
java-version: '17'
@@ -179,11 +179,22 @@ runs:
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
fi
- name: Cache poetry deps
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Store Allure test stat in the DB
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
shell: bash -euxo pipefail {0}
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
run: |
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
./scripts/pysync
poetry run python3 scripts/ingest_regress_test_result.py \
--revision ${COMMIT_SHA} \
--reference ${GITHUB_REF} \
--build-type unified \
--ingest ${WORKDIR}/report/data/suites.json
- name: Store Allure test stat in the DB (new)
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
@@ -215,7 +226,7 @@ runs:
rm -rf ${WORKDIR}
fi
- uses: actions/github-script@v7
- uses: actions/github-script@v6
if: always()
env:
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}

View File

@@ -44,10 +44,6 @@ inputs:
description: 'Postgres version to use for tests'
required: false
default: 'v14'
benchmark_durations:
description: 'benchmark durations JSON'
required: false
default: '{}'
runs:
using: "composite"
@@ -80,16 +76,17 @@ runs:
- name: Checkout
if: inputs.needs_postgres_source == 'true'
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Cache poetry deps
uses: actions/cache@v4
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
shell: bash -euxo pipefail {0}
@@ -163,7 +160,7 @@ runs:
# We use pytest-split plugin to run benchmarks in parallel on different CI runners
if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
mkdir -p $TEST_OUTPUT
echo '${{ inputs.benchmark_durations || '{}' }}' > $TEST_OUTPUT/benchmark_durations.json
poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
fi
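
Note on the pytest-split hand-off above: the plugin balances benchmark groups from a JSON file that maps pytest node IDs to runtimes in seconds, which is what --durations-path points at, while --splits N --group K gives each runner its share. A minimal sketch of such a file, with a purely hypothetical node ID and output path (scripts/benchmark_durations.py presumably builds the real one from recent results in the test results database):

import json

# Hypothetical shape of the file consumed via --durations-path:
# a flat mapping of pytest node id -> duration in seconds.
durations = {
    "test_runner/performance/test_example.py::test_case[release]": 42.0,  # illustrative entry
}

with open("benchmark_durations.json", "w") as f:
    json.dump(durations, f, indent=2)

The previous inline default of '{}' (an empty mapping) still yields a valid file, just with no historical data to balance against.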

View File

@@ -17,7 +17,6 @@ concurrency:
jobs:
actionlint:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

View File

@@ -64,7 +64,7 @@ jobs:
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
- uses: actions/checkout@v4
- uses: actions/checkout@v3
with:
ref: main
token: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -93,7 +93,6 @@ jobs:
--body-file "body.md" \
--head "${BRANCH}" \
--base "main" \
--label "run-e2e-tests-in-draft" \
--draft
fi

View File

@@ -66,7 +66,7 @@ jobs:
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -221,7 +221,7 @@ jobs:
timeout-minutes: 480
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -366,7 +366,7 @@ jobs:
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -465,7 +465,7 @@ jobs:
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -562,7 +562,7 @@ jobs:
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download

View File

@@ -69,15 +69,7 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
kaniko-arm:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -93,15 +85,7 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
manifest:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -115,10 +99,7 @@ jobs:
steps:
- name: Create manifest
run: |
docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
- name: Push manifest
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}

View File

@@ -21,13 +21,11 @@ env:
COPT: '-Werror'
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: ubuntu-latest
steps:
- name: Disallow PRs from forks
if: |
@@ -46,20 +44,6 @@ jobs:
exit 1
cancel-previous-e2e-tests:
needs: [ check-permissions ]
if: github.event_name == 'pull_request'
runs-on: ubuntu-latest
steps:
- name: Cancel previous e2e-tests runs for this PR
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
gh workflow --repo neondatabase/cloud \
run cancel-previous-in-concurrency-group.yml \
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
tag:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, small ]
@@ -69,7 +53,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -106,16 +90,17 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 1
- name: Cache poetry deps
uses: actions/cache@v4
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
@@ -131,14 +116,14 @@ jobs:
check-codestyle-rust:
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
@@ -146,7 +131,7 @@ jobs:
# Disabled for now
# - name: Restore cargo deps cache
# id: cache_cargo
# uses: actions/cache@v4
# uses: actions/cache@v3
# with:
# path: |
# !~/.cargo/registry/src
@@ -201,11 +186,7 @@ jobs:
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# Raise locked memory limit for tokio-epoll-uring.
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
# io_uring will account the memory of the CQ and SQ as locked.
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
options: --init
strategy:
fail-fast: false
matrix:
@@ -231,7 +212,7 @@ jobs:
done
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
@@ -253,7 +234,7 @@ jobs:
done
if [ "${FAILED}" = "true" ]; then
echo >&2 "Please update vendor/revisions.json if these changes are intentional"
echo >&2 "Please update vendors/revisions.json if these changes are intentional"
exit 1
fi
@@ -303,7 +284,7 @@ jobs:
# compressed crates.
# - name: Cache cargo deps
# id: cache_cargo
# uses: actions/cache@v4
# uses: actions/cache@v3
# with:
# path: |
# ~/.cargo/registry/
@@ -317,21 +298,21 @@ jobs:
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -359,12 +340,8 @@ jobs:
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run rust tests
env:
NEXTEST_RETRIES: 3
run: |
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
@@ -442,8 +419,8 @@ jobs:
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
# Default shared memory is 64mb
options: --init --shm-size=512mb
strategy:
fail-fast: false
matrix:
@@ -451,7 +428,7 @@ jobs:
pg_version: [ v14, v15, v16 ]
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
@@ -471,65 +448,27 @@ jobs:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
- name: Merge and upload coverage data
if: |
false &&
matrix.build_type == 'debug' && matrix.pg_version == 'v14'
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
uses: ./.github/actions/save-coverage-data
get-benchmarks-durations:
outputs:
json: ${{ steps.get-benchmark-durations.outputs.json }}
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Cache poetry deps
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
- name: get benchmark durations
id: get-benchmark-durations
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
run: |
poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" \
--days 10 \
--output /tmp/benchmark_durations.json
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
benchmarks:
needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
needs: [ check-permissions, build-neon, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
# Default shared memory is 64mb
options: --init --shm-size=512mb
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
strategy:
fail-fast: false
matrix:
# the amount of groups (N) should be reflected in `extra_params: --splits N ...`
pytest_split_group: [ 1, 2, 3, 4, 5 ]
pytest_split_group: [ 1, 2, 3, 4 ]
build_type: [ release ]
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Pytest benchmarks
uses: ./.github/actions/run-python-test-set
@@ -538,13 +477,11 @@ jobs:
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ github.ref_name == 'main' }}
extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
@@ -558,7 +495,7 @@ jobs:
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Create Allure report
if: ${{ !cancelled() }}
@@ -567,9 +504,10 @@ jobs:
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- uses: actions/github-script@v7
- uses: actions/github-script@v6
if: ${{ !cancelled() }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
@@ -609,7 +547,7 @@ jobs:
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
@@ -644,6 +582,17 @@ jobs:
--input-objects=/tmp/coverage/binaries.list \
--format=lcov
- name: Upload coverage report
id: upload-coverage-report
env:
BUCKET: neon-github-public-dev
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
- name: Build coverage report NEW
id: upload-coverage-report-new
env:
@@ -678,13 +627,23 @@ jobs:
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
- uses: actions/github-script@v7
- uses: actions/github-script@v6
env:
REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
script: |
const { REPORT_URL_NEW, COMMIT_SHA } = process.env
const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: `${COMMIT_SHA}`,
state: 'success',
target_url: `${REPORT_URL}`,
context: 'Code coverage report',
})
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
@@ -696,10 +655,49 @@ jobs:
})
trigger-e2e-tests:
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
needs: [ check-permissions, promote-images, tag ]
uses: ./.github/workflows/trigger-e2e-tests.yml
secrets: inherit
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/cloud"
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
}
}"
neon-image:
needs: [ check-permissions, build-buildtools-image, tag ]
@@ -868,7 +866,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.23.2
VM_BUILDER_VERSION: v0.21.0
steps:
- name: Checkout
@@ -904,7 +902,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -1118,7 +1116,7 @@ jobs:
done
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 0
@@ -1141,7 +1139,7 @@ jobs:
- name: Create git tag
if: github.ref_name == 'release'
uses: actions/github-script@v7
uses: actions/github-script@v6
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
@@ -1155,7 +1153,7 @@ jobs:
- name: Create GitHub release
if: github.ref_name == 'release'
uses: actions/github-script@v7
uses: actions/github-script@v6
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
@@ -1204,80 +1202,3 @@ jobs:
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done
compute-node-image-merged-base:
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure Docker Hub login
run: |
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
cat <<-EOF > ~/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
}
}
EOF
- name: Build merged image base
run: |
docker image build . -f Dockerfile.compute-node-simple -t neondatabase/tmp-compute-node-merged-base-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--build-arg PG_VERSION=${{ matrix.version }} \
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
docker image push neondatabase/tmp-compute-node-merged-base-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
compute-node-image-merged:
needs: [ tag, compute-node-image-merged-base ]
runs-on: ubuntu-latest
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure Docker Hub login
run: |
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
echo ~
cat <<-EOF > ~/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
}
}
EOF
- name: Build merged image
run: |
docker image build . -f Dockerfile.compute-node-merged -t neondatabase/tmp-compute-node-merged:${{needs.tag.outputs.build-tag}} \
--build-arg TAG=${{needs.tag.outputs.build-tag}}
docker image push neondatabase/tmp-compute-node-merged:${{needs.tag.outputs.build-tag}}

View File

@@ -26,7 +26,7 @@ jobs:
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
timeout-minutes: 90
runs-on: macos-14
runs-on: macos-latest
env:
# Use release build only, to have less debug info around
@@ -57,24 +57,24 @@ jobs:
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Set extra env for macOS
run: |
@@ -82,14 +82,14 @@ jobs:
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Cache cargo deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
@@ -110,14 +110,13 @@ jobs:
run: make walproposer-lib -j$(sysctl -n hw.ncpu)
- name: Run cargo build
run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release
run: cargo build --all --release
- name: Check that no warnings are produced
run: ./run_clippy.sh
check-linux-arm-build:
timeout-minutes: 90
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: [ self-hosted, dev, arm64 ]
env:
@@ -125,12 +124,12 @@ jobs:
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
CARGO_FEATURES: --features testing
CARGO_FLAGS: --release
CARGO_FLAGS: --locked --release
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -172,21 +171,21 @@ jobs:
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -211,20 +210,18 @@ jobs:
- name: Run cargo build
run: |
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run cargo test
env:
NEXTEST_RETRIES: 3
run: |
cargo nextest run $CARGO_FEATURES
cargo test $CARGO_FLAGS $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_s3
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
@@ -234,11 +231,10 @@ jobs:
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_azure
cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
check-codestyle-rust-arm:
timeout-minutes: 90
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: [ self-hosted, dev, arm64 ]
container:
@@ -356,7 +352,7 @@ jobs:
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
- name: Publish build stats report
uses: actions/github-script@v7
uses: actions/github-script@v6
env:
REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}

View File

@@ -28,7 +28,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
@@ -38,10 +38,11 @@ jobs:
uses: snok/install-poetry@v1
- name: Cache poetry deps
uses: actions/cache@v4
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
shell: bash -euxo pipefail {0}
@@ -82,7 +83,7 @@ jobs:
# It will be fixed after switching to gen2 runner
- name: Upload python test logs
if: always()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
retention-days: 7
name: python-test-pg_clients-${{ runner.os }}-stage-logs

View File

@@ -1,117 +0,0 @@
name: Trigger E2E Tests
on:
pull_request:
types:
- ready_for_review
workflow_call:
defaults:
run:
shell: bash -euxo pipefail {0}
env:
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
jobs:
cancel-previous-e2e-tests:
if: github.event_name == 'pull_request'
runs-on: ubuntu-latest
steps:
- name: Cancel previous e2e-tests runs for this PR
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
gh workflow --repo neondatabase/cloud \
run cancel-previous-in-concurrency-group.yml \
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
tag:
runs-on: [ ubuntu-latest ]
outputs:
build-tag: ${{ steps.build-tag.outputs.tag }}
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Get build tag
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }}
CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
fi
id: build-tag
trigger-e2e-tests:
needs: [ tag ]
runs-on: [ self-hosted, gen3, small ]
env:
TAG: ${{ needs.tag.outputs.build-tag }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
steps:
- name: check if ecr image are present
run: |
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
if [ "$OUTPUT" == "" ]; then
echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
exit 1
fi
done
- name: Set PR's status to pending and request a remote CI test
run: |
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/cloud"
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${TAG}\",
\"compute_image_tag\": \"${TAG}\",
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
}
}"

View File

@@ -20,51 +20,111 @@ defaults:
run:
shell: bash -euo pipefail {0}
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
permissions: {}
jobs:
tag-image:
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
env:
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
outputs:
next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
steps:
- name: Install Crane & ECR helper
run: |
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Get source image digest
id: next-digest
run: |
NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
if [ -z "${NEXT_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
exit 1
fi
echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
- name: Get destination image digest (if already exists)
id: prev-digest
run: |
PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
if [ -z "${PREV_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
else
echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
fi
- name: Tag image
run: |
crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
rollback-tag-image:
needs: tag-image
if: ${{ !success() }}
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
env:
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
steps:
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
- name: Install Crane & ECR helper
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v2
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- uses: actions/setup-go@v5
with:
go-version: '1.21'
- name: Install crane
- name: Configure ECR login
run: |
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Copy images
- name: Restore previous tag if needed
run: |
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
if [ -z "${NEXT_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
exit 0
fi
if [ -z "${PREV_DIGEST}" ]; then
# I guess we should delete the tag here/untag the image, but crane does not support it
# - https://github.com/google/go-containerregistry/issues/999
echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
exit 0
fi
CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
else
echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
fi
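
Note on the rollback step above: the decision reduces to comparing three digests, namely what the source tag pointed at (next), what the destination tag held before the run (prev, possibly absent), and what it holds now (current). A hedged restatement of that logic as a small pure function:

from typing import Optional

def rollback_action(next_digest: str, prev_digest: Optional[str], current_digest: str) -> Optional[str]:
    """Return the digest TO_TAG should be re-pointed at, or None if nothing needs to be done.

    Mirrors the shell step: restore only when the job actually moved the tag
    (current == next) and there was an earlier digest to go back to.
    """
    if not next_digest:
        return None          # source image never existed; the tag job failed before tagging
    if not prev_digest:
        return None          # TO_TAG did not exist before; crane cannot delete a tag anyway
    if current_digest == next_digest:
        return prev_digest   # the job moved TO_TAG; point it back at the previous digest
    return None              # TO_TAG already points elsewhere; leave it alone

If it returns a digest, the workflow applies it with crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}", exactly as in the shell above; in every other case the tag is left untouched.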

View File

@@ -20,7 +20,7 @@ ln -s ../../pre-commit.py .git/hooks/pre-commit
This will run following checks on staged files before each commit:
- `rustfmt`
- checks for Python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
@@ -54,9 +54,6 @@ _An instruction for maintainers_
- If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then:
- Press the "Approve and run" button in GitHub UI
- Add the `approved-for-ci-run` label to the PR
- Currently draft PR will skip e2e test (only for internal contributors). After turning the PR 'Ready to Review' CI will trigger e2e test
- Add `run-e2e-tests-in-draft` label to run e2e test in draft PR (override above behaviour)
- The `approved-for-ci-run` workflow will add `run-e2e-tests-in-draft` automatically to run e2e test for external contributors
Repeat all steps after any change to the PR.
- When the changes are ready to get merged — merge the original PR (not the internal one)

Cargo.lock (generated): 718 changed lines; file diff suppressed because it is too large.

View File

@@ -18,7 +18,6 @@ members = [
"libs/pageserver_api",
"libs/postgres_ffi",
"libs/safekeeper_api",
"libs/desim",
"libs/utils",
"libs/consumption_metrics",
"libs/postgres_backend",
@@ -49,12 +48,11 @@ azure_storage_blobs = "0.18"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.14"
aws-sdk-secretsmanager = { version = "1.14.0" }
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.4"
aws-credential-types = "1.1.4"
aws-config = { version = "1.0", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.0"
aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.0"
aws-credential-types = "1.0"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -66,6 +64,7 @@ camino = "1.1.6"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
close_fds = "0.3.2"
comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
@@ -81,7 +80,7 @@ futures-core = "0.3"
futures-util = "0.3"
git-version = "0.3"
hashbrown = "0.13"
hashlink = "0.8.4"
hashlink = "0.8.1"
hdrhistogram = "7.5.2"
hex = "0.4"
hex-literal = "0.4"
@@ -96,33 +95,30 @@ inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "9"
lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
nix = "0.26"
notify = "5.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.20.0"
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.12.0"
opentelemetry = "0.19.0"
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.11.0"
parking_lot = "0.12"
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "49.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
procfs = "0.14"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
@@ -153,11 +149,8 @@ tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.1"
thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.10.0"
tokio-rustls = "0.24"
@@ -169,11 +162,9 @@ toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.20.0"
tracing-opentelemetry = "0.19.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
urlencoding = "2.1"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
webpki-roots = "0.25"
@@ -205,7 +196,6 @@ postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
desim = { version = "0.1", path = "./libs/desim" }
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }

View File

@@ -47,13 +47,12 @@ COPY --chown=nonroot . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
&& mold -run cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \
--bin safekeeper \
--bin storage_broker \
--bin attachment_service \
--bin proxy \
--bin neon_local \
--locked --release \
@@ -81,7 +80,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
@@ -100,11 +98,6 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
-c "listen_pg_addr='0.0.0.0:6400'" \
-c "listen_http_addr='0.0.0.0:9898'"
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
# that want a particular postgres version will select it explicitly: this is just a default.
ENV LD_LIBRARY_PATH /usr/local/v16/lib
VOLUME ["/data"]
USER neon
EXPOSE 6400

View File

@@ -111,7 +111,7 @@ USER nonroot:nonroot
WORKDIR /home/nonroot
# Python
ENV PYTHON_VERSION=3.9.18 \
ENV PYTHON_VERSION=3.9.2 \
PYENV_ROOT=/home/nonroot/.pyenv \
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
RUN set -e \
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.76.0
ENV RUSTC_VERSION=1.75.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \

View File

@@ -52,7 +52,7 @@ RUN cd postgres && \
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
# In vanilla postgres this function is limited to Postgres role superuser.
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
# so we do it here.
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
@@ -63,14 +63,14 @@ RUN cd postgres && \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
fi; \
done; \
# the second loop is for pg_stat_statement extension versions >= 1.7,
# the second loop is for pg_stat_statement extension versions >= 1.7,
# where pg_stat_statement_reset() got 3 additional arguments
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
filename=$(basename "$file"); \
if ! echo "$old_list" | grep -q -F "$filename"; then \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
fi; \
done
done
#########################################################################################
#
@@ -144,23 +144,30 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
FROM build-deps AS plv8-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
RUN apt update && \
apt install -y ninja-build python3-dev libncurses5 binutils clang
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PLV8_VERSION=3.1.5 \
export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
;; \
"v16") \
export PLV8_VERSION=3.1.8 \
export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
;; \
*) \
echo "Export the valid PG_VERSION variable" && exit 1 \
;; \
esac && \
wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
# generate and copy upgrade scripts
mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
cp upgrade/* /usr/local/pgsql/share/extension/ && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
# don't break computes with installed old version of plv8
cd /usr/local/pgsql/lib/ && \
ln -s plv8-3.1.10.so plv8-3.1.5.so && \
ln -s plv8-3.1.10.so plv8-3.1.8.so && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
@@ -520,7 +527,8 @@ RUN apt-get update && \
libboost-regex1.74-dev \
libboost-serialization1.74-dev \
libboost-system1.74-dev \
libeigen3-dev
libeigen3-dev \
libfreetype6-dev
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
@@ -545,8 +553,6 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
-D RDK_INSTALL_INTREE=OFF \
-D RDK_INSTALL_COMIC_FONTS=OFF \
-D RDK_BUILD_FREETYPE_SUPPORT=OFF \
-D CMAKE_BUILD_TYPE=Release \
. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -639,8 +645,8 @@ FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -769,24 +775,6 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
#########################################################################################
#
# Layer "pg_ivm"
# compile pg_ivm extension
#
#########################################################################################
FROM build-deps AS pg-ivm-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -827,8 +815,6 @@ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -839,10 +825,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_test_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_rmgr \
@@ -925,7 +907,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost* for rdkit
# libboost*, libfreetype6, and zlib1g for rdkit
# ca-certificates for communicating with s3 by compute_ctl
RUN apt update && \
apt install --no-install-recommends -y \
@@ -938,6 +920,7 @@ RUN apt update && \
libboost-serialization1.74.0 \
libboost-system1.74.0 \
libossp-uuid16 \
libfreetype6 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
@@ -949,6 +932,7 @@ RUN apt update && \
libcurl4-openssl-dev \
locales \
procps \
zlib1g \
ca-certificates && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

View File

@@ -1,83 +0,0 @@
ARG TAG
FROM neondatabase/tmp-compute-node-merged-base-v14:$TAG as pg14
FROM neondatabase/tmp-compute-node-merged-base-v15:$TAG as pg15
FROM neondatabase/tmp-compute-node-merged-base-v16:$TAG as pg16
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM neondatabase/build-tools:pinned AS compute-tools
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
ARG TAG
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
mkdir /var/db/postgres/pgbouncer && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute && \
chmod 0750 /var/db/postgres/pgbouncer && \
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
# create folder for file cache
mkdir -p -m 777 /neon/cache
COPY --from=pg14 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v14
COPY --from=pg15 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v15
COPY --from=pg16 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v16
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check)
# liblz4-1 for lz4
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost* for rdkit
# ca-certificates for communicating with s3 by compute_ctl
RUN apt update && \
apt install --no-install-recommends -y \
gdb \
libicu67 \
liblz4-1 \
libreadline8 \
libboost-iostreams1.74.0 \
libboost-regex1.74.0 \
libboost-serialization1.74.0 \
libboost-system1.74.0 \
libossp-uuid16 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
libprotobuf-c1 \
libsfcgal1 \
libxml2 \
libxslt1.1 \
libzstd1 \
libcurl4-openssl-dev \
locales \
procps \
ca-certificates && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
ENV LANG en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -1,159 +0,0 @@
ARG PG_VERSION
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
#########################################################################################
#
# Layer "pg-build"
# Build Postgres from the neon postgres repository.
#
#########################################################################################
FROM build-deps AS pg-build
ARG PG_VERSION
COPY vendor/postgres-${PG_VERSION} postgres
RUN cd postgres && \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
--with-icu --with-libxml --with-libxslt --with-lz4" && \
if [ "${PG_VERSION}" != "v14" ]; then \
# zstd is available only from PG15
export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
fi && \
eval $CONFIGURE_CMD && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
# Enable some of contrib extensions
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
# In vanilla postgres this function is limited to Postgres role superuser.
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
# so we do it here.
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
# the first loop is for pg_stat_statements extension versions <= 1.6
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
filename=$(basename "$file"); \
if echo "$old_list" | grep -q -F "$filename"; then \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
fi; \
done; \
# the second loop is for pg_stat_statements extension versions >= 1.7,
# where pg_stat_statements_reset() got 3 additional arguments
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
filename=$(basename "$file"); \
if ! echo "$old_list" | grep -q -F "$filename"; then \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
fi; \
done
#########################################################################################
#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_test_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_rmgr \
-s install && \
case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/hnsw \
-s install
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Clean up postgres folder before inclusion
#
#########################################################################################
FROM neon-pg-ext-build AS postgres-cleanup-layer
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
RUN cd /usr/local/pgsql/bin && rm ecpg
# Remove headers that we won't need anymore - we've completed installation of all extensions
RUN rm -r /usr/local/pgsql/include
# Remove static postgresql libraries - all compilation is finished, so we
# can now remove these files - they must be included in other binaries by now
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local/pgsql

View File

@@ -51,8 +51,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
# Force cargo not to print progress bar
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
# Set PQ_LIB_DIR to make sure `attachment_service` gets linked with bundled libpq (through diesel)
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
#
# Top level Makefile to build Neon and PostgreSQL
@@ -159,8 +157,8 @@ neon-pg-ext-%: postgres-%
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
.PHONY: neon-pg-clean-ext-%
neon-pg-clean-ext-%:
.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
@@ -176,10 +174,10 @@ neon-pg-clean-ext-%:
# Build walproposer as a static library. walproposer source code is located
# in the pgxn/neon directory.
#
#
# We also need to include libpgport.a and libpgcommon.a, because walproposer
# uses some functions from those libraries.
#
#
# Some object files are removed from libpgport.a and libpgcommon.a because
# they depend on openssl and other libraries that are not included in our
# Rust build.
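As a rough, hypothetical illustration of the consumer side of the static library described in the comment above (not something this Makefile or repository necessarily contains), a Cargo build script that links such archives could look like the sketch below; the environment variable, directory layout, and archive names are assumptions:
```rust
// build.rs sketch: tell rustc where the static archives live and link them.
// With GNU ld the order matters: walproposer first, then the Postgres support
// libraries (libpgport.a / libpgcommon.a) it borrows symbols from.
fn main() {
    let lib_dir = std::env::var("WALPROPOSER_LIB_DIR")
        .unwrap_or_else(|_| "pg_install/build/walproposer-lib".to_string());
    println!("cargo:rustc-link-search=native={lib_dir}");
    println!("cargo:rustc-link-lib=static=walproposer");
    println!("cargo:rustc-link-lib=static=pgport");
    println!("cargo:rustc-link-lib=static=pgcommon");
}
```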
@@ -216,11 +214,11 @@ neon-pg-ext: \
neon-pg-ext-v15 \
neon-pg-ext-v16
.PHONY: neon-pg-clean-ext
neon-pg-clean-ext: \
neon-pg-clean-ext-v14 \
neon-pg-clean-ext-v15 \
neon-pg-clean-ext-v16
.PHONY: neon-pg-ext-clean
neon-pg-ext-clean: \
neon-pg-ext-clean-v14 \
neon-pg-ext-clean-v15 \
neon-pg-ext-clean-v16
# shorthand to build all Postgres versions
.PHONY: postgres
@@ -249,7 +247,7 @@ postgres-check: \
# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean: postgres-clean neon-pg-clean-ext
clean: postgres-clean neon-pg-ext-clean
$(CARGO_CMD_PREFIX) cargo clean
# This removes everything

NOTICE
View File

@@ -1,5 +1,5 @@
Neon
Copyright 2022 - 2024 Neon Inc.
Copyright 2022 Neon Inc.
The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
See vendor/postgres-vX/COPYRIGHT for details.

View File

@@ -14,8 +14,8 @@ Alternatively, compile and run the project [locally](#running-local-installation
A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
The Neon storage engine consists of two major components:
- Pageserver: Scalable storage backend for the compute nodes.
- Safekeepers: The safekeepers form a redundant WAL service that receives WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
- Pageserver. Scalable storage backend for the compute nodes.
- Safekeepers. The safekeepers form a redundant WAL service that receives WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information.
@@ -81,9 +81,9 @@ The project uses [rust toolchain file](./rust-toolchain.toml) to define the vers
This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file.
rustup users who want to build with another toolchain can use the [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify that their toolchain matches the version in the file.
non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file.
Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates.
#### Building on Linux
@@ -124,7 +124,7 @@ make -j`sysctl -n hw.logicalcpu` -s
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
To run the integration tests or Python scripts (not required to use the code), install
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
#### Running neon database
@@ -166,7 +166,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
2. Now, it is possible to connect to postgres and run some queries:
```text
> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres
> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1,1);
@@ -205,7 +205,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
# this new postgres instance will have all the data from 'main' postgres,
# but all modifications would not affect data in original postgres
> psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres
> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres
postgres=# select * from t;
key | value
-----+-------
@@ -216,7 +216,7 @@ postgres=# insert into t values(2,2);
INSERT 0 1
# check that the new change doesn't affect the 'main' postgres
> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres
> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
postgres=# select * from t;
key | value
-----+-------
@@ -224,7 +224,7 @@ postgres=# select * from t;
(1 row)
```
4. If you want to run tests afterwards (see below), you must stop all the running pageserver, safekeeper, and postgres instances
4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances
you have just started. You can terminate them all with one command:
```sh
> cargo neon stop
@@ -243,22 +243,12 @@ CARGO_BUILD_FLAGS="--features=testing" make
```
By default, this runs both debug and release modes, and all supported postgres versions. When
testing locally, it is convenient to run just one set of permutations, like this:
testing locally, it is convenient to run just run one set of permutations, like this:
```sh
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
```
## Flamegraphs
You may find yourself in need of flamegraphs for software in this repository.
You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice!
>[!IMPORTANT]
> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument.
> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
## Documentation
[docs](/docs) Contains a top-level overview of all available markdown documentation.

View File

@@ -2,7 +2,7 @@ use std::collections::HashMap;
use std::env;
use std::fs;
use std::io::BufRead;
use std::os::unix::fs::{symlink, PermissionsExt};
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
@@ -207,7 +207,6 @@ fn maybe_cgexec(cmd: &str) -> Command {
/// Create the special neon_superuser role, a slightly nerfed version of a real superuser
/// that we give to customers
#[instrument(skip_all)]
fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let roles = spec
.cluster
@@ -320,12 +319,11 @@ impl ComputeNode {
// Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content.
#[instrument(skip_all, fields(%lsn))]
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Instant::now();
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let mut config = postgres::Config::from_str(shard0_connstr)?;
let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
// Use the storage auth token from the config file, if given.
// Note: this overrides any password set in the connection string.
@@ -392,34 +390,6 @@ impl ComputeNode {
Ok(())
}
// Gets the basebackup in a retry loop
#[instrument(skip_all, fields(%lsn))]
pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let mut retry_period_ms = 500;
let mut attempts = 0;
let max_attempts = 5;
loop {
let result = self.try_get_basebackup(compute_state, lsn);
match result {
Ok(_) => {
return result;
}
Err(ref e) if attempts < max_attempts => {
warn!(
"Failed to get basebackup: {} (attempt {}/{})",
e, attempts, max_attempts
);
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms));
retry_period_ms *= 2;
}
Err(_) => {
return result;
}
}
attempts += 1;
}
}
pub async fn check_safekeepers_synced_async(
&self,
compute_state: &ComputeState,
@@ -635,48 +605,6 @@ impl ComputeNode {
// Update pg_hba.conf received with basebackup.
update_pg_hba(pgdata_path)?;
// Place pg_dynshmem under /dev/shm. This allows us to use
// 'dynamic_shared_memory_type = mmap' so that the files are placed in
// /dev/shm, similar to how 'dynamic_shared_memory_type = posix' works.
//
// Why on earth don't we just stick to the 'posix' default, you might
// ask. It turns out that making large allocations with 'posix' doesn't
// work very well with autoscaling. The behavior we want is that:
//
// 1. You can make large DSM allocations, larger than the current RAM
// size of the VM, without errors
//
// 2. If the allocated memory is really used, the VM is scaled up
// automatically to accommodate that
//
// We try to make that possible by having swap in the VM. But with the
// default 'posix' DSM implementation, we fail step 1, even when there's
// plenty of swap available. PostgreSQL uses posix_fallocate() to create
// the shmem segment, which is really just a file in /dev/shm in Linux,
// but posix_fallocate() on tmpfs returns ENOMEM if the size is larger
// than available RAM.
//
// Using 'dynamic_shared_memory_type = mmap' works around that, because
// the Postgres 'mmap' DSM implementation doesn't use
// posix_fallocate(). Instead, it uses repeated calls to write(2) to
// fill the file with zeros. It's weird that that differs between
// 'posix' and 'mmap', but we take advantage of it. When the file is
// filled slowly with write(2), the kernel allows it to grow larger, as
// long as there's swap available.
//
// In short, using 'dynamic_shared_memory_type = mmap' allows a DSM segment
// to be larger than currently available RAM. But we don't want to store it
// in a real file, which the kernel would try to flush to disk, so we
// symlink pg_dynshmem to /dev/shm.
//
// We don't set 'dynamic_shared_memory_type = mmap' here, we let the
// control plane control that option. If 'mmap' is not used, this
// symlink doesn't affect anything.
//
// See https://github.com/neondatabase/autoscaling/issues/800
std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?;
symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?;
match spec.mode {
ComputeMode::Primary => {}
ComputeMode::Replica | ComputeMode::Static(..) => {
@@ -772,14 +700,13 @@ impl ComputeNode {
// In this case we need to connect with old `zenith_admin` name
// and create new user. We cannot simply rename connected user,
// but we can create a new one and grant it all privileges.
let connstr = self.connstr.clone();
let mut client = match Client::connect(connstr.as_str(), NoTls) {
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
Err(e) => {
info!(
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
e
);
let mut zenith_admin_connstr = connstr.clone();
let mut zenith_admin_connstr = self.connstr.clone();
zenith_admin_connstr
.set_username("zenith_admin")
@@ -792,8 +719,8 @@ impl ComputeNode {
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
// reconnect with connstring with expected name
Client::connect(connstr.as_str(), NoTls)?
// reconnect with connsting with expected name
Client::connect(self.connstr.as_str(), NoTls)?
}
Ok(client) => client,
};
@@ -807,13 +734,8 @@ impl ComputeNode {
cleanup_instance(&mut client)?;
handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client)?;
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
handle_grants(
spec,
&mut client,
connstr.as_str(),
self.has_feature(ComputeFeature::AnonExtension),
)?;
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
handle_grants(spec, &mut client, self.connstr.as_str())?;
handle_extensions(spec, &mut client)?;
handle_extension_neon(&mut client)?;
create_availability_check_data(&mut client)?;
@@ -821,11 +743,6 @@ impl ComputeNode {
// 'Close' connection
drop(client);
// Run migrations separately to not hold up cold starts
thread::spawn(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client)
});
Ok(())
}
@@ -887,18 +804,9 @@ impl ComputeNode {
handle_roles(&spec, &mut client)?;
handle_databases(&spec, &mut client)?;
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
handle_grants(
&spec,
&mut client,
self.connstr.as_str(),
self.has_feature(ComputeFeature::AnonExtension),
)?;
handle_grants(&spec, &mut client, self.connstr.as_str())?;
handle_extensions(&spec, &mut client)?;
handle_extension_neon(&mut client)?;
// We can skip handle_migrations here because a new migration can only appear
// if we have a new version of the compute_ctl binary, which can only happen
// if compute got restarted, in which case we'll end up inside of apply_config
// instead of reconfigure.
}
// 'Close' connection

View File

@@ -51,9 +51,6 @@ pub fn write_postgres_conf(
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if !spec.safekeeper_connstrings.is_empty() {
writeln!(
file,

View File

@@ -138,34 +138,6 @@ fn watch_compute_activity(compute: &ComputeNode) {
}
}
//
// Don't suspend compute if there is an active logical replication subscription
//
// `where pid is not null` to filter out read only computes and subscription on branches
//
let logical_subscriptions_query =
"select count(*) from pg_stat_subscription where pid is not null;";
match cli.query_one(logical_subscriptions_query, &[]) {
Ok(row) => match row.try_get::<&str, i64>("count") {
Ok(num_subscribers) => {
if num_subscribers > 0 {
compute.update_last_active(Some(Utc::now()));
continue;
}
}
Err(e) => {
warn!("failed to parse `pg_stat_subscription` count: {:?}", e);
continue;
}
},
Err(e) => {
warn!(
"failed to get list of active logical replication subscriptions: {:?}",
e
);
continue;
}
}
//
// Do not suspend compute if autovacuum is running
//
let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";

View File

@@ -264,10 +264,9 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
// case we miss some events for some reason. Not strictly necessary, but
// better safe than sorry.
let (tx, rx) = std::sync::mpsc::channel();
let watcher_res = notify::recommended_watcher(move |res| {
let (mut watcher, rx): (Box<dyn Watcher>, _) = match notify::recommended_watcher(move |res| {
let _ = tx.send(res);
});
let (mut watcher, rx): (Box<dyn Watcher>, _) = match watcher_res {
}) {
Ok(watcher) => (Box::new(watcher), rx),
Err(e) => {
match e.kind {

View File

@@ -581,12 +581,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
/// to allow users to create trusted extensions and re-create the `public` schema, for example.
#[instrument(skip_all)]
pub fn handle_grants(
spec: &ComputeSpec,
client: &mut Client,
connstr: &str,
enable_anon_extension: bool,
) -> Result<()> {
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> {
info!("modifying database permissions");
let existing_dbs = get_existing_dbs(client)?;
@@ -683,11 +678,6 @@ pub fn handle_grants(
inlinify(&grant_query)
);
db_client.simple_query(&grant_query)?;
// it is important to run this after all grants
if enable_anon_extension {
handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
}
}
Ok(())
@@ -737,208 +727,3 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
Ok(())
}
#[instrument(skip_all)]
pub fn handle_migrations(client: &mut Client) -> Result<()> {
info!("handle migrations");
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
let migrations = [
"ALTER ROLE neon_superuser BYPASSRLS",
r#"
DO $$
DECLARE
role_name text;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
END LOOP;
FOR role_name IN SELECT rolname FROM pg_roles
WHERE
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
END LOOP;
END $$;
"#,
r#"
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
END IF;
END
$$;"#,
"GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
];
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
query = "SELECT id FROM neon_migration.migration_id";
let row = client.query_one(query, &[])?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration;
query = "BEGIN";
client.simple_query(query)?;
while current_migration < migrations.len() {
info!("Running migration:\n{}\n", migrations[current_migration]);
client.simple_query(migrations[current_migration])?;
current_migration += 1;
}
let setval = format!(
"UPDATE neon_migration.migration_id SET id={}",
migrations.len()
);
client.simple_query(&setval)?;
query = "COMMIT";
client.simple_query(query)?;
info!(
"Ran {} migrations",
(migrations.len() - starting_migration_id)
);
Ok(())
}
/// Connect to the database as superuser and pre-create anon extension
/// if it is present in shared_preload_libraries
#[instrument(skip_all)]
pub fn handle_extension_anon(
spec: &ComputeSpec,
db_owner: &str,
db_client: &mut Client,
grants_only: bool,
) -> Result<()> {
info!("handle extension anon");
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
if libs.contains("anon") {
if !grants_only {
// check if extension is already initialized using anon.is_initialized()
let query = "SELECT anon.is_initialized()";
match db_client.query(query, &[]) {
Ok(rows) => {
if !rows.is_empty() {
let is_initialized: bool = rows[0].get(0);
if is_initialized {
info!("anon extension is already initialized");
return Ok(());
}
}
}
Err(e) => {
warn!(
"anon extension is_installed check failed with expected error: {}",
e
);
}
};
// Create anon extension if this compute needs it
// Users cannot create it themselves, because superuser is required.
let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE";
info!("creating anon extension with query: {}", query);
match db_client.query(query, &[]) {
Ok(_) => {}
Err(e) => {
error!("anon extension creation failed with error: {}", e);
return Ok(());
}
}
// check that extension is installed
query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
let rows = db_client.query(query, &[])?;
if rows.is_empty() {
error!("anon extension is not installed");
return Ok(());
}
// Initialize anon extension
// This also requires superuser privileges, so users cannot do it themselves.
query = "SELECT anon.init()";
match db_client.query(query, &[]) {
Ok(_) => {}
Err(e) => {
error!("anon.init() failed with error: {}", e);
return Ok(());
}
}
}
// check that extension is installed, if not bail early
let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
match db_client.query(query, &[]) {
Ok(rows) => {
if rows.is_empty() {
error!("anon extension is not installed");
return Ok(());
}
}
Err(e) => {
error!("anon extension check failed with error: {}", e);
return Ok(());
}
};
let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query)?;
// Grant permissions to db_owner to use anon extension functions
let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query)?;
// This is needed, because some functions are defined as SECURITY DEFINER.
// In Postgres SECURITY DEFINER functions are executed with the privileges
// of the owner.
// In the anon extension this is needed to access some GUCs, which are only accessible to
// superuser. But we've patched postgres to allow db_owner to access them as well.
// So we need to change owner of these functions to db_owner.
let query = format!("
SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};'
from pg_proc p
join pg_namespace nsp ON p.pronamespace = nsp.oid
where nsp.nspname = 'anon';", db_owner);
info!("change anon extension functions owner to db owner");
db_client.simple_query(&query)?;
// affects views as well
let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query)?;
let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner);
info!("granting anon extension permissions with query: {}", query);
db_client.simple_query(&query)?;
}
}
Ok(())
}

View File

@@ -19,7 +19,6 @@ hex.workspace = true
hyper.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true

View File

@@ -4,25 +4,16 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
# Enables test-only APIs and behaviors
testing = []
[dependencies]
anyhow.workspace = true
aws-config.workspace = true
aws-sdk-secretsmanager.workspace = true
camino.workspace = true
clap.workspace = true
futures.workspace = true
git-version.workspace = true
hyper.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
@@ -30,9 +21,9 @@ tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" }
# TODO: remove this after DB persistence is added, it is only used for
# a parsing function when loading pageservers from neon_local LocalEnv
postgres_backend.workspace = true
utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }

View File

@@ -1,6 +0,0 @@
-- This file was automatically created by Diesel to setup helper functions
-- and other internal bookkeeping. This file is safe to edit, any future
-- changes will be added to existing projects as new migrations.
DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
DROP FUNCTION IF EXISTS diesel_set_updated_at();

View File

@@ -1,36 +0,0 @@
-- This file was automatically created by Diesel to setup helper functions
-- and other internal bookkeeping. This file is safe to edit, any future
-- changes will be added to existing projects as new migrations.
-- Sets up a trigger for the given table to automatically set a column called
-- `updated_at` whenever the row is modified (unless `updated_at` was included
-- in the modified columns)
--
-- # Example
--
-- ```sql
-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
--
-- SELECT diesel_manage_updated_at('users');
-- ```
CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
BEGIN
EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
BEGIN
IF (
NEW IS DISTINCT FROM OLD AND
NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
) THEN
NEW.updated_at := current_timestamp;
END IF;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;

View File

@@ -1,13 +0,0 @@
CREATE TABLE tenant_shards (
tenant_id VARCHAR NOT NULL,
shard_number INTEGER NOT NULL,
shard_count INTEGER NOT NULL,
PRIMARY KEY(tenant_id, shard_number, shard_count),
shard_stripe_size INTEGER NOT NULL,
generation INTEGER NOT NULL,
generation_pageserver BIGINT NOT NULL,
placement_policy VARCHAR NOT NULL,
splitting SMALLINT NOT NULL,
-- config is JSON encoded, opaque to the database.
config TEXT NOT NULL
);

View File

@@ -1,10 +0,0 @@
CREATE TABLE nodes (
node_id BIGINT PRIMARY KEY NOT NULL,
scheduling_policy VARCHAR NOT NULL,
listen_http_addr VARCHAR NOT NULL,
listen_http_port INTEGER NOT NULL,
listen_pg_addr VARCHAR NOT NULL,
listen_pg_port INTEGER NOT NULL
);

View File

@@ -1,104 +1,72 @@
use std::{collections::HashMap, time::Duration};
use std::collections::HashMap;
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::LocalEnv;
use hyper::{Method, StatusCode};
use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
use pageserver_api::shard::{ShardCount, ShardIndex, TenantShardId};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use utils::{
backoff::{self},
id::{NodeId, TenantId},
};
use crate::service::Config;
const BUSY_DELAY: Duration = Duration::from_secs(1);
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
pub(crate) const API_CONCURRENCY: usize = 32;
use utils::id::{NodeId, TenantId};
pub(super) struct ComputeHookTenant {
shards: Vec<(ShardIndex, NodeId)>,
}
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequestShard {
node_id: NodeId,
shard_number: ShardNumber,
}
/// Request body that we send to the control plane to notify it of where a tenant is attached
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
shards: Vec<ComputeHookNotifyRequestShard>,
}
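// Illustrative sketch, not part of the diff above: how the request body defined
// above is assembled, pairing each shard number with the pageserver node it is
// attached to. The helper name and the `attachments` slice are assumptions made
// purely for the example.
fn example_notify_request(
    tenant_id: TenantId,
    attachments: &[(ShardNumber, NodeId)],
) -> ComputeHookNotifyRequest {
    ComputeHookNotifyRequest {
        tenant_id,
        shards: attachments
            .iter()
            .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard {
                shard_number: *shard_number,
                node_id: *node_id,
            })
            .collect(),
    }
}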
/// Error type for attempts to call into the control plane compute notification hook
#[derive(thiserror::Error, Debug)]
pub(crate) enum NotifyError {
// Request was not sent successfully, e.g. transport error
#[error("Sending request: {0}")]
Request(#[from] reqwest::Error),
// Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon.
#[error("Control plane tenant busy")]
Busy,
// Explicit 429 response asking us to retry less frequently
#[error("Control plane overloaded")]
SlowDown,
// A 503 response indicates the control plane can't handle the request right now
#[error("Control plane unavailable (status {0})")]
Unavailable(StatusCode),
// API returned unexpected non-success status. We will retry, but log a warning.
#[error("Control plane returned unexpected status {0}")]
Unexpected(StatusCode),
// We shutdown while sending
#[error("Shutting down")]
ShuttingDown,
// A response indicates we will never succeed, such as 400 or 404
#[error("Non-retryable error {0}")]
Fatal(StatusCode),
}
impl ComputeHookTenant {
async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
pub(super) async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> anyhow::Result<()> {
// Find the highest shard count and drop any shards that aren't
// for that shard count.
let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
let Some(shard_count) = shard_count else {
// No shards, nothing to do.
tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
return None;
return Ok(());
};
self.shards.retain(|(k, _v)| k.shard_count == shard_count);
self.shards
.sort_by_key(|(shard, _node_id)| shard.shard_number);
if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
// We have pageservers for all the shards: emit a configuration update
return Some(ComputeHookNotifyRequest {
tenant_id,
shards: self
.shards
.iter()
.map(|(shard, node_id)| ComputeHookNotifyRequestShard {
shard_number: shard.shard_number,
node_id: *node_id,
})
.collect(),
});
if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
// We have pageservers for all the shards: proceed to reconfigure compute
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
tracing::warn!(
"Couldn't load neon_local config, skipping compute update ({e})"
);
return Ok(());
}
};
let cplane = ComputeControlPlane::load(env.clone())
.expect("Error loading compute control plane");
let compute_pageservers = self
.shards
.iter()
.map(|(_shard, node_id)| {
let ps_conf = env
.get_pageserver_conf(*node_id)
.expect("Unknown pageserver");
let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
.expect("Unable to parse listen_pg_addr");
(pg_host, pg_port.unwrap_or(5432))
})
.collect::<Vec<_>>();
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id && endpoint.status() == "running" {
tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
endpoint.reconfigure(compute_pageservers.clone()).await?;
}
}
} else {
tracing::info!(
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
self.shards.len(),
shard_count.count()
shard_count.0
);
}
None
Ok(())
}
}
@@ -106,173 +74,22 @@ impl ComputeHookTenant {
/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures
/// the compute connection string.
pub(super) struct ComputeHook {
config: Config,
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
authorization_header: Option<String>,
}
impl ComputeHook {
pub(super) fn new(config: Config) -> Self {
let authorization_header = config
.control_plane_jwt_token
.clone()
.map(|jwt| format!("Bearer {}", jwt));
pub(super) fn new() -> Self {
Self {
state: Default::default(),
config,
authorization_header,
}
}
/// For test environments: use neon_local's LocalEnv to update compute
async fn do_notify_local(
&self,
reconfigure_request: ComputeHookNotifyRequest,
) -> anyhow::Result<()> {
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})");
return Ok(());
}
};
let cplane =
ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
let ComputeHookNotifyRequest { tenant_id, shards } = reconfigure_request;
let compute_pageservers = shards
.into_iter()
.map(|shard| {
let ps_conf = env
.get_pageserver_conf(shard.node_id)
.expect("Unknown pageserver");
let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
.expect("Unable to parse listen_pg_addr");
(pg_host, pg_port.unwrap_or(5432))
})
.collect::<Vec<_>>();
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint.reconfigure(compute_pageservers.clone()).await?;
}
}
Ok(())
}
async fn do_notify_iteration(
&self,
client: &reqwest::Client,
url: &String,
reconfigure_request: &ComputeHookNotifyRequest,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let req = client.request(Method::PUT, url);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
tracing::info!(
"Sending notify request to {} ({:?})",
url,
reconfigure_request
);
let send_result = req.json(&reconfigure_request).send().await;
let response = match send_result {
Ok(r) => r,
Err(e) => return Err(e.into()),
};
// Treat all 2xx responses as success
if response.status() >= StatusCode::OK && response.status() < StatusCode::MULTIPLE_CHOICES {
if response.status() != StatusCode::OK {
// Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so
// log a warning.
tracing::warn!(
"Unexpected 2xx response code {} from control plane",
response.status()
);
}
return Ok(());
}
// Error response codes
match response.status() {
StatusCode::TOO_MANY_REQUESTS => {
// TODO: 429 handling should be global: set some state visible to other requests
// so that they will delay before starting, rather than all notifications trying
// once before backing off.
tokio::time::timeout(SLOWDOWN_DELAY, cancel.cancelled())
.await
.ok();
Err(NotifyError::SlowDown)
}
StatusCode::LOCKED => {
// Delay our retry if busy: the usual fast exponential backoff in backoff::retry
// is not appropriate
tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
.await
.ok();
Err(NotifyError::Busy)
}
StatusCode::SERVICE_UNAVAILABLE
| StatusCode::GATEWAY_TIMEOUT
| StatusCode::BAD_GATEWAY => Err(NotifyError::Unavailable(response.status())),
StatusCode::BAD_REQUEST | StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => {
Err(NotifyError::Fatal(response.status()))
}
_ => Err(NotifyError::Unexpected(response.status())),
}
}
async fn do_notify(
&self,
url: &String,
reconfigure_request: ComputeHookNotifyRequest,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let client = reqwest::Client::new();
backoff::retry(
|| self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
|e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
3,
10,
"Send compute notification",
cancel,
)
.await
.ok_or_else(|| NotifyError::ShuttingDown)
.and_then(|x| x)
}
/// Call this to notify the compute (postgres) tier of new pageservers to use
/// for a tenant. notify() is called by each shard individually, and this function
/// will decide whether an update to the tenant is sent. An update is sent on the
/// condition that:
/// - We know a pageserver for every shard.
/// - All the shards have the same shard_count (i.e. we are not mid-split)
///
/// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler
/// that is cancelled.
///
/// This function is fallible, including in the case that the control plane is transiently
/// unavailable. A limited number of retries are done internally to efficiently hide short unavailability
/// periods, but we don't retry forever. The **caller** is responsible for handling failures and
/// ensuring that they eventually call again to ensure that the compute is eventually notified of
/// the proper pageserver nodes for a tenant.
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
pub(super) async fn notify(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
) -> anyhow::Result<()> {
tracing::info!("ComputeHook::notify: {}->{}", tenant_shard_id, node_id);
let mut locked = self.state.lock().await;
let entry = locked
.entry(tenant_shard_id.tenant_id)
@@ -294,25 +111,6 @@ impl ComputeHook {
entry.shards.push((shard_index, node_id));
}
let reconfigure_request = entry.maybe_reconfigure(tenant_shard_id.tenant_id).await;
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
tracing::info!("Tenant isn't yet ready to emit a notification");
return Ok(());
};
if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, reconfigure_request, cancel)
.await
} else {
self.do_notify_local(reconfigure_request)
.await
.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
}
entry.maybe_reconfigure(tenant_shard_id.tenant_id).await
}
}
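// Illustrative sketch, not part of the diff above: one way a caller could honour the
// notify() contract described in its doc comment. Transient control-plane conditions
// (Busy, SlowDown, Unavailable) mean "call notify() again for this shard later", while
// other errors are surfaced as-is. The function name and the `requeue` callback are
// assumptions made purely for the example.
async fn notify_and_requeue(
    hook: &ComputeHook,
    tenant_shard_id: TenantShardId,
    node_id: NodeId,
    cancel: &CancellationToken,
    requeue: impl FnOnce(),
) -> Result<(), NotifyError> {
    match hook.notify(tenant_shard_id, node_id, cancel).await {
        Ok(()) => Ok(()),
        Err(e @ (NotifyError::Busy | NotifyError::SlowDown | NotifyError::Unavailable(_))) => {
            // Retryable: the control plane asked us to back off or is briefly unavailable.
            tracing::info!("compute notification deferred: {e}");
            requeue();
            Err(e)
        }
        // Fatal or unexpected: retrying the same request will not help.
        Err(e) => Err(e),
    }
}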

View File

@@ -1,19 +1,14 @@
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use crate::service::Service;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TimelineCreateRequest,
};
use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use std::sync::Arc;
use std::time::{Duration, Instant};
use utils::auth::SwappableJwtAuth;
use utils::http::endpoint::{auth_middleware, request_span};
use utils::http::request::parse_request_param;
use utils::id::{TenantId, TimelineId};
use utils::id::TenantId;
use utils::{
http::{
@@ -42,7 +37,7 @@ pub struct HttpState {
impl HttpState {
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
let allowlist_routes = ["/status", "/ready", "/metrics"]
let allowlist_routes = ["/status"]
.iter()
.map(|v| v.parse().unwrap())
.collect::<Vec<_>>();
@@ -109,163 +104,34 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
json_response(StatusCode::OK, state.service.inspect(inspect_req))
}
async fn handle_tenant_create(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
}
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids
// needing to track a "deleting" state for tenants.
async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
where
R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
{
let started_at = Instant::now();
// To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
// completed.
let mut retry_period = Duration::from_secs(1);
// On subsequent retries, wait longer.
let max_retry_period = Duration::from_secs(5);
// Enable callers with a 30 second request timeout to reliably get a response
let max_wait = Duration::from_secs(25);
loop {
let status = f(service.clone()).await?;
match status {
StatusCode::ACCEPTED => {
tracing::info!("Deletion accepted, waiting to try again...");
tokio::time::sleep(retry_period).await;
retry_period = max_retry_period;
}
StatusCode::NOT_FOUND => {
tracing::info!("Deletion complete");
return json_response(StatusCode::OK, ());
}
_ => {
tracing::warn!("Unexpected status {status}");
return json_response(status, ());
}
}
let now = Instant::now();
if now + retry_period > started_at + max_wait {
tracing::info!("Deletion timed out waiting for 404");
// REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
// the pageserver's swagger definition for this endpoint, and has the same desired
// effect of causing the control plane to retry later.
return json_response(StatusCode::CONFLICT, ());
}
}
}
async fn handle_tenant_location_config(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
service
.tenant_location_config(tenant_id, config_req)
.await?,
state.service.tenant_create(create_req).await?,
)
}
async fn handle_tenant_delete(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
deletion_wrapper(service, move |service| async move {
service.tenant_delete(tenant_id).await
})
.await
}
async fn handle_tenant_timeline_create(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
service
state
.service
.tenant_timeline_create(tenant_id, create_req)
.await?,
)
}
async fn handle_tenant_timeline_delete(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
let state = get_state(&req);
deletion_wrapper(service, move |service| async move {
service.tenant_timeline_delete(tenant_id, timeline_id).await
})
.await
}
async fn handle_tenant_timeline_passthrough(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let Some(path) = req.uri().path_and_query() else {
// This should never happen, our request router only calls us if there is a path
return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
};
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
// Find the node that holds shard zero
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
// Callers will always pass an unsharded tenant ID. Before proxying, we must
// rewrite this to a shard-aware shard zero ID.
let path = format!("{}", path);
let tenant_str = tenant_id.to_string();
let tenant_shard_str = format!("{}", tenant_shard_id);
let path = path.replace(&tenant_str, &tenant_shard_str);
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
let resp = client.get_raw(path).await.map_err(|_e|
// FIXME: give ApiError a proper Unavailable variant. We return 503 here because
// if we can't successfully send a request to the pageserver, we aren't available.
ApiError::ShuttingDown)?;
// We have a reqwest::Response, but would like an http::Response
let mut builder = hyper::Response::builder()
.status(resp.status())
.version(resp.version());
for (k, v) in resp.headers() {
builder = builder.header(k, v);
}
let response = builder
.body(Body::wrap_stream(resp.bytes_stream()))
.map_err(|e| ApiError::InternalServerError(e.into()))?;
Ok(response)
}
async fn handle_tenant_locate(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -275,17 +141,6 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, ())
}
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let state = get_state(&req);
json_response(StatusCode::OK, state.service.node_list().await?)
}
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
}
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let node_id: NodeId = parse_request_param(&req, "node_id")?;
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
@@ -299,107 +154,30 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
}
async fn handle_tenant_shard_split(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
json_response(
StatusCode::OK,
service.tenant_shard_split(tenant_id, split_req).await?,
)
}
async fn handle_tenant_shard_migrate(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
service
state
.service
.tenant_shard_migrate(tenant_shard_id, migrate_req)
.await?,
)
}
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
}
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let state = get_state(&req);
state.service.tenants_dump()
}
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let state = get_state(&req);
state.service.scheduler_dump()
}
async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let state = get_state(&req);
json_response(StatusCode::OK, state.service.consistency_check().await?)
}
/// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, ())
}
/// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling
/// with remote pageserver nodes). This is intended for use as a kubernetes readiness probe.
async fn handle_ready(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let state = get_state(&req);
if state.service.startup_complete.is_ready() {
json_response(StatusCode::OK, ())
} else {
json_response(StatusCode::SERVICE_UNAVAILABLE, ())
}
}
impl From<ReconcileError> for ApiError {
fn from(value: ReconcileError) -> Self {
ApiError::Conflict(format!("Reconciliation error: {}", value))
}
}
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
/// be allowed to run if Service has finished its initial reconciliation.
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
where
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
{
let state = get_state(&request);
let service = state.service.clone();
let startup_complete = service.startup_complete.clone();
if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
.await
.is_err()
{
// This shouldn't happen: it is the responsibility of [`Service::startup_reconcile`] to use appropriate
// timeouts around its remote calls, to bound its runtime.
return Err(ApiError::Timeout(
"Timed out waiting for service readiness".into(),
));
}
request_span(
request,
|request| async move { handler(service, request).await },
)
.await
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
@@ -418,76 +196,23 @@ pub fn make_router(
router
.data(Arc::new(HttpState::new(service, auth)))
// Non-prefixed generic endpoints (status, metrics)
.get("/status", |r| request_span(r, handle_status))
.get("/ready", |r| request_span(r, handle_ready))
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
.post("/upcall/v1/re-attach", |r| {
request_span(r, handle_re_attach)
})
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
// Test/dev/debug endpoints
.post("/debug/v1/attach-hook", |r| {
request_span(r, handle_attach_hook)
})
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
.post("/debug/v1/tenant/:tenant_id/drop", |r| {
request_span(r, handle_tenant_drop)
})
.post("/debug/v1/node/:node_id/drop", |r| {
request_span(r, handle_node_drop)
})
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
.get("/debug/v1/scheduler", |r| {
request_span(r, handle_scheduler_dump)
})
.post("/debug/v1/consistency_check", |r| {
request_span(r, handle_consistency_check)
})
.get("/control/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
})
// Node operations
.post("/control/v1/node", |r| {
request_span(r, handle_node_register)
})
.get("/control/v1/node", |r| request_span(r, handle_node_list))
.put("/control/v1/node/:node_id/config", |r| {
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
.post("/inspect", |r| request_span(r, handle_inspect))
.post("/node", |r| request_span(r, handle_node_register))
.put("/node/:node_id/config", |r| {
request_span(r, handle_node_configure)
})
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(r, handle_tenant_shard_migrate)
.post("/tenant", |r| request_span(r, handle_tenant_create))
.post("/tenant/:tenant_id/timeline", |r| {
request_span(r, handle_tenant_timeline_create)
})
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
tenant_service_handler(r, handle_tenant_shard_split)
.get("/tenant/:tenant_id/locate", |r| {
request_span(r, handle_tenant_locate)
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
.post("/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_create)
})
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/:tenant_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)
})
.post("/v1/tenant/:tenant_id/timeline", |r| {
tenant_service_handler(r, handle_tenant_timeline_create)
})
// Tenant detail GET passthrough to shard zero
.get("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_passthrough)
})
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
// timeline GET APIs will be implicitly included.
.get("/v1/tenant/:tenant_id/timeline*", |r| {
tenant_service_handler(r, handle_tenant_timeline_passthrough)
.put("/tenant/:tenant_shard_id/migrate", |r| {
request_span(r, handle_tenant_shard_migrate)
})
}

View File

@@ -3,27 +3,23 @@ use utils::seqwait::MonotonicCounter;
mod compute_hook;
pub mod http;
pub mod metrics;
mod node;
pub mod persistence;
mod reconciler;
mod scheduler;
mod schema;
pub mod service;
mod tenant_state;
#[derive(Clone, Serialize, Deserialize, Debug)]
#[derive(Clone, Serialize, Deserialize)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Do not attach to any pageservers
Detached,
}
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
struct Sequence(u64);
impl Sequence {
@@ -38,12 +34,6 @@ impl std::fmt::Display for Sequence {
}
}
impl std::fmt::Debug for Sequence {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl MonotonicCounter<Sequence> for Sequence {
fn cnt_advance(&mut self, v: Sequence) {
assert!(*self <= v);

View File

@@ -4,30 +4,23 @@
/// This enables running & testing pageservers without a full-blown
/// deployment of the Neon cloud platform.
///
use anyhow::{anyhow, Context};
use anyhow::anyhow;
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use aws_config::{self, BehaviorVersion, Region};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use std::sync::Arc;
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};
use utils::{project_build_tag, project_git_version, tcp_listener};
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
use diesel_migrations::{embed_migrations, EmbeddedMigrations};
pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
@@ -36,168 +29,21 @@ struct Cli {
#[arg(short, long)]
listen: std::net::SocketAddr,
/// Public key for JWT authentication of clients
/// Path to public key for JWT authentication of clients
#[arg(long)]
public_key: Option<String>,
public_key: Option<camino::Utf8PathBuf>,
/// Token for authenticating this service with the pageservers it controls
#[arg(long)]
#[arg(short, long)]
jwt_token: Option<String>,
/// Token for authenticating this service with the control plane, when calling
/// the compute notification endpoint
#[arg(long)]
control_plane_jwt_token: Option<String>,
/// URL to control plane compute notification endpoint
#[arg(long)]
compute_hook_url: Option<String>,
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
#[arg(long)]
database_url: Option<String>,
path: Utf8PathBuf,
}
/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
/// type encapsulates the logic to decide which and do the loading.
struct Secrets {
database_url: String,
public_key: Option<JwtAuth>,
jwt_token: Option<String>,
control_plane_jwt_token: Option<String>,
}
impl Secrets {
const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
"neon-storage-controller-pageserver-jwt-token";
const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
"neon-storage-controller-control-plane-jwt-token";
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
async fn load(args: &Cli) -> anyhow::Result<Self> {
match &args.database_url {
Some(url) => Self::load_cli(url, args),
None => Self::load_aws_sm().await,
}
}
async fn load_aws_sm() -> anyhow::Result<Self> {
let Ok(region) = std::env::var("AWS_REGION") else {
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
};
let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
.region(Region::new(region.clone()))
.load()
.await;
let asm = aws_sdk_secretsmanager::Client::new(&config);
let Some(database_url) = asm
.get_secret_value()
.secret_id(Self::DATABASE_URL_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string)
else {
anyhow::bail!(
"Database URL secret not found at {region}/{}",
Self::DATABASE_URL_SECRET
)
};
let jwt_token = asm
.get_secret_value()
.secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string);
if jwt_token.is_none() {
tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
}
let control_plane_jwt_token = asm
.get_secret_value()
.secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string);
if control_plane_jwt_token.is_none() {
tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver");
}
let public_key = asm
.get_secret_value()
.secret_id(Self::PUBLIC_KEY_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string);
let public_key = match public_key {
Some(key) => Some(JwtAuth::from_key(key)?),
None => {
tracing::warn!(
"No public key set: inccoming HTTP requests will not be authenticated"
);
None
}
};
Ok(Self {
database_url,
public_key,
jwt_token,
control_plane_jwt_token,
})
}
fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
let public_key = match &args.public_key {
None => None,
Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
};
Ok(Self {
database_url: database_url.to_owned(),
public_key,
jwt_token: args.jwt_token.clone(),
control_plane_jwt_token: args.control_plane_jwt_token.clone(),
})
}
}
/// Execute the diesel migrations that are built into this binary
async fn migration_run(database_url: &str) -> anyhow::Result<()> {
use diesel::PgConnection;
use diesel_migrations::{HarnessWithOutput, MigrationHarness};
let mut conn = PgConnection::establish(database_url)?;
HarnessWithOutput::write_to_stdout(&mut conn)
.run_pending_migrations(MIGRATIONS)
.map(|_| ())
.map_err(|e| anyhow::anyhow!(e))?;
Ok(())
}
fn main() -> anyhow::Result<()> {
tokio::runtime::Builder::new_current_thread()
// We use spawn_blocking for database operations, so require approximately
// as many blocking threads as we will open database connections.
.max_blocking_threads(Persistence::MAX_CONNECTIONS as usize)
.enable_all()
.build()
.unwrap()
.block_on(async_main())
}
async fn async_main() -> anyhow::Result<()> {
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
logging::init(
@@ -206,88 +52,49 @@ async fn async_main() -> anyhow::Result<()> {
logging::Output::Stdout,
)?;
preinitialize_metrics();
let args = Cli::parse();
tracing::info!(
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
GIT_VERSION,
launch_ts.to_string(),
BUILD_TAG,
args.path.as_ref().unwrap_or(&Utf8PathBuf::from("<none>")),
args.path,
args.listen
);
let secrets = Secrets::load(&args).await?;
let config = Config {
jwt_token: secrets.jwt_token,
control_plane_jwt_token: secrets.control_plane_jwt_token,
compute_hook_url: args.compute_hook_url,
jwt_token: args.jwt_token,
};
// After loading secrets & config, but before starting anything else, apply database migrations
migration_run(&secrets.database_url)
.await
.context("Running database migrations")?;
let persistence = Arc::new(Persistence::new(&args.path).await);
let json_path = args.path;
let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone()));
let service = Service::spawn(config, persistence.clone()).await?;
let service = Service::spawn(config, persistence).await?;
let http_listener = tcp_listener::bind(args.listen)?;
let auth = secrets
.public_key
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
let router = make_router(service.clone(), auth)
let auth = if let Some(public_key_path) = &args.public_key {
let jwt_auth = JwtAuth::from_key_path(public_key_path)?;
Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
} else {
None
};
let router = make_router(service, auth)
.build()
.map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap();
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
// Start HTTP server
let server_shutdown = CancellationToken::new();
let server = hyper::Server::from_tcp(http_listener)?
.serve(router_service)
.with_graceful_shutdown({
let server_shutdown = server_shutdown.clone();
async move {
server_shutdown.cancelled().await;
}
});
tracing::info!("Serving on {0}", args.listen);
let server_task = tokio::task::spawn(server);
// Wait until we receive a signal
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?;
tokio::select! {
_ = sigint.recv() => {},
_ = sigterm.recv() => {},
_ = sigquit.recv() => {},
}
tracing::info!("Terminating on signal");
tokio::task::spawn(server);
if json_path.is_some() {
// Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
// full postgres dumps around.
if let Err(e) = persistence.write_tenants_json().await {
tracing::error!("Failed to write JSON on shutdown: {e}")
ShutdownSignals::handle(|signal| match signal {
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
tracing::info!("Got {}. Terminating", signal.name());
// We're just a test helper: no graceful shutdown.
std::process::exit(0);
}
}
})?;
// Stop HTTP server first, so that we don't have to service requests
// while shutting down Service
server_shutdown.cancel();
if let Err(e) = server_task.await {
tracing::error!("Error joining HTTP server task: {e}")
}
tracing::info!("Joined HTTP server task");
service.shutdown().await;
tracing::info!("Service shutdown complete");
std::process::exit(0);
Ok(())
}

View File

@@ -1,32 +0,0 @@
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
pub(crate) struct ReconcilerMetrics {
pub(crate) spawned: IntCounter,
pub(crate) complete: IntCounterVec,
}
impl ReconcilerMetrics {
// Labels used on [`Self::complete`]
pub(crate) const SUCCESS: &'static str = "ok";
pub(crate) const ERROR: &'static str = "error";
pub(crate) const CANCEL: &'static str = "cancel";
}
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
spawned: register_int_counter!(
"storage_controller_reconcile_spawn",
"Count of how many times we spawn a reconcile task",
)
.expect("failed to define a metric"),
complete: register_int_counter_vec!(
"storage_controller_reconcile_complete",
"Reconciler tasks completed, broken down by success/failure/cancelled",
&["status"],
)
.expect("failed to define a metric"),
});
pub fn preinitialize_metrics() {
Lazy::force(&RECONCILER);
}
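For context, a minimal sketch of how the counters defined above would typically be bumped from a reconcile task; these call sites are illustrative assumptions, not part of this diff:

// Sketch only: hypothetical call sites for the counters defined above.
fn on_reconcile_spawned() {
    RECONCILER.spawned.inc();
}
fn on_reconcile_finished(succeeded: bool) {
    // Record the outcome under the matching label.
    let status = if succeeded {
        ReconcilerMetrics::SUCCESS
    } else {
        ReconcilerMetrics::ERROR
    };
    RECONCILER.complete.with_label_values(&[status]).inc();
}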

View File

@@ -1,16 +1,7 @@
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use serde::Serialize;
use utils::id::NodeId;
use crate::persistence::NodePersistence;
/// Represents the in-memory description of a Node.
///
/// Scheduling statistics are maintained separately in [`crate::scheduler`].
///
/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
/// implementation of serialization on this type is only for debug dumps.
#[derive(Clone, Serialize, Eq, PartialEq)]
#[derive(Clone)]
pub(crate) struct Node {
pub(crate) id: NodeId,
@@ -43,15 +34,4 @@ impl Node {
NodeSchedulingPolicy::Pause => false,
}
}
pub(crate) fn to_persistent(&self) -> NodePersistence {
NodePersistence {
node_id: self.id.0 as i64,
scheduling_policy: self.scheduling.into(),
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port as i32,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port as i32,
}
}
}

View File

@@ -1,329 +1,181 @@
pub(crate) mod split_state;
use std::collections::HashMap;
use std::str::FromStr;
use std::time::Duration;
use std::{collections::HashMap, str::FromStr};
use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use camino::{Utf8Path, Utf8PathBuf};
use control_plane::{
attachment_service::{NodeAvailability, NodeSchedulingPolicy},
local_env::LocalEnv,
};
use pageserver_api::{
models::TenantConfig,
shard::{ShardCount, ShardNumber, TenantShardId},
};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use utils::generation::Generation;
use utils::id::{NodeId, TenantId};
use utils::{
generation::Generation,
id::{NodeId, TenantId},
};
use crate::node::Node;
use crate::PlacementPolicy;
use crate::{node::Node, PlacementPolicy};
/// ## What do we store?
///
/// The attachment service does not store most of its state durably.
///
/// The essential things to store durably are:
/// - generation numbers, as these must always advance monotonically to ensure data safety.
/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
/// - Node's scheduling policies, as the source of truth for these is something external.
///
/// Other things we store durably as an implementation detail:
/// - Node's host/port: this could be avoided if we made nodes emit a self-registering heartbeat,
/// but it is operationally simpler to make this service the authority for which nodes
/// it talks to.
///
/// ## Performance/efficiency
///
/// The attachment service does not go via the database for most things: there are
/// a couple of places where we must, and where efficiency matters:
/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
/// before it can attach a tenant, so this acts as a bound on how fast things like
/// failover can happen.
/// - Pageserver re-attach: we will increment many shards' generations when this happens,
/// so it is important to avoid e.g. issuing O(N) queries.
///
/// Database calls relating to nodes have low performance requirements, as they are very rarely
/// updated, and reads of nodes are always from memory, not the database. We only require that
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
/// Placeholder for storage. This will be replaced with a database client.
pub struct Persistence {
connection_pool: diesel::r2d2::Pool<diesel::r2d2::ConnectionManager<PgConnection>>,
// In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of
// test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
// compatible just yet.
json_path: Option<Utf8PathBuf>,
state: std::sync::Mutex<PersistentState>,
}
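As a rough illustration of the "batched increment" requirement described in the doc comment above: the per-node generation bump on /re-attach can be issued as a single UPDATE rather than O(N) per-shard queries. A minimal sketch using the diesel DSL, with schema names taken from elsewhere in this diff and connection handling elided:

// Sketch only: bump the generation of every shard attached to `node_id` in one statement.
use crate::schema::tenant_shards::dsl::*;
let rows_updated = diesel::update(tenant_shards)
    .filter(generation_pageserver.eq(node_id.0 as i64))
    .set(generation.eq(generation + 1))
    .execute(conn)?;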
/// Legacy format, for use in JSON compat objects in test environment
// Top level state available to all HTTP handlers
#[derive(Serialize, Deserialize)]
struct JsonPersistence {
struct PersistentState {
tenants: HashMap<TenantShardId, TenantShardPersistence>,
#[serde(skip)]
path: Utf8PathBuf,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum DatabaseError {
#[error(transparent)]
Query(#[from] diesel::result::Error),
#[error(transparent)]
Connection(#[from] diesel::result::ConnectionError),
#[error(transparent)]
ConnectionPool(#[from] r2d2::Error),
#[error("Logical error: {0}")]
Logical(String),
/// A convenience for serializing the state inside a sync lock, and then
/// writing it to disk outside of the lock. This will go away when switching
/// to a database backend.
struct PendingWrite {
bytes: Vec<u8>,
path: Utf8PathBuf,
}
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
impl PendingWrite {
async fn commit(&self) -> anyhow::Result<()> {
tokio::fs::write(&self.path, &self.bytes).await?;
impl Persistence {
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
pub const MAX_CONNECTIONS: u32 = 99;
Ok(())
}
}
// We don't want to keep a lot of connections alive: close them down promptly if they aren't being used.
const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10);
const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60);
pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
let manager = diesel::r2d2::ConnectionManager::<PgConnection>::new(database_url);
// We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time
// to execute queries (database queries are not generally on latency-sensitive paths).
let connection_pool = diesel::r2d2::Pool::builder()
.max_size(Self::MAX_CONNECTIONS)
.max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME))
.idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT))
// Always keep at least one connection ready to go
.min_idle(Some(1))
.test_on_check_out(true)
.build(manager)
.expect("Could not build connection pool");
Self {
connection_pool,
json_path,
impl PersistentState {
fn save(&self) -> PendingWrite {
PendingWrite {
bytes: serde_json::to_vec(self).expect("Serialization error"),
path: self.path.clone(),
}
}
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
where
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let mut conn = self.connection_pool.get()?;
tokio::task::spawn_blocking(move || -> DatabaseResult<R> { func(&mut conn) })
.await
.expect("Task panic")
}
async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
let bytes = tokio::fs::read(path).await?;
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
decoded.path = path.to_owned();
/// When a node is first registered, persist it before using it for anything
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
let np = node.to_persistent();
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::insert_into(crate::schema::nodes::table)
.values(&np)
.execute(conn)?;
Ok(())
})
.await
}
/// At startup, populate the list of nodes which our shards may be placed on
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
let nodes: Vec<Node> = self
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table
.load::<NodePersistence>(conn)?
.into_iter()
.map(|n| Node {
id: NodeId(n.node_id as u64),
// At startup we consider a node offline until proven otherwise.
availability: NodeAvailability::Offline,
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
.expect("Bad scheduling policy in DB"),
listen_http_addr: n.listen_http_addr,
listen_http_port: n.listen_http_port as u16,
listen_pg_addr: n.listen_pg_addr,
listen_pg_port: n.listen_pg_port as u16,
})
.collect::<Vec<Node>>())
})
.await?;
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
Ok(nodes)
}
/// At startup, load the high level state for shards, such as their config + policy. This will
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
let loaded = self
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
})
.await?;
if loaded.is_empty() {
if let Some(path) = &self.json_path {
if tokio::fs::try_exists(path)
.await
.map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
{
tracing::info!("Importing from legacy JSON format at {path}");
return self.list_tenant_shards_json(path).await;
}
}
}
Ok(loaded)
}
/// Shim for automated compatibility tests: load tenants from a JSON file instead of database
pub(crate) async fn list_tenant_shards_json(
&self,
path: &Utf8Path,
) -> DatabaseResult<Vec<TenantShardPersistence>> {
let bytes = tokio::fs::read(path)
.await
.map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
for (tenant_id, tenant) in &mut decoded.tenants {
// Backward compat: for an old attachments.json from before PR #6251, replace
// empty strings with proper defaults.
if tenant.tenant_id.is_empty() {
tenant.tenant_id = tenant_id.to_string();
tenant.config = serde_json::to_string(&TenantConfig::default())
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
tenant.tenant_id = format!("{}", tenant_id);
tenant.config = serde_json::to_string(&TenantConfig::default())?;
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
}
}
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
// Synchronize database with what is in the JSON file
self.insert_tenant_shards(tenants.clone()).await?;
Ok(tenants)
Ok(decoded)
}
/// For use in testing environments, where we dump out JSON on shutdown.
pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
let Some(path) = &self.json_path else {
anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
};
tracing::info!("Writing state to {path}...");
let tenants = self.list_tenant_shards().await?;
let mut tenants_map = HashMap::new();
for tsp in tenants {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
};
tenants_map.insert(tenant_shard_id, tsp);
async fn load_or_new(path: &Utf8Path) -> Self {
match Self::load(path).await {
Ok(s) => {
tracing::info!("Loaded state file at {}", path);
s
}
Err(e)
if e.downcast_ref::<std::io::Error>()
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
.unwrap_or(false) =>
{
tracing::info!("Will create state file at {}", path);
Self {
tenants: HashMap::new(),
path: path.to_owned(),
}
}
Err(e) => {
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
}
}
let json = serde_json::to_string(&JsonPersistence {
tenants: tenants_map,
})?;
}
}
tokio::fs::write(path, &json).await?;
tracing::info!("Wrote {} bytes to {path}...", json.len());
impl Persistence {
pub async fn new(path: &Utf8Path) -> Self {
let state = PersistentState::load_or_new(path).await;
Self {
state: std::sync::Mutex::new(state),
}
}
/// When registering a node, persist it so that on next start we will be able to
/// iterate over known nodes to synchronize their tenant shard states with our observed state.
pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
// TODO: node persistence will come with database backend
Ok(())
}
/// At startup, we populate the service's list of nodes, and use this list to call into
/// each node to do an initial reconciliation of the state of the world with our in-memory
/// observed state.
pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
let env = LocalEnv::load_config()?;
// TODO: node persistence will come with database backend
// XXX hack: enable test_backward_compatibility to work by populating our list of
// nodes from LocalEnv when it is not present in persistent storage. Otherwise at
// first startup in the compat test, we may have shards but no nodes.
let mut result = Vec::new();
tracing::info!(
"Loaded {} pageserver nodes from LocalEnv",
env.pageservers.len()
);
for ps_conf in env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
result.push(Node {
id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
});
}
Ok(result)
}
/// At startup, we populate our map of tenant shards from persistent storage.
pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
let locked = self.state.lock().unwrap();
Ok(locked.tenants.values().cloned().collect())
}
/// Tenants must be persisted before we schedule them for the first time. This enables us
/// to correctly retain generation monotonicity, and the externally provided placement policy & config.
pub(crate) async fn insert_tenant_shards(
&self,
shards: Vec<TenantShardPersistence>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
})?;
Ok(())
})
.await
}
) -> anyhow::Result<()> {
let write = {
let mut locked = self.state.lock().unwrap();
for shard in shards {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
shard_number: ShardNumber(shard.shard_number as u8),
shard_count: ShardCount(shard.shard_count as u8),
};
/// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
/// the tenant from memory on this server.
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
locked.tenants.insert(tenant_shard_id, shard);
}
locked.save()
};
Ok(())
})
.await
}
write.commit().await?;
pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::delete(nodes)
.filter(node_id.eq(del_node_id.0 as i64))
.execute(conn)?;
Ok(())
})
.await
}
/// When a pageserver invokes the /re-attach API, this function is responsible for doing an efficient
/// batched increment of the generations of all tenants whose generation_pageserver is equal to
/// the node that called /re-attach.
#[tracing::instrument(skip_all, fields(node_id))]
pub(crate) async fn re_attach(
&self,
node_id: NodeId,
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_conn(move |conn| {
let rows_updated = diesel::update(tenant_shards)
.filter(generation_pageserver.eq(node_id.0 as i64))
.set(generation.eq(generation + 1))
.execute(conn)?;
tracing::info!("Incremented {} tenants' generations", rows_updated);
// TODO: UPDATE+SELECT in one query
let updated = tenant_shards
.filter(generation_pageserver.eq(node_id.0 as i64))
.select(TenantShardPersistence::as_select())
.load(conn)?;
Ok(updated)
})
.await?;
let mut result = HashMap::new();
for tsp in updated {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
.map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
};
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
}
Ok(result)
Ok(())
}
/// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
@@ -332,153 +184,69 @@ impl Persistence {
pub(crate) async fn increment_generation(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
node_id: Option<NodeId>,
) -> anyhow::Result<Generation> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_conn(move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.set((
generation.eq(generation + 1),
generation_pageserver.eq(node_id.0 as i64),
))
// TODO: only returning() the generation column
.returning(TenantShardPersistence::as_returning())
.get_result(conn)?;
let (write, gen) = {
let mut locked = self.state.lock().unwrap();
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
anyhow::bail!("Tried to increment generation of unknown shard");
};
Ok(updated)
})
.await?;
// If we're called with a None pageserver, we need only update the generation
// record to disassociate it from this pageserver, not actually increment the number, as
// the increment is guaranteed to happen the next time this tenant is attached.
if node_id.is_some() {
shard.generation += 1;
}
Ok(Generation::new(updated.generation as u32))
shard.generation_pageserver = node_id;
let gen = Generation::new(shard.generation);
(locked.save(), gen)
};
write.commit().await?;
Ok(gen)
}
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.set((
generation_pageserver.eq(i64::MAX),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
))
.execute(conn)?;
Ok(updated)
})
.await?;
Ok(())
}
// When we start shard splitting, we must durably mark the tenant so that
// on restart, we know that we must go through recovery.
//
// We create the child shards here, so that they will be available for increment_generation calls
// if some pageserver holding a child shard needs to restart before the overall tenant split is complete.
pub(crate) async fn begin_shard_split(
pub(crate) async fn re_attach(
&self,
old_shard_count: ShardCount,
split_tenant_id: TenantId,
parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> DatabaseResult<()> {
// Mark parent shards as splitting
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.set((splitting.eq(1),))
.execute(conn)?;
if u8::try_from(updated)
.map_err(|_| DatabaseError::Logical(
format!("Overflow existing shard count {} while splitting", updated))
)? != old_shard_count.count() {
// Perhaps a deletion or another split raced with this attempt to split, mutating
// the parent shards that we intend to split. In this case the split request should fail.
return Err(DatabaseError::Logical(
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
));
node_id: NodeId,
) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
let (write, result) = {
let mut result = HashMap::new();
let mut locked = self.state.lock().unwrap();
for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
if shard.generation_pageserver == Some(node_id) {
shard.generation += 1;
result.insert(*tenant_shard_id, Generation::new(shard.generation));
}
}
// FIXME: spurious clone to sidestep closure move rules
let parent_to_children = parent_to_children.clone();
(locked.save(), result)
};
// Insert child shards
for (parent_shard_id, children) in parent_to_children {
let mut parent = crate::schema::tenant_shards::table
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
.load::<TenantShardPersistence>(conn)?;
let parent = if parent.len() != 1 {
return Err(DatabaseError::Logical(format!(
"Parent shard {parent_shard_id} not found"
)));
} else {
parent.pop().unwrap()
};
for mut shard in children {
// Carry the parent's generation into the child
shard.generation = parent.generation;
debug_assert!(shard.splitting == SplitState::Splitting);
diesel::insert_into(tenant_shards)
.values(shard)
.execute(conn)?;
}
}
Ok(())
})?;
Ok(())
})
.await
write.commit().await?;
Ok(result)
}
// When we finish shard splitting, we must atomically clean up the old shards
// TODO: when we start shard splitting, we must durably mark the tenant so that
// on restart, we know that we must go through recovery (list shards that exist
// and pick up where we left off and/or revert to parent shards).
#[allow(dead_code)]
pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
todo!();
}
// TODO: when we finish shard splitting, we must atomically clean up the old shards
// and insert the new shards, and clear the splitting marker.
pub(crate) async fn complete_shard_split(
&self,
split_tenant_id: TenantId,
old_shard_count: ShardCount,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
// Drop parent shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.execute(conn)?;
// Clear sharding flag
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.set((splitting.eq(0),))
.execute(conn)?;
debug_assert!(updated > 0);
Ok(())
})?;
Ok(())
})
.await
#[allow(dead_code)]
pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
todo!();
}
}
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[diesel(table_name = crate::schema::tenant_shards)]
#[derive(Serialize, Deserialize, Clone)]
pub(crate) struct TenantShardPersistence {
#[serde(default)]
pub(crate) tenant_id: String,
@@ -489,30 +257,16 @@ pub(crate) struct TenantShardPersistence {
#[serde(default)]
pub(crate) shard_stripe_size: i32,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: i32,
// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: i64,
pub(crate) generation_pageserver: Option<NodeId>,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: u32,
#[serde(default)]
pub(crate) placement_policy: String,
#[serde(default)]
pub(crate) splitting: SplitState,
#[serde(default)]
pub(crate) config: String,
}
/// Parts of [`crate::node::Node`] that are stored durably
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
#[diesel(table_name = crate::schema::nodes)]
pub(crate) struct NodePersistence {
pub(crate) node_id: i64,
pub(crate) scheduling_policy: String,
pub(crate) listen_http_addr: String,
pub(crate) listen_http_port: i32,
pub(crate) listen_pg_addr: String,
pub(crate) listen_pg_port: i32,
}

View File

@@ -1,46 +0,0 @@
use diesel::pg::{Pg, PgValue};
use diesel::{
deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql,
sql_types::Int2,
};
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)]
#[diesel(sql_type = SplitStateSQLRepr)]
#[derive(Deserialize, Serialize)]
pub enum SplitState {
Idle = 0,
Splitting = 1,
}
impl Default for SplitState {
fn default() -> Self {
Self::Idle
}
}
type SplitStateSQLRepr = Int2;
impl ToSql<SplitStateSQLRepr, Pg> for SplitState {
fn to_sql<'a>(
&'a self,
out: &'a mut diesel::serialize::Output<Pg>,
) -> diesel::serialize::Result {
let raw_value: i16 = *self as i16;
let mut new_out = out.reborrow();
ToSql::<SplitStateSQLRepr, Pg>::to_sql(&raw_value, &mut new_out)
}
}
impl FromSql<SplitStateSQLRepr, Pg> for SplitState {
fn from_sql(pg_value: PgValue) -> diesel::deserialize::Result<Self> {
match FromSql::<SplitStateSQLRepr, Pg>::from_sql(pg_value).map(|v| match v {
0 => Some(Self::Idle),
1 => Some(Self::Splitting),
_ => None,
})? {
Some(v) => Ok(v),
None => Err(format!("Invalid SplitState value, was: {:?}", pg_value.as_bytes()).into()),
}
}
}

View File

@@ -13,9 +13,8 @@ use tokio_util::sync::CancellationToken;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
use utils::lsn::Lsn;
use utils::sync::gate::GateGuard;
use crate::compute_hook::{ComputeHook, NotifyError};
use crate::compute_hook::ComputeHook;
use crate::node::Node;
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
@@ -27,7 +26,7 @@ pub(super) struct Reconciler {
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Generation,
pub(crate) intent: TargetState,
pub(crate) intent: IntentState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,
@@ -38,15 +37,9 @@ pub(super) struct Reconciler {
pub(crate) pageservers: Arc<HashMap<NodeId, Node>>,
/// A hook to notify the running postgres instances when we change the location
/// of a tenant. Use this via [`Self::compute_notify`] to update our failure flag
/// and guarantee eventual retries.
/// of a tenant
pub(crate) compute_hook: Arc<ComputeHook>,
/// To avoid stalling if the cloud control plane is unavailable, we may proceed
/// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
/// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
pub(crate) compute_notify_failure: bool,
/// A means to abort background reconciliation: it is essential to
/// call this when something changes in the original TenantState that
/// will make this reconciliation impossible or unnecessary, for
@@ -54,46 +47,12 @@ pub(super) struct Reconciler {
/// the tenant is changed.
pub(crate) cancel: CancellationToken,
/// Reconcilers are registered with a Gate so that during a graceful shutdown we
/// can wait for all the reconcilers to respond to their cancellation tokens.
pub(crate) _gate_guard: GateGuard,
/// Access to persistent storage for updating generation numbers
pub(crate) persistence: Arc<Persistence>,
}
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
/// and the TargetState is just the instruction for a particular Reconciler run.
#[derive(Debug)]
pub(crate) struct TargetState {
pub(crate) attached: Option<NodeId>,
pub(crate) secondary: Vec<NodeId>,
}
impl TargetState {
pub(crate) fn from_intent(intent: &IntentState) -> Self {
Self {
attached: *intent.get_attached(),
secondary: intent.get_secondary().clone(),
}
}
fn all_pageservers(&self) -> Vec<NodeId> {
let mut result = self.secondary.clone();
if let Some(node_id) = &self.attached {
result.push(*node_id);
}
result
}
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum ReconcileError {
#[error(transparent)]
Notify(#[from] NotifyError),
#[error("Cancelled")]
Cancel,
pub enum ReconcileError {
#[error(transparent)]
Other(#[from] anyhow::Error),
}
@@ -296,7 +255,7 @@ impl Reconciler {
secondary_conf,
tenant_conf: config.clone(),
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
}
}
@@ -337,7 +296,7 @@ impl Reconciler {
// Increment generation before attaching to new pageserver
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.increment_generation(self.tenant_shard_id, Some(dest_ps_id))
.await?;
let dest_conf = build_location_config(
@@ -358,19 +317,9 @@ impl Reconciler {
}
tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id);
// During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach
// the origin without notifying compute, we will render the tenant unavailable.
while let Err(e) = self.compute_notify().await {
match e {
NotifyError::Fatal(_) => return Err(anyhow::anyhow!(e)),
_ => {
tracing::warn!(
"Live migration blocked by compute notification error, retrying: {e}"
);
}
}
}
self.compute_hook
.notify(self.tenant_shard_id, dest_ps_id)
.await?;
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
// this location will be deleted in the general case reconciliation that runs after this.
@@ -446,12 +395,20 @@ impl Reconciler {
// as locations with unknown (None) observed state.
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.increment_generation(self.tenant_shard_id, Some(node_id))
.await?;
wanted_conf.generation = self.generation.into();
tracing::info!("Observed configuration requires update.");
self.location_config(node_id, wanted_conf, None).await?;
self.compute_notify().await?;
if let Err(e) = self
.compute_hook
.notify(self.tenant_shard_id, node_id)
.await
{
tracing::warn!(
"Failed to notify compute of newly attached pageserver {node_id}: {e}"
);
}
}
}
}
@@ -491,7 +448,7 @@ impl Reconciler {
generation: None,
secondary_conf: None,
shard_number: self.shard.number.0,
shard_count: self.shard.count.literal(),
shard_count: self.shard.count.0,
shard_stripe_size: self.shard.stripe_size.0,
tenant_conf: self.config.clone(),
},
@@ -499,37 +456,11 @@ impl Reconciler {
}
for (node_id, conf) in changes {
if self.cancel.is_cancelled() {
return Err(ReconcileError::Cancel);
}
self.location_config(node_id, conf, None).await?;
}
Ok(())
}
pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> {
// Whenever a particular Reconciler emits a notification, it is always notifying for the intended
// destination.
if let Some(node_id) = self.intent.attached {
let result = self
.compute_hook
.notify(self.tenant_shard_id, node_id, &self.cancel)
.await;
if let Err(e) = &result {
// It is up to the caller whether they want to drop out on this error, but they don't have to:
// in general we should avoid letting unavailability of the cloud control plane stop us from
// making progress.
tracing::warn!("Failed to notify compute of attached pageserver {node_id}: {e}");
// Set this flag so that our ReconcileResult will mark the shard as needing to
// retry the notification at some point.
self.compute_notify_failure = true;
}
result
} else {
Ok(())
}
}
}
pub(crate) fn attached_location_conf(
@@ -542,7 +473,7 @@ pub(crate) fn attached_location_conf(
generation: generation.into(),
secondary_conf: None,
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
}
@@ -557,7 +488,7 @@ pub(crate) fn secondary_location_conf(
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
}

View File

@@ -1,8 +1,9 @@
use crate::{node::Node, tenant_state::TenantState};
use serde::Serialize;
use std::collections::HashMap;
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use utils::{http::error::ApiError, id::NodeId};
use crate::{node::Node, tenant_state::TenantState};
/// Scenarios in which we cannot find a suitable location for a tenant shard
#[derive(thiserror::Error, Debug)]
pub enum ScheduleError {
@@ -18,179 +19,52 @@ impl From<ScheduleError> for ApiError {
}
}
#[derive(Serialize, Eq, PartialEq)]
struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
shard_count: usize,
/// Whether this node is currently eligible to have new shards scheduled (this is derived
/// from a node's availability state and scheduling policy).
may_schedule: bool,
}
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
/// on which to run.
///
/// The type has no persistent state of its own: this is all populated at startup. The Serialize
/// impl is only for debug dumps.
#[derive(Serialize)]
pub(crate) struct Scheduler {
nodes: HashMap<NodeId, SchedulerNode>,
tenant_counts: HashMap<NodeId, usize>,
}
impl Scheduler {
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
let mut scheduler_nodes = HashMap::new();
for node in nodes {
scheduler_nodes.insert(
node.id,
SchedulerNode {
shard_count: 0,
may_schedule: node.may_schedule(),
},
);
pub(crate) fn new(
tenants: &BTreeMap<TenantShardId, TenantState>,
nodes: &HashMap<NodeId, Node>,
) -> Self {
let mut tenant_counts = HashMap::new();
for node_id in nodes.keys() {
tenant_counts.insert(*node_id, 0);
}
Self {
nodes: scheduler_nodes,
}
}
/// For debug/support: check that our internal statistics are in sync with the state of
/// the nodes & tenant shards.
///
/// If anything is inconsistent, log details and return an error.
pub(crate) fn consistency_check<'a>(
&self,
nodes: impl Iterator<Item = &'a Node>,
shards: impl Iterator<Item = &'a TenantState>,
) -> anyhow::Result<()> {
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
for node in nodes {
expect_nodes.insert(
node.id,
SchedulerNode {
shard_count: 0,
may_schedule: node.may_schedule(),
},
);
}
for shard in shards {
if let Some(node_id) = shard.intent.get_attached() {
match expect_nodes.get_mut(node_id) {
Some(node) => node.shard_count += 1,
None => anyhow::bail!(
"Tenant {} references nonexistent node {}",
shard.tenant_shard_id,
node_id
),
}
}
for node_id in shard.intent.get_secondary() {
match expect_nodes.get_mut(node_id) {
Some(node) => node.shard_count += 1,
None => anyhow::bail!(
"Tenant {} references nonexistent node {}",
shard.tenant_shard_id,
node_id
),
}
for tenant in tenants.values() {
if let Some(ps) = tenant.intent.attached {
let entry = tenant_counts.entry(ps).or_insert(0);
*entry += 1;
}
}
for (node_id, expect_node) in &expect_nodes {
let Some(self_node) = self.nodes.get(node_id) else {
anyhow::bail!("Node {node_id} not found in Self")
};
if self_node != expect_node {
tracing::error!("Inconsistency detected in scheduling state for node {node_id}");
tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?);
tracing::error!("Self state: {}", serde_json::to_string(self_node)?);
anyhow::bail!("Inconsistent state on {node_id}");
for (node_id, node) in nodes {
if !node.may_schedule() {
tenant_counts.remove(node_id);
}
}
if expect_nodes.len() != self.nodes.len() {
// We just checked that all the expected nodes are present. If the lengths don't match,
// it means that we have nodes in Self that are unexpected.
for node_id in self.nodes.keys() {
if !expect_nodes.contains_key(node_id) {
anyhow::bail!("Node {node_id} found in Self but not in expected nodes");
}
}
}
Ok(())
}
/// Increment the reference count of a node. This reference count is used to guide scheduling
/// decisions, not for memory management: it represents one tenant shard whose IntentState targets
/// this node.
///
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
/// [`Self::new`] or [`Self::node_upsert`])
pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
let Some(node) = self.nodes.get_mut(&node_id) else {
tracing::error!("Scheduler missing node {node_id}");
debug_assert!(false);
return;
};
node.shard_count += 1;
}
/// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`].
pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
let Some(node) = self.nodes.get_mut(&node_id) else {
debug_assert!(false);
tracing::error!("Scheduler missing node {node_id}");
return;
};
node.shard_count -= 1;
}
pub(crate) fn node_upsert(&mut self, node: &Node) {
use std::collections::hash_map::Entry::*;
match self.nodes.entry(node.id) {
Occupied(mut entry) => {
entry.get_mut().may_schedule = node.may_schedule();
}
Vacant(entry) => {
entry.insert(SchedulerNode {
shard_count: 0,
may_schedule: node.may_schedule(),
});
}
}
}
pub(crate) fn node_remove(&mut self, node_id: NodeId) {
if self.nodes.remove(&node_id).is_none() {
tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler");
}
Self { tenant_counts }
}
pub(crate) fn schedule_shard(
&mut self,
hard_exclude: &[NodeId],
) -> Result<NodeId, ScheduleError> {
if self.nodes.is_empty() {
if self.tenant_counts.is_empty() {
return Err(ScheduleError::NoPageservers);
}
let mut tenant_counts: Vec<(NodeId, usize)> = self
.nodes
.tenant_counts
.iter()
.filter_map(|(k, v)| {
if hard_exclude.contains(k) || !v.may_schedule {
if hard_exclude.contains(k) {
None
} else {
Some((*k, v.shard_count))
Some((*k, *v))
}
})
.collect();
@@ -199,108 +73,17 @@ impl Scheduler {
tenant_counts.sort_by_key(|i| (i.1, i.0));
if tenant_counts.is_empty() {
// After applying constraints, no pageservers were left. We log some detail about
// the state of nodes to help understand why this happened. This is not logged as an error because
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
for (node_id, node) in &self.nodes {
tracing::info!(
"Node {node_id}: may_schedule={} shards={}",
node.may_schedule,
node.shard_count
);
}
// After applying constraints, no pageservers were left
return Err(ScheduleError::ImpossibleConstraint);
}
for (node_id, count) in &tenant_counts {
tracing::info!("tenant_counts[{node_id}]={count}");
}
let node_id = tenant_counts.first().unwrap().0;
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
);
// Note that we do not update shard count here to reflect the scheduling: that
// is IntentState's job when the scheduled location is used.
tracing::info!("scheduler selected node {node_id}");
*self.tenant_counts.get_mut(&node_id).unwrap() += 1;
Ok(node_id)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;
use crate::{node::Node, tenant_state::IntentState};
#[test]
fn scheduler_basic() -> anyhow::Result<()> {
let mut nodes = HashMap::new();
nodes.insert(
NodeId(1),
Node {
id: NodeId(1),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
listen_http_addr: String::new(),
listen_http_port: 0,
listen_pg_addr: String::new(),
listen_pg_port: 0,
},
);
nodes.insert(
NodeId(2),
Node {
id: NodeId(2),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
listen_http_addr: String::new(),
listen_http_port: 0,
listen_pg_addr: String::new(),
listen_pg_port: 0,
},
);
let mut scheduler = Scheduler::new(nodes.values());
let mut t1_intent = IntentState::new();
let mut t2_intent = IntentState::new();
let scheduled = scheduler.schedule_shard(&[])?;
t1_intent.set_attached(&mut scheduler, Some(scheduled));
let scheduled = scheduler.schedule_shard(&[])?;
t2_intent.set_attached(&mut scheduler, Some(scheduled));
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
t1_intent.push_secondary(&mut scheduler, scheduled);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
t1_intent.clear(&mut scheduler);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
if cfg!(debug_assertions) {
// Dropping an IntentState without clearing it causes a panic in debug mode,
// because we have failed to properly update scheduler shard counts.
let result = std::panic::catch_unwind(move || {
drop(t2_intent);
});
assert!(result.is_err());
} else {
t2_intent.clear(&mut scheduler);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
}
Ok(())
}
}

View File

@@ -1,28 +0,0 @@
// @generated automatically by Diesel CLI.
diesel::table! {
nodes (node_id) {
node_id -> Int8,
scheduling_policy -> Varchar,
listen_http_addr -> Varchar,
listen_http_port -> Int4,
listen_pg_addr -> Varchar,
listen_pg_port -> Int4,
}
}
diesel::table! {
tenant_shards (tenant_id, shard_number, shard_count) {
tenant_id -> Varchar,
shard_number -> Int4,
shard_count -> Int4,
shard_stripe_size -> Int4,
generation -> Int4,
generation_pageserver -> Int8,
placement_policy -> Varchar,
splitting -> Int2,
config -> Text,
}
}
diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);

File diff suppressed because it is too large Load Diff

View File

@@ -1,47 +1,27 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use crate::{metrics, persistence::TenantShardPersistence};
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
};
use serde::Serialize;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::{instrument, Instrument};
use utils::{
generation::Generation,
id::NodeId,
seqwait::{SeqWait, SeqWaitError},
sync::gate::Gate,
};
use crate::{
compute_hook::ComputeHook,
node::Node,
persistence::{split_state::SplitState, Persistence},
reconciler::{
attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
},
persistence::Persistence,
reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
scheduler::{ScheduleError, Scheduler},
service, PlacementPolicy, Sequence,
};
/// Serialization helper
fn read_mutex_content<S, T>(v: &std::sync::Mutex<T>, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
T: Clone + std::fmt::Display,
{
serializer.collect_str(&v.lock().unwrap())
}
/// In-memory state for a particular tenant shard.
///
/// This struct implements Serialize for debugging purposes, but is _not_ persisted
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
#[derive(Serialize)]
pub(crate) struct TenantState {
pub(crate) tenant_shard_id: TenantShardId,
@@ -76,144 +56,30 @@ pub(crate) struct TenantState {
/// If a reconcile task is currently in flight, it may be joined here (it is
/// only safe to join if either the result has been received or the reconciler's
/// cancellation token has been fired)
#[serde(skip)]
pub(crate) reconciler: Option<ReconcilerHandle>,
/// If a tenant is being split, then all shards with that TenantId will have a
/// SplitState set, this acts as a guard against other operations such as background
/// reconciliation, and timeline creation.
pub(crate) splitting: SplitState,
/// Optionally wait for reconciliation to complete up to a particular
/// sequence number.
#[serde(skip)]
pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
/// Indicates sequence number for which we have encountered an error reconciling. If
/// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
/// and callers should stop waiting for `waiter` and propagate the error.
#[serde(skip)]
pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
/// The most recent error from a reconcile on this tenant
/// TODO: generalize to an array of recent events
/// TODO: use an ArcSwap instead of a mutex for faster reads?
#[serde(serialize_with = "read_mutex_content")]
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
/// If we have a pending compute notification that for some reason we weren't able to send,
/// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry
/// sending it. This is the mechanism by which compute notifications are included in the scope
/// of state that we publish externally in an eventually consistent way.
pub(crate) pending_compute_notification: bool,
}
#[derive(Default, Clone, Debug, Serialize)]
#[derive(Default, Clone, Debug)]
pub(crate) struct IntentState {
attached: Option<NodeId>,
secondary: Vec<NodeId>,
pub(crate) attached: Option<NodeId>,
pub(crate) secondary: Vec<NodeId>,
}
impl IntentState {
pub(crate) fn new() -> Self {
Self {
attached: None,
secondary: vec![],
}
}
pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
if let Some(node_id) = node_id {
scheduler.node_inc_ref(node_id);
}
Self {
attached: node_id,
secondary: vec![],
}
}
pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
if self.attached != new_attached {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
}
if let Some(new_attached) = &new_attached {
scheduler.node_inc_ref(*new_attached);
}
self.attached = new_attached;
}
}
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
debug_assert!(!self.secondary.contains(&new_secondary));
scheduler.node_inc_ref(new_secondary);
self.secondary.push(new_secondary);
}
/// It is legal to call this with a node that is not currently a secondary: that is a no-op
pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
let index = self.secondary.iter().position(|n| *n == node_id);
if let Some(index) = index {
scheduler.node_dec_ref(node_id);
self.secondary.remove(index);
}
}
pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
for secondary in self.secondary.drain(..) {
scheduler.node_dec_ref(secondary);
}
}
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
}
self.clear_secondary(scheduler);
}
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
let mut result = Vec::new();
if let Some(p) = self.attached {
result.push(p)
}
result.extend(self.secondary.iter().copied());
result
}
pub(crate) fn get_attached(&self) -> &Option<NodeId> {
&self.attached
}
pub(crate) fn get_secondary(&self) -> &Vec<NodeId> {
&self.secondary
}
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
self.attached = None;
self.secondary.push(node_id);
true
} else {
false
}
}
}
impl Drop for IntentState {
fn drop(&mut self) {
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
}
}
#[derive(Default, Clone, Serialize)]
#[derive(Default, Clone)]
pub(crate) struct ObservedState {
pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
}
@@ -227,7 +93,7 @@ pub(crate) struct ObservedState {
/// what it is (e.g. we failed partway through configuring it)
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
/// and that configuration will still be present unless something external interfered.
#[derive(Clone, Serialize)]
#[derive(Clone)]
pub(crate) struct ObservedStateLocation {
/// If None, it means we do not know the status of this shard's location on this node, but
/// we know that we might have some state on this node.
@@ -298,9 +164,39 @@ pub(crate) struct ReconcileResult {
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) generation: Generation,
pub(crate) observed: ObservedState,
}
/// Set [`TenantState::pending_compute_notification`] from this flag
pub(crate) pending_compute_notification: bool,
impl IntentState {
pub(crate) fn new() -> Self {
Self {
attached: None,
secondary: vec![],
}
}
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
let mut result = Vec::new();
if let Some(p) = self.attached {
result.push(p)
}
result.extend(self.secondary.iter().copied());
result
}
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
self.attached = None;
self.secondary.push(node_id);
true
} else {
false
}
}
}
impl ObservedState {
@@ -326,12 +222,10 @@ impl TenantState {
observed: ObservedState::default(),
config: TenantConfig::default(),
reconciler: None,
splitting: SplitState::Idle,
sequence: Sequence(1),
waiter: Arc::new(SeqWait::new(Sequence(0))),
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
last_error: Arc::default(),
pending_compute_notification: false,
}
}
@@ -382,9 +276,6 @@ impl TenantState {
// self.intent refers to pageservers that are offline, and pick other
// pageservers if so.
// TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
// change their attach location.
// Build the set of pageservers already in use by this tenant, to avoid scheduling
// more work on the same pageservers we're already using.
let mut used_pageservers = self.intent.all_pageservers();
@@ -396,12 +287,12 @@ impl TenantState {
// Should have exactly one attached, and zero secondaries
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.set_attached(scheduler, Some(node_id));
self.intent.attached = Some(node_id);
used_pageservers.push(node_id);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
self.intent.secondary.clear();
modified = true;
}
}
@@ -409,30 +300,18 @@ impl TenantState {
// Should have exactly one attached, and N secondaries
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.set_attached(scheduler, Some(node_id));
self.intent.attached = Some(node_id);
used_pageservers.push(node_id);
modified = true;
}
while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.push_secondary(scheduler, node_id);
self.intent.secondary.push(node_id);
used_pageservers.push(node_id);
modified = true;
}
}
Detached => {
// Should have no attached or secondary pageservers
if self.intent.attached.is_some() {
self.intent.set_attached(scheduler, None);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
}
}
if modified {
@@ -442,38 +321,6 @@ impl TenantState {
Ok(())
}
/// Query whether the tenant's observed state for attached node matches its intent state, and if so,
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
/// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
///
/// Reconciliation may still be needed for other aspects of state such as secondaries (see [`Self::dirty`]): this
/// function should not be used to decide whether to reconcile.
pub(crate) fn stably_attached(&self) -> Option<NodeId> {
if let Some(attach_intent) = self.intent.attached {
match self.observed.locations.get(&attach_intent) {
Some(loc) => match &loc.conf {
Some(conf) => match conf.mode {
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// Our intent and observed state agree that this node is in an attached state.
Some(attach_intent)
}
// Our observed config is not an attached state
_ => None,
},
// Our observed state is None, i.e. in flux
None => None,
},
// We have no observed state for this node
None => None,
}
} else {
// Our intent is not to attach
None
}
}
fn dirty(&self) -> bool {
if let Some(node_id) = self.intent.attached {
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
@@ -495,17 +342,9 @@ impl TenantState {
}
}
// Even if there is no pageserver work to be done, if we have a pending notification to computes,
// wake up a reconciler to send it.
if self.pending_compute_notification {
return true;
}
false
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn maybe_reconcile(
&mut self,
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
@@ -513,8 +352,6 @@ impl TenantState {
compute_hook: &Arc<ComputeHook>,
service_config: &service::Config,
persistence: &Arc<Persistence>,
gate: &Gate,
cancel: &CancellationToken,
) -> Option<ReconcilerWaiter> {
// If there are any ambiguous observed states, and the nodes they refer to are available,
// we should reconcile to clean them up.
@@ -536,14 +373,6 @@ impl TenantState {
return None;
}
// If we are currently splitting, then never start a reconciler task: the splitting logic
// requires that shards are not interfered with while it runs. Do this check here rather than
// up top, so that we only log this message if we would otherwise have done a reconciliation.
if !matches!(self.splitting, SplitState::Idle) {
tracing::info!("Refusing to reconcile, splitting in progress");
return None;
}
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
@@ -561,101 +390,58 @@ impl TenantState {
// doing our sequence's work.
let old_handle = self.reconciler.take();
let Ok(gate_guard) = gate.enter() else {
// Shutting down, don't start a reconciler
return None;
};
let reconciler_cancel = cancel.child_token();
let cancel = CancellationToken::new();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
shard: self.shard,
generation: self.generation,
intent: TargetState::from_intent(&self.intent),
intent: self.intent.clone(),
config: self.config.clone(),
observed: self.observed.clone(),
pageservers: pageservers.clone(),
compute_hook: compute_hook.clone(),
service_config: service_config.clone(),
_gate_guard: gate_guard,
cancel: reconciler_cancel.clone(),
cancel: cancel.clone(),
persistence: persistence.clone(),
compute_notify_failure: false,
};
let reconcile_seq = self.sequence;
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
let must_notify = self.pending_compute_notification;
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
tenant_id=%reconciler.tenant_shard_id.tenant_id,
shard_id=%reconciler.tenant_shard_id.shard_slug());
metrics::RECONCILER.spawned.inc();
let join_handle = tokio::task::spawn(
async move {
// Wait for any previous reconcile task to complete before we start
if let Some(old_handle) = old_handle {
old_handle.cancel.cancel();
if let Err(e) = old_handle.handle.await {
// We can't do much with this other than log it: the task is done, so
// we may proceed with our work.
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
}
tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
let join_handle = tokio::task::spawn(async move {
// Wait for any previous reconcile task to complete before we start
if let Some(old_handle) = old_handle {
old_handle.cancel.cancel();
if let Err(e) = old_handle.handle.await {
// We can't do much with this other than log it: the task is done, so
// we may proceed with our work.
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
}
// Early check for cancellation before doing any work
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
.inc();
return;
}
// Attempt to make observed state match intent state
let result = reconciler.reconcile().await;
// If we know we had a pending compute notification from some previous action, send a notification irrespective
// of whether the above reconcile() did any work
if result.is_ok() && must_notify {
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
reconciler.compute_notify().await.ok();
}
// Update result counter
match &result {
Ok(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
Err(ReconcileError::Cancel) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
Err(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
}
.inc();
result_tx
.send(ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
pending_compute_notification: reconciler.compute_notify_failure,
})
.ok();
}
.instrument(reconciler_span),
);
// Early check for cancellation before doing any work
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
return;
}
let result = reconciler.reconcile().await;
result_tx
.send(ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
})
.ok();
});
self.reconciler = Some(ReconcilerHandle {
sequence: self.sequence,
handle: join_handle,
cancel: reconciler_cancel,
cancel,
});
Some(ReconcilerWaiter {
@@ -666,32 +452,4 @@ impl TenantState {
seq: self.sequence,
})
}
// If we had any state at all referring to this node ID, drop it. Does not
// attempt to reschedule.
pub(crate) fn deref_node(&mut self, node_id: NodeId) {
if self.intent.attached == Some(node_id) {
self.intent.attached = None;
}
self.intent.secondary.retain(|n| n != &node_id);
self.observed.locations.remove(&node_id);
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
}
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
TenantShardPersistence {
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
shard_number: self.tenant_shard_id.shard_number.0 as i32,
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: self.shard.stripe_size.0 as i32,
generation: self.generation.into().unwrap_or(0) as i32,
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
}
}
}

View File

@@ -1,20 +1,16 @@
use crate::{background_process, local_env::LocalEnv};
use camino::{Utf8Path, Utf8PathBuf};
use camino::Utf8PathBuf;
use hyper::Method;
use pageserver_api::{
models::{
ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo,
},
models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
shard::TenantShardId,
};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::str::FromStr;
use tokio::process::Command;
use std::{path::PathBuf, process::Child, str::FromStr};
use tracing::instrument;
use url::Url;
use utils::{
auth::{Claims, Scope},
id::{NodeId, TenantId},
@@ -23,17 +19,14 @@ use utils::{
pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: Utf8PathBuf,
path: PathBuf,
jwt_token: Option<String>,
public_key: Option<String>,
postgres_port: u16,
public_key_path: Option<Utf8PathBuf>,
client: reqwest::Client,
}
const COMMAND: &str = "attachment_service";
const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -57,7 +50,6 @@ pub struct InspectResponse {
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
@@ -113,7 +105,7 @@ pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeAvailability {
// Normal, happy state
Active,
@@ -137,7 +129,7 @@ impl FromStr for NodeAvailability {
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits in there.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
@@ -177,9 +169,7 @@ pub struct TenantShardMigrateResponse {}
impl AttachmentService {
pub fn from_env(env: &LocalEnv) -> Self {
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
.unwrap()
.join("attachments.json");
let path = env.base_data_dir.join("attachments.json");
// Makes no sense to construct this if pageservers aren't going to use it: assume
// pageservers have control plane API set
@@ -191,20 +181,13 @@ impl AttachmentService {
listen_url.port().unwrap()
);
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
// port, for use by our captive postgres.
let postgres_port = listen_url
.port()
.expect("Control plane API setting should always have a port")
+ 1;
// Assume all pageservers have symmetric auth configuration: this service
// expects to use one JWT token to talk to all of them.
let ps_conf = env
.pageservers
.first()
.expect("Config is validated to contain at least one pageserver");
let (jwt_token, public_key) = match ps_conf.http_auth_type {
let (jwt_token, public_key_path) = match ps_conf.http_auth_type {
AuthType::Trust => (None, None),
AuthType::NeonJWT => {
let jwt_token = env
@@ -216,26 +199,7 @@ impl AttachmentService {
let public_key_path =
camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
.unwrap();
// This service takes keys as a string rather than as a path to a file/dir: read the key into memory.
let public_key = if std::fs::metadata(&public_key_path)
.expect("Can't stat public key")
.is_dir()
{
// Our config may specify a directory: this is for the pageserver's ability to handle multiple
// keys. We only use one key at a time, so, arbitrarily load the first one in the directory.
let mut dir =
std::fs::read_dir(&public_key_path).expect("Can't readdir public key path");
let dent = dir
.next()
.expect("Empty key dir")
.expect("Error reading key dir");
std::fs::read_to_string(dent.path()).expect("Can't read public key")
} else {
std::fs::read_to_string(&public_key_path).expect("Can't read public key")
};
(Some(jwt_token), Some(public_key))
(Some(jwt_token), Some(public_key_path))
}
};
@@ -244,8 +208,7 @@ impl AttachmentService {
path,
listen,
jwt_token,
public_key,
postgres_port,
public_key_path,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
@@ -257,161 +220,22 @@ impl AttachmentService {
.expect("non-Unicode path")
}
/// PIDFile for the postgres instance used to store attachment service state
fn postgres_pid_file(&self) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(
self.env
.base_data_dir
.join("attachment_service_postgres.pid"),
)
.expect("non-Unicode path")
}
pub async fn start(&self) -> anyhow::Result<Child> {
let path_str = self.path.to_string_lossy();
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
///
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
/// to other versions if that one isn't found. Some automated tests create circumstances
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
for v in prefer_versions {
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
if tokio::fs::try_exists(&path).await? {
return Ok(path);
}
}
// Fall through
anyhow::bail!(
"Postgres binaries not found in {}",
self.env.pg_distrib_dir.display()
);
}
/// Readiness check for our postgres process
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
let bin_path = pg_bin_dir.join("pg_isready");
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
Ok(exitcode.success())
}
/// Create our database if it doesn't exist, and run migrations.
///
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
/// who just want to run `cargo neon_local` without knowing about diesel.
///
/// Returns the database url
pub async fn setup_database(&self) -> anyhow::Result<String> {
const DB_NAME: &str = "attachment_service";
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
let pg_bin_dir = self.get_pg_bin_dir().await?;
let createdb_path = pg_bin_dir.join("createdb");
let output = Command::new(&createdb_path)
.args([
"-h",
"localhost",
"-p",
&format!("{}", self.postgres_port),
&DB_NAME,
])
.output()
.await
.expect("Failed to spawn createdb");
if !output.status.success() {
let stderr = String::from_utf8(output.stderr).expect("Non-UTF8 output from createdb");
if stderr.contains("already exists") {
tracing::info!("Database {DB_NAME} already exists");
} else {
anyhow::bail!("createdb failed with status {}: {stderr}", output.status);
}
}
Ok(database_url)
}
pub async fn start(&self) -> anyhow::Result<()> {
// Start a vanilla Postgres process used by the attachment service for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
.unwrap()
.join("attachment_service_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_log_path = pg_data_path.join("postgres.log");
if !tokio::fs::try_exists(&pg_data_path).await? {
// Initialize empty database
let initdb_path = pg_bin_dir.join("initdb");
let mut child = Command::new(&initdb_path)
.args(["-D", pg_data_path.as_ref()])
.spawn()
.expect("Failed to spawn initdb");
let status = child.wait().await?;
if !status.success() {
anyhow::bail!("initdb failed with status {status}");
}
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}", self.postgres_port),
)
.await?;
};
println!("Starting attachment service database...");
let db_start_args = [
"-w",
"-D",
pg_data_path.as_ref(),
"-l",
pg_log_path.as_ref(),
"start",
];
background_process::start_process(
"attachment_service_db",
&self.env.base_data_dir,
pg_bin_dir.join("pg_ctl").as_std_path(),
db_start_args,
[],
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|| self.pg_isready(&pg_bin_dir),
)
.await?;
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
let mut args = vec![
"-l",
&self.listen,
"-p",
self.path.as_ref(),
"--database-url",
&database_url,
]
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
let mut args = vec!["-l", &self.listen, "-p", &path_str]
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
if let Some(jwt_token) = &self.jwt_token {
args.push(format!("--jwt-token={jwt_token}"));
}
if let Some(public_key) = &self.public_key {
args.push(format!("--public-key=\"{public_key}\""));
if let Some(public_key_path) = &self.public_key_path {
args.push(format!("--public-key={public_key_path}"));
}
if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api {
args.push(format!(
"--compute-hook-url={control_plane_compute_hook_api}"
));
}
background_process::start_process(
let result = background_process::start_process(
COMMAND,
&self.env.base_data_dir,
&self.env.attachment_service_bin(),
@@ -428,46 +252,29 @@ impl AttachmentService {
}
},
)
.await?;
.await;
Ok(())
}
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
println!("Stopping attachment service database...");
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_stop_args)
.spawn()?
.wait()
for ps_conf in &self.env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
self.node_register(NodeRegisterRequest {
node_id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
if !stop_status.success() {
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_status_args)
.spawn()?
.wait()
.await?;
// pg_ctl status returns this exit code if postgres is not running: in this case it is
// fine that stop failed. Otherwise it is an error that stop failed.
const PG_STATUS_NOT_RUNNING: i32 = 3;
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
println!("Attachment service database is already stopped");
return Ok(());
} else {
anyhow::bail!("Failed to stop attachment service database: {stop_status}")
}
}
Ok(())
result
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())
}
/// Simple HTTP request wrapper for calling into attachment service
async fn dispatch<RQ, RS>(
&self,
@@ -479,15 +286,13 @@ impl AttachmentService {
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let listen_url = self.env.control_plane_api.clone().unwrap();
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
listen_url.host_str().unwrap(),
listen_url.port().unwrap()
))
.unwrap();
let url = self
.env
.control_plane_api
.clone()
.unwrap()
.join(&path)
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
@@ -524,7 +329,7 @@ impl AttachmentService {
let response = self
.dispatch::<_, AttachHookResponse>(
Method::POST,
"debug/v1/attach-hook".to_string(),
"attach-hook".to_string(),
Some(request),
)
.await?;
@@ -540,11 +345,7 @@ impl AttachmentService {
let request = InspectRequest { tenant_shard_id };
let response = self
.dispatch::<_, InspectResponse>(
Method::POST,
"debug/v1/inspect".to_string(),
Some(request),
)
.dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
.await?;
Ok(response.attachment)
@@ -555,18 +356,14 @@ impl AttachmentService {
&self,
req: TenantCreateRequest,
) -> anyhow::Result<TenantCreateResponse> {
self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
self.dispatch(Method::POST, "tenant".to_string(), Some(req))
.await
}
#[instrument(skip(self))]
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(
Method::GET,
format!("control/v1/tenant/{tenant_id}/locate"),
None,
)
.await
self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
.await
}
#[instrument(skip(self))]
@@ -577,7 +374,7 @@ impl AttachmentService {
) -> anyhow::Result<TenantShardMigrateResponse> {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
format!("tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest {
tenant_shard_id,
node_id,
@@ -586,23 +383,9 @@ impl AttachmentService {
.await
}
#[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
pub async fn tenant_split(
&self,
tenant_id: TenantId,
new_shard_count: u8,
) -> anyhow::Result<TenantShardSplitResponse> {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/shard_split"),
Some(TenantShardSplitRequest { new_shard_count }),
)
.await
}
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
.await
}
@@ -610,7 +393,7 @@ impl AttachmentService {
pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/node/{}/config", req.node_id),
format!("node/{}/config", req.node_id),
Some(req),
)
.await
@@ -630,7 +413,7 @@ impl AttachmentService {
) -> anyhow::Result<TimelineInfo> {
self.dispatch(
Method::POST,
format!("v1/tenant/{tenant_id}/timeline"),
format!("tenant/{tenant_id}/timeline"),
Some(req),
)
.await

View File

@@ -17,7 +17,7 @@ use std::io::Write;
use std::os::unix::prelude::AsRawFd;
use std::os::unix::process::CommandExt;
use std::path::Path;
use std::process::Command;
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};
@@ -60,7 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
envs: EI,
initial_pid_file: InitialPidFile,
process_status_check: F,
) -> anyhow::Result<()>
) -> anyhow::Result<Child>
where
F: Fn() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<bool>>,
@@ -72,6 +72,7 @@ where
let log_path = datadir.join(format!("{process_name}.log"));
let process_log_file = fs::OpenOptions::new()
.create(true)
.write(true)
.append(true)
.open(&log_path)
.with_context(|| {
@@ -97,7 +98,7 @@ where
InitialPidFile::Expect(path) => path,
};
let spawned_process = filled_cmd.spawn().with_context(|| {
let mut spawned_process = filled_cmd.spawn().with_context(|| {
format!("Could not spawn {process_name}, see console output and log files for details.")
})?;
let pid = spawned_process.id();
@@ -105,26 +106,12 @@ where
i32::try_from(pid)
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
);
// set up a scopeguard to kill & wait for the child in case we panic or bail below
let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
println!("SIGKILL & wait the started process");
(|| {
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e.g., walredo).
spawned_process.kill().context("SIGKILL child")?;
spawned_process.wait().context("wait() for child process")?;
anyhow::Ok(())
})()
.with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
.unwrap();
});
for retries in 0..RETRIES {
match process_started(pid, pid_file_to_check, &process_status_check).await {
Ok(true) => {
println!("\n{process_name} started and passed status check, pid: {pid}");
// leak the child process, it'll outlive this neon_local invocation
drop(scopeguard::ScopeGuard::into_inner(spawned_process));
return Ok(());
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
}
Ok(false) => {
if retries == NOTICE_AFTER_RETRIES {
@@ -139,15 +126,16 @@ where
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
}
Err(e) => {
println!("error starting process {process_name:?}: {e:#}");
println!("{process_name} failed to start: {e:#}");
if let Err(e) = spawned_process.kill() {
println!("Could not stop {process_name} subprocess: {e:#}")
};
return Err(e);
}
}
}
println!();
anyhow::bail!(
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
);
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
@@ -255,9 +243,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_PROFILE",
// HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions.
"HOME",
"AWS_SESSION_TOKEN",
"AZURE_STORAGE_ACCOUNT",
"AZURE_STORAGE_ACCESS_KEY",
] {

View File

@@ -51,7 +51,7 @@ project_git_version!(GIT_VERSION);
const DEFAULT_PG_VERSION: &str = "15";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
fn default_conf(num_pageservers: u16) -> String {
let mut template = format!(
@@ -135,7 +135,7 @@ fn main() -> Result<()> {
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(sub_args, &env)),
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
"stop" => handle_stop_all(sub_args, &env),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
@@ -450,7 +450,7 @@ async fn handle_tenant(
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters {
count: ShardCount::new(shard_count),
count: ShardCount(shard_count),
stripe_size: shard_stripe_size
.map(ShardStripeSize)
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
@@ -575,26 +575,6 @@ async fn handle_tenant(
println!("{tenant_table}");
println!("{shard_table}");
}
Some(("shard-split", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
let attachment_service = AttachmentService::from_env(env);
let result = attachment_service
.tenant_split(tenant_id, shard_count)
.await?;
println!(
"Split tenant {} into shards {}",
tenant_id,
result
.new_shards
.iter()
.map(|s| format!("{:?}", s))
.collect::<Vec<_>>()
.join(",")
);
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
}
@@ -815,7 +795,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
&endpoint.timeline_id.to_string(),
branch_name,
lsn_str.as_str(),
&format!("{}", endpoint.status()),
endpoint.status(),
]);
}
@@ -1014,13 +994,12 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?;
let destroy = sub_args.get_flag("destroy");
let mode = sub_args.get_one::<String>("mode").expect("has a default");
let endpoint = cplane
.endpoints
.get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
endpoint.stop(mode, destroy)?;
endpoint.stop(destroy)?;
}
_ => bail!("Unexpected endpoint subcommand '{sub_name}'"),
@@ -1077,9 +1056,8 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args), *register)
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
@@ -1108,7 +1086,24 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args), false)
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("migrate", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
//TODO what shutdown strategy should we use here?
if let Err(e) = pageserver.stop(false) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
@@ -1166,7 +1161,7 @@ async fn handle_attachment_service(
.map(|s| s.as_str())
== Some("immediate");
if let Err(e) = svc.stop(immediate).await {
if let Err(e) = svc.stop(immediate) {
eprintln!("stop failed: {}", e);
exit(1);
}
@@ -1262,7 +1257,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.start().await {
eprintln!("attachment_service start failed: {:#}", e);
try_stop_all(env, true).await;
try_stop_all(env, true);
exit(1);
}
}
@@ -1270,11 +1265,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver
.start(&pageserver_config_overrides(sub_match), true)
.start(&pageserver_config_overrides(sub_match))
.await
{
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true).await;
try_stop_all(env, true);
exit(1);
}
}
@@ -1283,28 +1278,28 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start(vec![]).await {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false).await;
try_stop_all(env, false);
exit(1);
}
}
Ok(())
}
async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let immediate =
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
try_stop_all(env, immediate).await;
try_stop_all(env, immediate);
Ok(())
}
async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
// Stop all endpoints
match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => {
for (_k, node) in cplane.endpoints {
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
if let Err(e) = node.stop(false) {
eprintln!("postgres stop failed: {e:#}");
}
}
@@ -1334,7 +1329,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
if env.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.stop(immediate).await {
if let Err(e) = attachment_service.stop(immediate) {
eprintln!("attachment service stop failed: {e:#}");
}
}
@@ -1545,11 +1540,6 @@ fn cli() -> Command {
.subcommand(Command::new("status")
.about("Human readable summary of the tenant's shards and attachment locations")
.arg(tenant_id_arg.clone()))
.subcommand(Command::new("shard-split")
.about("Increase the number of shards in the tenant")
.arg(tenant_id_arg.clone())
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
)
)
.subcommand(
Command::new("pageserver")
@@ -1559,11 +1549,7 @@ fn cli() -> Command {
.subcommand(Command::new("status"))
.subcommand(Command::new("start")
.about("Start local pageserver")
.arg(pageserver_config_args.clone()).arg(Arg::new("register")
.long("register")
.default_value("true").required(false)
.value_parser(value_parser!(bool))
.value_name("register"))
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("stop")
.about("Stop local pageserver")
@@ -1653,16 +1639,7 @@ fn cli() -> Command {
.long("destroy")
.action(ArgAction::SetTrue)
.required(false)
)
.arg(
Arg::new("mode")
.help("Postgres shutdown mode, passed to \"pg_ctl -m <mode>\"")
.long("mode")
.action(ArgAction::Set)
.required(false)
.value_parser(["smart", "fast", "immediate"])
.default_value("fast")
)
)
)
)

View File

@@ -57,7 +57,7 @@ use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;
use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
// contents of a endpoint.json file
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
@@ -70,7 +70,6 @@ pub struct EndpointConf {
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
features: Vec<ComputeFeature>,
}
//
@@ -141,7 +140,6 @@ impl ComputeControlPlane {
// with this we basically test a case of waking up an idle compute, where
// we also skip catalog updates in the cloud.
skip_pg_catalog_updates: true,
features: vec![],
});
ep.create_endpoint_dir()?;
@@ -156,7 +154,6 @@ impl ComputeControlPlane {
pg_port,
pg_version,
skip_pg_catalog_updates: true,
features: vec![],
})?,
)?;
std::fs::write(
@@ -184,7 +181,7 @@ impl ComputeControlPlane {
v.tenant_id == tenant_id
&& v.timeline_id == timeline_id
&& v.mode == mode
&& v.status() != EndpointStatus::Stopped
&& v.status() != "stopped"
});
if let Some((key, _)) = duplicates.next() {
@@ -218,29 +215,6 @@ pub struct Endpoint {
// Optimizations
skip_pg_catalog_updates: bool,
// Feature flags
features: Vec<ComputeFeature>,
}
#[derive(PartialEq, Eq)]
pub enum EndpointStatus {
Running,
Stopped,
Crashed,
RunningNoPidfile,
}
impl std::fmt::Display for EndpointStatus {
fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
let s = match self {
Self::Running => "running",
Self::Stopped => "stopped",
Self::Crashed => "crashed",
Self::RunningNoPidfile => "running, no pidfile",
};
write!(writer, "{}", s)
}
}
impl Endpoint {
@@ -270,7 +244,6 @@ impl Endpoint {
tenant_id: conf.tenant_id,
pg_version: conf.pg_version,
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
features: conf.features,
})
}
@@ -400,16 +373,16 @@ impl Endpoint {
self.endpoint_path().join("pgdata")
}
pub fn status(&self) -> EndpointStatus {
pub fn status(&self) -> &str {
let timeout = Duration::from_millis(300);
let has_pidfile = self.pgdata().join("postmaster.pid").exists();
let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok();
match (has_pidfile, can_connect) {
(true, true) => EndpointStatus::Running,
(false, false) => EndpointStatus::Stopped,
(true, false) => EndpointStatus::Crashed,
(false, true) => EndpointStatus::RunningNoPidfile,
(true, true) => "running",
(false, false) => "stopped",
(true, false) => "crashed",
(false, true) => "running, no pidfile",
}
}
@@ -458,7 +431,7 @@ impl Endpoint {
}
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
// TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
// TODO use background_process::stop_process instead
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32);
@@ -501,7 +474,7 @@ impl Endpoint {
remote_ext_config: Option<&String>,
shard_stripe_size: usize,
) -> Result<()> {
if self.status() == EndpointStatus::Running {
if self.status() == "running" {
anyhow::bail!("The endpoint is already running");
}
@@ -546,7 +519,7 @@ impl Endpoint {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
features: self.features.clone(),
features: vec![],
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
@@ -603,21 +576,9 @@ impl Endpoint {
}
let child = cmd.spawn()?;
// set up a scopeguard to kill & wait for the child in case we panic or bail below
let child = scopeguard::guard(child, |mut child| {
println!("SIGKILL & wait the started process");
(|| {
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned
child.kill().context("SIGKILL child")?;
child.wait().context("wait() for child process")?;
anyhow::Ok(())
})()
.with_context(|| format!("scopeguard kill&wait child {child:?}"))
.unwrap();
});
// Write down the pid so we can wait for it when we want to stop
// TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
// TODO use background_process::start_process instead
let pid = child.id();
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
std::fs::write(pidfile_path, pid.to_string())?;
@@ -666,9 +627,6 @@ impl Endpoint {
std::thread::sleep(ATTEMPT_INTERVAL);
}
// disarm the scopeguard, let the child outlive this function (and neon_local invocation)
drop(scopeguard::ScopeGuard::into_inner(child));
Ok(())
}
@@ -761,8 +719,22 @@ impl Endpoint {
}
}
pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> {
self.pg_ctl(&["-m", mode, "stop"], &None)?;
pub fn stop(&self, destroy: bool) -> Result<()> {
// If we are going to destroy the data directory,
// use immediate shutdown mode; otherwise,
// shut down gracefully to leave the data directory sane.
//
// Postgres is always started from scratch, so stopping
// without destroy is only used for testing and debugging.
//
self.pg_ctl(
if destroy {
&["-m", "immediate", "stop"]
} else {
&["stop"]
},
&None,
)?;
// Also wait for the compute_ctl process to die. It might have some
// cleanup work to do after postgres stops, like syncing safekeepers,

View File

@@ -72,16 +72,11 @@ pub struct LocalEnv {
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>,
// Control plane upcall API for pageserver: if None, we will not run attachment_service. If set, this will
// Control plane location: if None, we will not run attachment_service. If set, this will
// be propagated into each pageserver's configuration.
#[serde(default)]
pub control_plane_api: Option<Url>,
// Control plane upcall API for attachment service. If set, this will be propagated into the
// attachment service's configuration.
#[serde(default)]
pub control_plane_compute_hook_api: Option<Url>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
#[serde(default)]
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
@@ -228,11 +223,7 @@ impl LocalEnv {
}
pub fn attachment_service_bin(&self) -> PathBuf {
// Irrespective of configuration, attachment service binary is always
// run from the same location as neon_local. This means that for compatibility
// tests that run old pageserver/safekeeper, they still run latest attachment service.
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
neon_local_bin_dir.join("attachment_service")
self.neon_distrib_dir.join("attachment_service")
}
pub fn safekeeper_bin(&self) -> PathBuf {

View File

@@ -11,7 +11,7 @@ use std::io;
use std::io::Write;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::Command;
use std::process::{Child, Command};
use std::time::Duration;
use anyhow::{bail, Context};
@@ -30,7 +30,6 @@ use utils::{
lsn::Lsn,
};
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
@@ -162,8 +161,8 @@ impl PageServerNode {
.expect("non-Unicode path")
}
pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
self.start_node(config_overrides, false, register).await
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, false).await
}
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -208,8 +207,7 @@ impl PageServerNode {
&self,
config_overrides: &[&str],
update_config: bool,
register: bool,
) -> anyhow::Result<()> {
) -> anyhow::Result<Child> {
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
@@ -246,26 +244,7 @@ impl PageServerNode {
}
},
)
.await?;
if register {
let attachment_service = AttachmentService::from_env(&self.env);
let (pg_host, pg_port) =
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
attachment_service
.node_register(NodeRegisterRequest {
node_id: self.conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
Ok(())
.await
}
fn pageserver_basic_args<'a>(
@@ -395,16 +374,6 @@ impl PageServerNode {
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
timeline_get_throttle: settings
.remove("timeline_get_throttle")
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -505,16 +474,6 @@ impl PageServerNode {
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
timeline_get_throttle: settings
.remove("timeline_get_throttle")
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
}
};

View File

@@ -7,6 +7,7 @@
//! ```
use std::io::Write;
use std::path::PathBuf;
use std::process::Child;
use std::{io, result};
use anyhow::Context;
@@ -103,7 +104,7 @@ impl SafekeeperNode {
.expect("non-Unicode path")
}
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
print!(
"Starting safekeeper at '{}' in '{}'",
self.pg_connection_config.raw_address(),

View File

@@ -1,9 +0,0 @@
# For documentation on how to configure this file,
# see https://diesel.rs/guides/configuring-diesel-cli
[print_schema]
file = "control_plane/attachment_service/src/schema.rs"
custom_type_derives = ["diesel::query_builder::QueryId"]
[migrations_directory]
dir = "control_plane/attachment_service/migrations"

View File

@@ -21,7 +21,7 @@ We build all images after a successful `release` tests run and push automaticall
## Docker Compose example
You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers.
You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers.
- pageserver x 1
- safekeeper x 3
You can specify the version of the neon cluster using the following environment variables.
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
```
$ cd docker-compose/
$ docker-compose down # remove the containers if exists
$ docker-compose down # remove the conainers if exists
$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version
Creating network "dockercompose_default" with the default driver
Creating docker-compose_storage_broker_1 ... done

View File

@@ -64,7 +64,7 @@ Storage.
The LayerMap tracks what layers exist in a timeline.
Currently, the layer map is just a resizable array (Vec). On a GetPage@LSN or
Currently, the layer map is just a resizeable array (Vec). On a GetPage@LSN or
other read request, the layer map scans through the array to find the right layer
that contains the data for the requested page. The read-code in LayeredTimeline
is aware of the ancestor, and returns data from the ancestor timeline if it's
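
To illustrate the linear scan described above, here is a minimal sketch. The types (`Key`, `Lsn`, `Layer`) and the `covers` check are simplified placeholders, not the actual pageserver API; the real layer map and read path are more involved.

```rust
// Minimal sketch of the linear scan described above. `Key`, `Lsn` and `Layer`
// are simplified placeholders, not the actual pageserver types.
#[derive(Clone, Copy)]
struct Lsn(u64);
#[derive(Clone, Copy)]
struct Key(u64);

struct Layer {
    key_start: Key,
    key_end: Key,
    lsn_start: Lsn,
    lsn_end: Lsn,
}

impl Layer {
    /// Does this layer hold data for `key` in the LSN range of the request?
    fn covers(&self, key: Key, lsn: Lsn) -> bool {
        self.key_start.0 <= key.0
            && key.0 < self.key_end.0
            && self.lsn_start.0 <= lsn.0
            && lsn.0 < self.lsn_end.0
    }
}

struct LayerMap {
    layers: Vec<Layer>,
}

impl LayerMap {
    /// Linear scan over the Vec; returns None if no local layer has the data,
    /// in which case the read code consults the ancestor timeline.
    fn search(&self, key: Key, lsn: Lsn) -> Option<&Layer> {
        self.layers.iter().find(|l| l.covers(key, lsn))
    }
}
```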

View File

@@ -22,7 +22,7 @@ timeline to shutdown. It will also wait for them to finish.
A task registered in the task registry can check if it has been
requested to shut down, by calling `is_shutdown_requested()`. There's
also a `shutdown_watcher()` Future that can be used with `tokio::select!`
also a `shudown_watcher()` Future that can be used with `tokio::select!`
or similar, to wake up on shutdown.
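
As a rough illustration of that pattern, a background task might combine its periodic work with the shutdown watcher as in the sketch below. The signatures of `shutdown_watcher()` and `is_shutdown_requested()` are assumed here for the example, not copied from the task manager module.

```rust
// Illustrative only: the helper signatures are assumed for this sketch and are
// not copied from the task manager module.
use std::time::Duration;

async fn shutdown_watcher() {
    // Assumed: resolves once shutdown has been requested.
}

fn is_shutdown_requested() -> bool {
    // Assumed: returns true once shutdown has been requested.
    false
}

async fn background_loop() {
    loop {
        tokio::select! {
            // Wake up immediately when shutdown is requested.
            _ = shutdown_watcher() => break,
            // Otherwise do one round of periodic work.
            _ = tokio::time::sleep(Duration::from_secs(10)) => {
                if is_shutdown_requested() {
                    break;
                }
                // ... periodic work goes here ...
            }
        }
    }
}
```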

View File

@@ -74,4 +74,4 @@ somewhat wasteful, but because most WAL records only affect one page,
the overhead is acceptable.
The WAL redo always happens for one particular page. If the WAL record
contains changes to other pages, they are ignored.
coantains changes to other pages, they are ignored.
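
A minimal sketch of that rule: when redoing a record against one page, payloads targeting any other page are simply skipped. The types below are illustrative placeholders, not the real walredo interfaces.

```rust
// Illustrative placeholders, not the real walredo types: a record may carry
// payloads for several pages, but only the payload for the requested page is
// applied; the rest are skipped.
#[derive(Clone, Copy, PartialEq, Eq)]
struct BufferTag {
    rel: u32,
    blkno: u32,
}

struct WalRecord {
    /// Per-page payloads carried by this record.
    blocks: Vec<(BufferTag, Vec<u8>)>,
}

fn apply_payload(_page: &mut [u8], _payload: &[u8]) {
    // Placeholder for the actual per-page redo routine.
}

/// Redo `record` against `page`, which holds the contents of `target`.
fn redo_one_page(target: BufferTag, page: &mut [u8], record: &WalRecord) {
    for (tag, payload) in &record.blocks {
        if *tag == target {
            apply_payload(page, payload);
        }
        // Changes to any other page are ignored for this request.
    }
}
```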

View File

@@ -1,420 +0,0 @@
# Splitting cloud console
Created on 17.06.2022
## Summary
Currently we have a `cloud` repository that contains code implementing the public API for our clients as well as code for managing storage and internal infrastructure services. We can split everything user-related from everything storage-related to make it easier to test and maintain.
This RFC proposes to introduce a new control-plane service with HTTP API. The overall architecture will look like this:
```markup
. x
external area x internal area
(our clients) x (our services)
x
x ┌───────────────────────┐
x ┌───────────────┐ > ┌─────────────────────┐ │ Storage (EC2) │
x │ console db │ > │ control-plane db │ │ │
x └───────────────┘ > └─────────────────────┘ │ - safekeepers │
x ▲ > ▲ │ - pageservers │
x │ > │ │ │
┌──────────────────┐ x ┌───────┴───────┐ > │ │ Dependencies │
│ browser UI ├──►│ │ > ┌──────────┴──────────┐ │ │
└──────────────────┘ x │ │ > │ │ │ - etcd │
x │ console ├───────►│ control-plane ├────►│ - S3 │
┌──────────────────┐ x │ │ > │ (deployed in k8s) │ │ - more? │
│public API clients├──►│ │ > │ │ │ │
└──────────────────┘ x └───────┬───────┘ > └──────────┬──────────┘ └───────────────────────┘
x │ > ▲ │ ▲
x │ > │ │ │
x ┌───────┴───────┐ > │ │ ┌───────────┴───────────┐
x │ dependencies │ > │ │ │ │
x │- analytics │ > │ └───────────────►│ computes │
x │- auth │ > │ │ (deployed in k8s) │
x │- billing │ > │ │ │
x └───────────────┘ > │ └───────────────────────┘
x > │ ▲
x > ┌─────┴───────────────┐ │
┌──────────────────┐ x > │ │ │
│ │ x > │ proxy ├─────────────────┘
│ postgres ├───────────────────────────►│ (deployed in k8s) │
│ users │ x > │ │
│ │ x > └─────────────────────┘
└──────────────────┘ x >
>
>
closed-source > open-source
>
>
```
Notes:
- the diagram is simplified in the less important places
- directed arrows are strict and mean that connections in the reverse direction are forbidden
This split is quite complex and this RFC proposes several smaller steps to achieve the larger goal:
1. Start by refactoring the console code; the goal is to have the console and control-plane code in different directories without dependencies on each other.
2. Do similar refactoring for tables in the console database: remove queries selecting data from both console and control-plane; move control-plane tables to a separate database.
3. Implement the control-plane HTTP API serving on a separate TCP port; make all console→control-plane calls go through that HTTP API.
4. Move the control-plane source code to the neon repo; start the control-plane as a separate service.
## Motivation
These are the two most important problems we want to solve:
- Publish open-source implementation of all our cloud/storage features
- Make a unified control-plane that is used in all cloud (serverless) and local (tests) setups
Right now we have some closed-source code in the cloud repo. That code contains the implementation for running Neon computes in k8s, and without that code it's impossible to automatically scale PostgreSQL computes. That means that we don't have an open-source serverless PostgreSQL at the moment.
After splitting and open-sourcing the control-plane service we will have source code and Docker images for all storage services. That control-plane service should have an HTTP API for creating and managing tenants (including all our storage features), while the proxy will listen for incoming connections and create computes on demand.
Improving our test suite is an important task, but it requires a lot of prerequisites and may need a separate RFC. A possible implementation is described in the section [Next steps](#next-steps).
Another piece of motivation is better involvement of the storage development team in the control-plane. By splitting the control-plane from the console, it becomes more convenient to test and develop the control-plane while paying less attention to "business" features, such as user management, billing and analytics.
For example, the console currently requires authentication providers such as GitHub OAuth to work at all, as well as nodejs to be able to build it locally. It will be more convenient to build and run the control-plane locally without these requirements.
## Proposed implementation
### Current state of things
Let's start by defining the current state of things at the moment of this proposal. We have three repositories containing source code:
- open-source `postgres` — our fork of postgres
- open-source `neon` — our main repository for storage source code
- closed-source `cloud` — mostly console backend and UI frontend
This proposal aims not to change anything in the existing code in the `neon` and `postgres` repositories, but to create a control-plane service and move its source code from `cloud` to the `neon` repository. That means we only need to split code in the `cloud` repo, so we will consider only this repository when exploring the source code.
Let's look at the miscellaneous things in the `cloud` repo which are NOT part of the console application, i.e. NOT the Go source code that is compiled to the `./console` binary. There we have:
- command-line tools, such as cloudbench, neonadmin
- markdown documentation
- cloud operations scripts (helm, terraform, ansible)
- configs and other things
- e2e python tests
- incidents playbooks
- UI frontend
- Make build scripts, code generation scripts
- database migrations
- swagger definitions
And let's also take a look at what we have in the console source code, which is the service we'd like to split:
- API Servers
- Public API v2
- Management API v2
- Public API v1
- Admin API v1 (same port as Public API v1)
- Management API v1
- Workers
- Monitor Compute Activity
- Watch Failed Operations
- Availability Checker
- Business Metrics Collector
- Internal Services
- Auth Middleware, UserIsAdmin, Cookies
- Cable Websocket Server
- Admin Services
- Global Settings, Operations, Pageservers, Platforms, Projects, Safekeepers, Users
- Authenticate Proxy
- API Keys
- App Controller, serving UI HTML
- Auth Controller
- Branches
- Projects
- Psql Connect + Passwordless login
- Users
- Cloud Metrics
- User Metrics
- Invites
- Pageserver/Safekeeper management
- Operations, k8s/docker/common logic
- Platforms, Regions
- Project State
- Projects Roles, SCRAM
- Global Settings
- Other things
- segment analytics integration
- sentry integration
- other common utilities packages
### Drawing the splitting line
The most challenging and most important thing is to define the line that will split the new control-plane service from the existing cloud service. If we don't get it right, we can end up with a lot more issues without many benefits.
We propose to define that line as follows:
- everything user-related stays in the console service
- everything storage-related should be in the control-plane service
- for anything that falls in between, we should decide where it goes, but most likely it should stay in the console service
- some similar parts should be in both services, such as admin/management/db_migrations
We call user-related all requests that can be connected to some user. The general idea is to not have any user_id in the control-plane service and to operate exclusively on tenant_id+timeline_id, the same way existing storage services work now (compute, safekeeper, pageserver).
Storage-related things can be defined as doing any of the following:
- using k8s API
- doing requests to any of the storage services (proxy, compute, safekeeper, pageserver, etc..)
- tracking current status of tenants/timelines, managing lifetime of computes
Based on that idea, we can say that the new control-plane service should have the following components:
- single HTTP API for everything
- Create and manage tenants and timelines
- Manage global settings and storage configuration (regions, platforms, safekeepers, pageservers)
- Admin API for storage health inspection and debugging
- Workers
- Monitor Compute Activity
- Watch Failed Operations
- Availability Checker
- Internal Services
- Admin Services
- Global Settings, Operations, Pageservers, Platforms, Tenants, Safekeepers
- Authenticate Proxy
- Branches
- Psql Connect
- Cloud Metrics
- Pageserver/Safekeeper management
- Operations, k8s/docker/common logic
- Platforms, Regions
- Tenant State
- Compute Roles, SCRAM
- Global Settings
---
And other components should probably stay in the console service:
- API Servers (no changes here)
- Public API v2
- Management API v2
- Public API v1
- Admin API v1 (same port as Public API v1)
- Management API v1
- Workers
- Business Metrics Collector
- Internal Services
- Auth Middleware, UserIsAdmin, Cookies
- Cable Websocket Server
- Admin Services
- Users admin stays the same
- Other admin services can redirect requests to the control-plane
- API Keys
- App Controller, serving UI HTML
- Auth Controller
- Projects
- User Metrics
- Invites
- Users
- Passwordless login
- Other things
- segment analytics integration
- sentry integration
- other common utilities packages
There are also miscellaneous things that are useful for all kinds of services. So we can say that these things can be in both services:
- markdown documentation
- e2e python tests
- make build scripts, code generation scripts
- database migrations
- swagger definitions
The single entrypoint to the storage should be the control-plane API. After we define that API, we can have code-generated implementations for the client and for the server. The general idea is to move the code implementing storage components from the console into the API implementation inside the new control-plane service.
After the code is moved to the new service, we can fill the created void by making API calls to the new service:
- authorization of the client
- mapping user_id + project_id to the tenant_id
- calling the control-plane API
### control-plane API
Currently we have the following projects API in the console:
```
GET /projects/{project_id}
PATCH /projects/{project_id}
POST /projects/{project_id}/branches
GET /projects/{project_id}/databases
POST /projects/{project_id}/databases
GET /projects/{project_id}/databases/{database_id}
PUT /projects/{project_id}/databases/{database_id}
DELETE /projects/{project_id}/databases/{database_id}
POST /projects/{project_id}/delete
GET /projects/{project_id}/issue_token
GET /projects/{project_id}/operations
GET /projects/{project_id}/operations/{operation_id}
POST /projects/{project_id}/query
GET /projects/{project_id}/roles
POST /projects/{project_id}/roles
GET /projects/{project_id}/roles/{role_name}
DELETE /projects/{project_id}/roles/{role_name}
POST /projects/{project_id}/roles/{role_name}/reset_password
POST /projects/{project_id}/start
POST /projects/{project_id}/stop
POST /psql_session/{psql_session_id}
```
It looks fine and we probably already have clients relying on it, so we should not change it, at least for now. But most of these endpoints (if not all) are related to storage, and it suggests what the control-plane API should look like:
```
GET /tenants/{tenant_id}
PATCH /tenants/{tenant_id}
POST /tenants/{tenant_id}/branches
GET /tenants/{tenant_id}/databases
POST /tenants/{tenant_id}/databases
GET /tenants/{tenant_id}/databases/{database_id}
PUT /tenants/{tenant_id}/databases/{database_id}
DELETE /tenants/{tenant_id}/databases/{database_id}
POST /tenants/{tenant_id}/delete
GET /tenants/{tenant_id}/issue_token
GET /tenants/{tenant_id}/operations
GET /tenants/{tenant_id}/operations/{operation_id}
POST /tenants/{tenant_id}/query
GET /tenants/{tenant_id}/roles
POST /tenants/{tenant_id}/roles
GET /tenants/{tenant_id}/roles/{role_name}
DELETE /tenants/{tenant_id}/roles/{role_name}
POST /tenants/{tenant_id}/roles/{role_name}/reset_password
POST /tenants/{tenant_id}/start
POST /tenants/{tenant_id}/stop
POST /psql_session/{psql_session_id}
```
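To make the mapping concrete, here is a minimal sketch in Rust (purely illustrative: the real console is written in Go, and the helper name and example ids are hypothetical) of the path translation the console would perform after resolving a project to its tenant:

```rust
// Hypothetical helper: the console looks up tenant_id by (user_id, project_id)
// in its own database, then rewrites the public /projects path into the
// internal /tenants path before calling the control-plane API.
fn control_plane_path(project_path: &str, tenant_id: &str) -> Option<String> {
    // e.g. "/projects/{project_id}/roles" -> "/tenants/{tenant_id}/roles"
    let rest = project_path.strip_prefix("/projects/")?;
    let tail = match rest.split_once('/') {
        Some((_project_id, tail)) => format!("/{tail}"),
        None => String::new(),
    };
    Some(format!("/tenants/{tenant_id}{tail}"))
}

fn main() {
    // Example values only; a real tenant_id comes from the console database.
    let tenant_id = "de200bd42b49cc1814412c7e592dd6e9";
    assert_eq!(
        control_plane_path("/projects/my-project-123/roles", tenant_id).as_deref(),
        Some("/tenants/de200bd42b49cc1814412c7e592dd6e9/roles")
    );
    assert_eq!(
        control_plane_path("/projects/my-project-123", tenant_id).as_deref(),
        Some("/tenants/de200bd42b49cc1814412c7e592dd6e9")
    );
}
```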
One of the options here is to use gRPC instead of HTTP, which has some useful features, but there are some strong points in favor of using plain HTTP:
- HTTP API is easier to use for the clients
- we already have HTTP API in pageserver/safekeeper/console
- we probably want the control-plane API to be similar to the console API available in the cloud
### Getting updates from the storage
There are valid cases when we would like to know what has changed in the storage. For example, the console might want to know when a user has queried and started a compute and when that compute was scaled to zero afterwards, in order to know how much the user should pay for the service. Another example is getting info about reaching disk space limits. Yet another example is analytics, such as how many users had at least one active project in a month.
All of the above cases can happen without using the console, just by accessing the compute through the proxy.
To solve this, we can have a log of events occurring in the storage (an event log). That is very similar to the operations table we have right now; the only difference is that events are immutable and we cannot change them after saving them to the database. For example, we might want to have events for the following activities:
- We finished processing some HTTP API query, such as resetting the password
- We changed some state, such as started or stopped a compute
- Operation is created
- Operation is started for the first time
- Operation is failed for the first time
- Operation is finished
Once we save these events to the database, we can create an HTTP API to subscribe to them. That API can look like this:
```
GET /events/<cursor>
{
"events": [...],
"next_cursor": 123
}
```
It should be possible to replay the event log from some point in time to reconstruct the state of almost anything from the storage services. That means that if we maintain some state in the control-plane database and have a reason to keep the same state in the console database, we can do so by polling events from the control-plane API and updating the state in the console database according to those events.
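As an illustration of the consumer side, here is a minimal polling sketch in Rust (a sketch under assumptions: the base URL, the cursor persistence, and everything beyond the `events`/`next_cursor` shape above are not fixed by this RFC; it assumes reqwest with the blocking and json features plus serde/serde_json):

```rust
use std::{thread, time::Duration};

use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct EventsPage {
    // Events are kept opaque here; the console would match on a concrete event type.
    events: Vec<serde_json::Value>,
    next_cursor: u64,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let base = "http://control-plane.internal:8080"; // hypothetical internal address
    // In practice the last acknowledged cursor is loaded from the console database.
    let mut cursor: u64 = 0;

    loop {
        let page: EventsPage =
            reqwest::blocking::get(format!("{base}/events/{cursor}"))?.json()?;

        for event in &page.events {
            // Apply the event to the console database here (e.g. compute started/stopped).
            // Applying events and advancing the cursor in one transaction gives
            // exactly-once processing on the console side.
            println!("applying event: {event}");
        }

        cursor = page.next_cursor; // persist together with the applied changes
        thread::sleep(Duration::from_secs(1)); // plain polling; long-polling would also work
    }
}
```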
### Next steps
After implementing the control-plane HTTP API and starting the control-plane as a separate service, we might want to think about exploiting the benefits of the new architecture, such as reorganizing the test infrastructure. Possible options are listed in [Next steps](#next-steps-1).
## Non Goals
This RFC doesn't cover the actual cloud deployment scripts and schemas, such as terraform, ansible, k8s yamls and so on.
## Impacted components
Mostly the console, but it can also affect some storage services.
## Scalability
We should support starting several instances of the new control-plane service at the same time.
It should also be possible to run only a single instance of the control-plane, which is useful for local tests.
## Security implications
The new control-plane service is an internal service, so no external requests can reach it. At the same time, it contains an API to do absolutely anything with any of the tenants. That means a bad internal actor can potentially read and write data for all of the tenants. To make this safer, we can use one of these:
- A simple option is to protect all requests with a single private key, so that no one can make requests without having that one key.
- Another option is to have a separate token for every tenant and store these tokens in another secure place. This way it's harder to access all tenants at once, because they have different tokens.
## Alternative implementation
There was an idea to create a k8s operator for managing storage services and computes, but the author of this RFC is not really familiar with it.
As for less radical alternatives, there are other options for the name of the new control-plane service:
- storage-ctl
- cloud
- cloud-ctl
## Pros/cons of proposed approaches (TODO)
Pros:
- All storage features are completely open-source
- Better test coverage, less difference between cloud and local setups
- Easier to develop storage and cloud features, because there is no need to set up the console for that
- Easier to deploy storage-only services to any cloud
Cons:
- All storage features are completely open-source
- Distributed services mean more code to connect different services and potential network issues
- The console needs to have a dependency on the storage API; there can be complications when developing a new feature in a branch
- More code to JOIN data from different services (console and control-plane)
## Definition of Done
We have a new control-plane service running in k8s. The source code for that control-plane service is located in the open-source neon repo.
## Next steps
After we've reached the DoD, we can make further improvements.
The first thing that can benefit from the split is local testing. The same control-plane service can implement starting computes as local processes instead of k8s deployments. If it also supports starting pageservers/safekeepers/proxy for the local setup, then it can completely replace the `./neon_local` binary, which is currently used for testing. The local testing environment can look like this:
```
┌─────────────────────┐ ┌───────────────────────┐
│ │ │ Storage (local) │
│ control-plane db │ │ │
│ (local process) │ │ - safekeepers │
│ │ │ - pageservers │
└──────────▲──────────┘ │ │
│ │ Dependencies │
┌──────────┴──────────┐ │ │
│ │ │ - etcd │
│ control-plane ├────►│ - S3 │
│ (local process) │ │ - more? │
│ │ │ │
└──────────┬──────────┘ └───────────────────────┘
▲ │ ▲
│ │ │
│ │ ┌───────────┴───────────┐
│ │ │ │
│ └───────────────►│ computes │
│ │ (local processes) │
│ │ │
┌──────┴──────────────┐ └───────────────────────┘
│ │ ▲
│ proxy │ │
│ (local process) ├─────────────────┘
│ │
└─────────────────────┘
```
The key thing here is that the local control-plane service has the same API and almost the same implementation as the one deployed in k8s. This allows running the same e2e tests against both cloud and local setups.
For the python test_runner tests, everything can stay mostly the same. To do that, we just need to replace `./neon_local` cli commands with API calls to the control-plane.
The benefit here is having fast local tests that are really close to our cloud setup. Bugs in k8s queries still cannot be found when running computes as local processes, but it should be easy to start k8s locally (for example in k3s) and run the same tests with the control-plane connected to the local k8s.
As for console and UI tests, after the split there should be a way to test these without spinning up all the storage locally. The new control-plane service has a well-defined API, allowing us to mock it. This way we can create UI tests that verify the right calls are issued after specific UI interactions and that we render correct messages when the API returns errors.

View File

@@ -78,7 +78,7 @@ with grpc streams and tokio mpsc channels. The implementation description is at
It is just 500 lines of code and core functionality is complete. 1-1 pub sub
gives about 120k received messages per second; having multiple subscribers in
different connections quickly scales to 1 million received messages per second.
different connecitons quickly scales to 1 million received messages per second.
I had concerns about many concurrent streams in singe connection, but 2^20
subscribers still work (though eat memory, with 10 publishers 20GB are consumed;
in this implementation each publisher holds full copy of all subscribers). There
@@ -95,12 +95,12 @@ other members, with best-effort this is simple.
### Security implications
Communication happens in a private network that is not exposed to users;
additionally we can add auth to the broker.
additionaly we can add auth to the broker.
## Alternative: get existing pub-sub
We could take some existing pub sub solution, e.g. RabbitMQ, Redis. But in this
case IMV simplicity of our own outweighs external dependency costs (RabbitMQ is
case IMV simplicity of our own outweights external dependency costs (RabbitMQ is
much more complicated and needs VM; Redis Rust client maintenance is not
ideal...). Also note that projects like CockroachDB and TiDB are based on gRPC
as well.

View File

@@ -74,7 +74,7 @@ TenantMaintenanceGuard: Like ActiveTenantGuard, but can be held even when the
tenant is not in Active state. Used for operations like attach/detach. Perhaps
allow only one such guard on a Tenant at a time.
Similarly for Timelines. We don't currently have a "state" on Timeline, but I think
Similarly for Timelines. We don't currentl have a "state" on Timeline, but I think
we need at least two states: Active and Stopping. The Stopping state is used at
deletion, to prevent new TimelineActiveGuards from appearing, while you wait for
existing TimelineActiveGuards to die out.
@@ -85,7 +85,7 @@ have a TenantActiveGuard, and the tenant's state changes from Active to
Stopping, the is_shutdown_requested() function should return true, and
shutdown_watcher() future should return.
This signaling doesn't necessarily need to cover all cases. For example, if you
This signaling doesn't neessarily need to cover all cases. For example, if you
have a block of code in spawn_blocking(), it might be acceptable if
is_shutdown_requested() doesn't return true even though the tenant is in
Stopping state, as long as the code finishes reasonably fast.

View File

@@ -37,7 +37,7 @@ sequenceDiagram
```
At this point it is not possible to restore from index, it contains L2 which
is no longer available in s3 and doesn't contain L3 added by compaction by the
is no longer available in s3 and doesnt contain L3 added by compaction by the
first pageserver. So if any of the pageservers restart initial sync will fail
(or in on-demand world it will fail a bit later during page request from
missing layer)
@@ -74,7 +74,7 @@ One possible solution for relocation case is to orchestrate background jobs
from outside. The oracle who runs migration can turn off background jobs on
PS1 before migration and then run migration -> enable them on PS2. The problem
comes if migration fails. In this case in order to resume background jobs
oracle needs to guarantee that PS2 doesn't run background jobs and if it doesn't
oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt
respond then PS1 is stuck unable to run compaction/gc. This cannot be solved
without human ensuring that no upload from PS2 can happen. In order to be able
to resolve this automatically CAS is required on S3 side so pageserver can
@@ -128,7 +128,7 @@ During discussion it seems that we converged on the approach consisting of:
whether we need to apply change to the index state or not.
- Responsibility for running background jobs is assigned externally. Pageserver
keeps locally persistent flag for each tenant that indicates whether this
pageserver is considered as primary one or not. TODO what happens if we
pageserver is considered as primary one or not. TODO what happends if we
crash and cannot start for some extended period of time? Control plane can
assign ownership to some other pageserver. Pageserver needs some way to check
if its still the blessed one. Maybe by explicit request to control plane on
@@ -138,7 +138,7 @@ Requirement for deterministic layer generation was considered overly strict
because of two reasons:
- It can limit possible optimizations e g when pageserver wants to reshuffle
some data locally and doesn't want to coordinate this
some data locally and doesnt want to coordinate this
- The deterministic algorithm itself can change so during deployments for some
time there will be two different version running at the same time which can
cause non determinism
@@ -164,7 +164,7 @@ sequenceDiagram
CP->>PS1: Yes
deactivate CP
PS1->>S3: Fetch PS1 index.
note over PS1: Continue operations, start background jobs
note over PS1: Continue operations, start backround jobs
note over PS1,PS2: PS1 starts up and still and is not a leader anymore
PS1->>CP: Am I still the leader for Tenant X?
CP->>PS1: No
@@ -203,7 +203,7 @@ sequenceDiagram
### Eviction
When two pageservers operate on a tenant for extended period of time follower
doesn't perform write operations in s3. When layer is evicted follower relies
doesnt perform write operations in s3. When layer is evicted follower relies
on updates from primary to get info about layers it needs to cover range for
evicted layer.

View File

@@ -4,7 +4,7 @@ Created on 08.03.23
## Motivation
Currently we don't delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC).
Currently we dont delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC).
This RFC aims to spin a discussion to come to a robust deletion solution that wont put us in into a corner for features like postponed deletion (when we keep data for user to be able to restore a project if it was deleted by accident)
@@ -75,9 +75,9 @@ Remote one is needed for cases when pageserver is lost during deletion so other
Why local mark file is needed?
If we don't have one, we have two choices, delete local data before deleting the remote part or do that after.
If we dont have one, we have two choices, delete local data before deleting the remote part or do that after.
If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote counterparts of locally available tenants).
If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote conuterparts of locally available tenants).
If we delete local data after remote then at the end of the sequence when remote mark file is deleted if pageserver restart happens then the state is the same to situation when pageserver just missing data on remote without knowing the fact that this data is intended to be deleted. In this case the current behavior is upload everything local-only to remote.
@@ -145,7 +145,7 @@ sequenceDiagram
CP->>PS: Retry delete tenant
PS->>CP: Not modified
else Mark is missing
note over PS: Continue to operate the tenant as if deletion didn't happen
note over PS: Continue to operate the tenant as if deletion didnt happen
note over CP: Eventually console should <br> retry delete request
@@ -168,7 +168,7 @@ sequenceDiagram
PS->>CP: True
```
Similar sequence applies when both local and remote marks were persisted but Control Plane still didn't receive a response.
Similar sequence applies when both local and remote marks were persisted but Control Plane still didnt receive a response.
If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success.
@@ -187,7 +187,7 @@ If pageseserver is lost then the deleted tenant should be attached to different
##### Restrictions for tenant that is in progress of being deleted
I propose to add another state to tenant/timeline - PendingDelete. This state shouldn't allow executing any operations aside from polling the deletion status.
I propose to add another state to tenant/timeline - PendingDelete. This state shouldnt allow executing any operations aside from polling the deletion status.
#### Summary
@@ -237,7 +237,7 @@ New branch gets created
PS1 starts up (is it possible or we just recycle it?)
PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane.
So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage needs to ask control plane.
So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage neeeds to ask control plane.
### Summary
@@ -250,7 +250,7 @@ Cons:
Pros:
- Easier to reason about if you don't have to account for pageserver restarts
- Easier to reason about if you dont have to account for pageserver restarts
### Extra notes
@@ -262,7 +262,7 @@ Delayed deletion can be done with both approaches. As discussed with Anna (@step
After discussion in comments I see that we settled on two options (though a bit different from ones described in rfc). First one is the same - pageserver owns as much as possible. The second option is that pageserver owns markers thing, but actual deletion happens in control plane by repeatedly calling ls + delete.
To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge about paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesn't support shutting down pageservers, which are separate docker containers there instead of just processes.
To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge abouth paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesnt support shutting down pageservers, which are separate docker containers there instead of just processes.
With pageserver owning everything we still give the retry logic to control plane but its easier to duplicate if needed compared to sharing inner s3 workings. We will have needed tests for retry logic in neon repo.

View File

@@ -75,7 +75,7 @@ sequenceDiagram
```
At this point it is not possible to restore the state from index, it contains L2 which
is no longer available in s3 and doesn't contain L3 added by compaction by the
is no longer available in s3 and doesnt contain L3 added by compaction by the
first pageserver. So if any of the pageservers restart, initial sync will fail
(or in on-demand world it will fail a bit later during page request from
missing layer)
@@ -171,7 +171,7 @@ sequenceDiagram
Another problem is a possibility of concurrent branch creation calls.
I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we don't need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state.
I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we dont need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state.
## Simplistic approach

View File

@@ -55,7 +55,7 @@ When PostgreSQL requests a file, `compute_ctl` downloads it.
PostgreSQL requests files in the following cases:
- When loading a preload library set in `local_preload_libraries`
- When explicitly loading a library with `LOAD`
- When creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)))
- Wnen creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)))
#### Summary

View File

@@ -26,7 +26,7 @@ plane guarantee prevents robust response to failures, as if a pageserver is unre
we may not detach from it. The mechanism in this RFC fixes this, by making it safe to
attach to a new, different pageserver even if an unresponsive pageserver may be running.
Further lack of safety during split-brain conditions blocks two important features where occasional
Futher, lack of safety during split-brain conditions blocks two important features where occasional
split-brain conditions are part of the design assumptions:
- seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029))
@@ -490,11 +490,11 @@ The above makes it safe for control plane to change the assignment of
tenant to pageserver in control plane while a timeline creation is ongoing.
The reason is that the creation request against the new assigned pageserver
uses a new generation number. However, care must be taken by control plane
to ensure that a "timeline creation successful" response from some pageserver
to ensure that a "timeline creation successul" response from some pageserver
is checked for the pageserver's generation for that timeline's tenant still being the latest.
If it is not the latest, the response does not constitute a successful timeline creation.
It is acceptable to discard such responses, the scrubber will clean up the S3 state.
It is better to issue a timeline deletion request to the stale attachment.
It is better to issue a timelien deletion request to the stale attachment.
#### Timeline Deletion
@@ -633,7 +633,7 @@ As outlined in the Part 1 on correctness, it is critical that deletions are only
executed once the key is not referenced anywhere in S3.
This property is obviously upheld by the scheme above.
#### We Accept Object Leakage In Acceptable Circumstances
#### We Accept Object Leakage In Acceptable Circumcstances
If we crash in the flow above between (2) and (3), we lose track of unreferenced object.
Further, enqueuing a single to the persistent queue may not be durable immediately to amortize cost of flush to disk.

View File

@@ -162,7 +162,7 @@ struct Tenant {
...
txns: HashMap<TxnId, Transaction>,
// the most recently started txn's id; only most recently started can win
// the most recently started txn's id; only most recently sarted can win
next_winner_txn: Option<TxnId>,
}
struct Transaction {
@@ -186,7 +186,7 @@ A transaction T in state Committed has subsequent transactions that may or may n
So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged:
- Committed: delete objects on the deadlist.
- Commited: delete objects on the deadlist.
- We dont need a LIST request here, the deadlist is sufficient. So, its really cheap.
- This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T s deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open. However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they dont matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection theyre destined to get anyways. 404s for RejectAcknowledged is handled below.
- RejectAcknowledged: delete all objects created in that txn, and discard deadlists.
@@ -242,15 +242,15 @@ If a pageserver is unresponsive from Control Planes / Computes perspective
At this point, availability is restored and user pain relieved.
Whats left is to somehow close the doomed transaction of the unresponsive pageserver, so that it becomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure:
Whats left is to somehow close the doomed transaction of the unresponsive pageserver, so that it beomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure:
1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above.
2. Make a human operator investigate decide what to do (next morning, NO ONCALL ALERT):
1. Inspect the instance, investigate logs, understand root cause.
2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC.
3. Use below procedure to decommission pageserver.
3. Use below procedure to decomission pageserver.
### Decommissioning A Pageserver (Dead or Alive-but-Unresponsive)
### Decomissioning A Pageserver (Dead or Alive-but-Unrespsonive)
The solution, enabled by this proposal:
@@ -310,7 +310,7 @@ Issues that we discussed:
1. In abstract terms, this proposal provides a linearized history for a given S3 prefix.
2. In concrete terms, this proposal provides a linearized history per tenant.
3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history.
4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written proposal like this one:************************************************************************************
4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written prospoal like this one:************************************************************************************
1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT
2. @Dmitry Rodionov :
3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment.

View File

@@ -54,7 +54,7 @@ If the compaction algorithm doesn't change between the two compaction runs, is d
*However*:
1. the file size of the overwritten L1s may not be identical, and
2. the bit pattern of the overwritten L1s may not be identical, and,
3. in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
3. in the future, we may want to make the compaction code non-determinstic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted).
@@ -63,7 +63,7 @@ But node B based its world view on the version of node A's `index_part.json` fro
That earlier `index_part.json`` contained the file size of the pre-overwrite L1.
If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1.
Effectively, the data in the L1 has become inaccessible to node B.
If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same problem.
If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same probem.
If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
@@ -121,7 +121,7 @@ Multi-object changes that previously created and removed files in timeline dir a
* atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic
* local timeline dir state:
* irrelevant for layer map content => irrelevant for atomic updates / crash consistency
* if we crash after index part PUT, local layer files will be used, so, no on-demand downloads needed for them
* if we crash after index part PUT, local layer files will be used, so, no on-demand downloads neede for them
* if we crash before index part PUT, local layer files will be deleted
## Trade-Offs
@@ -140,7 +140,7 @@ Assuming upload queue allows for unlimited queue depth (that's what it does toda
* wal ingest: currently unbounded
* L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()`
* Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M.
* In practice, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`.
* In practive, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`.
* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
* I have no intuition how expensive / long-running it is in reality.
* gc: `update_gc_info`` work (not substantial, AFAIK)
@@ -158,7 +158,7 @@ Pageserver crashes are very rare ; it would likely be acceptable to re-do the lo
However, regular pageserver restart happen frequently, e.g., during weekly deploys.
In general, pageserver restart faces the problem of tenants that "take too long" to shut down.
They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down.
They are a problem because other tenants that shut down quickly are unavailble while we wait for the slow tenants to shut down.
We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file).
A longer budget would expose tenants that are done early to a longer downtime.
A short budget would risk throwing away more work that'd have to be re-done after restart.
@@ -236,7 +236,7 @@ tenants/$tenant/timelines/$timeline/$key_and_lsn_range
tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
```
To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`.
To guarantee uniqueness, the unqiue number is a sequence number, stored in `index_part.json`.
This alternative does not solve atomic layer map updates.
In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers.
@@ -246,11 +246,11 @@ We'd need to write a deduplication pass that checks if perfectly overlapping lay
However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more acute.
But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more accute.
The proposed design in this RFC addresses both.
So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
That way, we avoid a phase where the crash-during-compaction problem is acute.
That way, we avoid a phase where the crash-during-compaction problem is accute.
## Related issues

View File

@@ -596,4 +596,4 @@ pageservers are updated to be aware of it.
As well as simplifying implementation, putting heatmaps in S3 will be useful
for future analytics purposes -- gathering aggregated statistics on activity
patterns across many tenants may be done directly from data in S3.
pattersn across many tenants may be done directly from data in S3.

View File

@@ -147,7 +147,7 @@ Separating corrupt writes from non-corrupt ones is a hard problem in general,
and if the application was involved in making the corrupt write, a recovery
would also involve the application. Therefore, corruption that has made it into
the WAL is outside of the scope of this feature. However, the WAL replay can be
issued to right before the point in time where the corruption occurred. Then the
issued to right before the point in time where the corruption occured. Then the
data loss is isolated to post-corruption writes only.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
@@ -161,7 +161,7 @@ limits and billing we apply to existing timelines.
## Proposed implementation
The first problem to keep in mind is the reproducibility of `initdb`.
The first problem to keep in mind is the reproducability of `initdb`.
So an initial step would be to upload `initdb` snapshots to S3.
After that, we'd have the endpoint spawn a background process which

View File

@@ -69,7 +69,7 @@ However, unlike above, an ideal solution will
* This means, read each `DiskBtree` page at most once.
* Facilitate merging of the reads we issue to the OS and eventually NVMe.
Each of these items above represents a significant amount of work.
Each of these items above represents a signficant amount of work.
## Performance

View File

@@ -21,7 +21,7 @@ implementation where we keep more data than we would need to, do not
change the synthetic size or incur any costs to the user.
The synthetic size is calculated for the whole project. It is not
straightforward to attribute size to individual branches. See "What is
straighforward to attribute size to individual branches. See "What is
the size of an individual branch?" for discussion on those
difficulties.
@@ -248,7 +248,7 @@ and truncate the WAL.
Synthetic size is calculated for the whole project, and includes all
branches. There is no such thing as the size of a branch, because it
is not straightforward to attribute the parts of size to individual
is not straighforward to attribute the parts of size to individual
branches.
## Example: attributing size to branches

View File

@@ -90,9 +90,6 @@ pub enum ComputeFeature {
/// track short-lived connections as user activity.
ActivityMonitorExperimental,
/// Pre-install and initialize anon extension for every database in the cluster
AnonExtension,
/// This is a special feature flag that is used to represent unknown feature flags.
/// Basically all unknown to enum flags are represented as this one. See unit test
/// `parse_unknown_features()` for more details.

View File

@@ -1,18 +0,0 @@
[package]
name = "desim"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
rand.workspace = true
tracing.workspace = true
bytes.workspace = true
utils.workspace = true
parking_lot.workspace = true
hex.workspace = true
scopeguard.workspace = true
smallvec = { workspace = true, features = ["write"] }
workspace_hack.workspace = true

View File

@@ -1,7 +0,0 @@
# Discrete Event SIMulator
This is a library for running simulations of distributed systems. The main idea is borrowed from [FoundationDB](https://www.youtube.com/watch?v=4fFDFbi3toc).
Each node runs as a separate thread. This library has not been optimized for speed yet, but it's already much faster than running the usual integration tests in real time, because it uses virtual simulation time and can fast-forward time to skip intervals where all nodes are doing nothing but sleeping or waiting for something.
The original purpose of this library is to test the walproposer and safekeeper implementations working together, in scenarios close to the real-world environment. The simulator is deterministic and can inject network failures without waiting minutes of wall time for a timeout to trigger, which makes it easier to find bugs in our consensus implementation compared to using integration tests.
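For a feel of how the pieces fit together, here is a minimal usage sketch (a sketch under assumptions: the `desim::{chan, executor, time}` module paths and the `Timing::new()` constructor are guesses, since the time module and crate root are not part of this diff; the rest uses only the `Runtime`, `Chan` and `executor` APIs shown below):

```rust
use std::sync::Arc;

use desim::chan::Chan;
use desim::executor::{self, Runtime};
use desim::time::Timing;

fn main() {
    let clock = Arc::new(Timing::new()); // assumed constructor
    let mut runtime = Runtime::new(clock);

    let chan: Chan<u64> = Chan::new();
    let tx = chan.clone();

    // Sender: sleeps 10ms of virtual time, then sends a message.
    runtime.spawn(move || {
        executor::yield_me(10);
        tx.send(42);
    });

    // Receiver: blocks (in virtual time) until the message arrives, then exits.
    let receiver = runtime.spawn(move || {
        let v = chan.recv();
        executor::exit(0, format!("got {v} at t={}", executor::now()));
    });

    // Drive the simulation until all threads are finished or blocked forever.
    while runtime.step() {}
    println!("receiver result: {:?}", receiver.result());
}
```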

View File

@@ -1,108 +0,0 @@
use std::{collections::VecDeque, sync::Arc};
use parking_lot::{Mutex, MutexGuard};
use crate::executor::{self, PollSome, Waker};
/// FIFO channel with blocking send and receive. Can be cloned and shared between threads.
/// Blocking functions should be used only from threads that are managed by the executor.
pub struct Chan<T> {
shared: Arc<State<T>>,
}
impl<T> Clone for Chan<T> {
fn clone(&self) -> Self {
Chan {
shared: self.shared.clone(),
}
}
}
impl<T> Default for Chan<T> {
fn default() -> Self {
Self::new()
}
}
impl<T> Chan<T> {
pub fn new() -> Chan<T> {
Chan {
shared: Arc::new(State {
queue: Mutex::new(VecDeque::new()),
waker: Waker::new(),
}),
}
}
/// Get a message from the front of the queue, block if the queue is empty.
/// If not called from the executor thread, it can block forever.
pub fn recv(&self) -> T {
self.shared.recv()
}
/// Panic if the queue is empty.
pub fn must_recv(&self) -> T {
self.shared
.try_recv()
.expect("message should've been ready")
}
/// Get a message from the front of the queue, return None if the queue is empty.
/// Never blocks.
pub fn try_recv(&self) -> Option<T> {
self.shared.try_recv()
}
/// Send a message to the back of the queue.
pub fn send(&self, t: T) {
self.shared.send(t);
}
}
struct State<T> {
queue: Mutex<VecDeque<T>>,
waker: Waker,
}
impl<T> State<T> {
fn send(&self, t: T) {
self.queue.lock().push_back(t);
self.waker.wake_all();
}
fn try_recv(&self) -> Option<T> {
let mut q = self.queue.lock();
q.pop_front()
}
fn recv(&self) -> T {
// interrupt the receiver to prevent consuming everything at once
executor::yield_me(0);
let mut queue = self.queue.lock();
if let Some(t) = queue.pop_front() {
return t;
}
loop {
self.waker.wake_me_later();
if let Some(t) = queue.pop_front() {
return t;
}
MutexGuard::unlocked(&mut queue, || {
executor::yield_me(-1);
});
}
}
}
impl<T> PollSome for Chan<T> {
/// Schedules a wakeup for the current thread.
fn wake_me(&self) {
self.shared.waker.wake_me_later();
}
/// Checks if chan has any pending messages.
fn has_some(&self) -> bool {
!self.shared.queue.lock().is_empty()
}
}

View File

@@ -1,483 +0,0 @@
use std::{
panic::AssertUnwindSafe,
sync::{
atomic::{AtomicBool, AtomicU32, AtomicU8, Ordering},
mpsc, Arc, OnceLock,
},
thread::JoinHandle,
};
use tracing::{debug, error, trace};
use crate::time::Timing;
/// Stores status of the running threads. Threads are registered in the runtime upon creation
/// and deregistered upon termination.
pub struct Runtime {
// stores handles to all threads that are currently running
threads: Vec<ThreadHandle>,
// stores current time and pending wakeups
clock: Arc<Timing>,
// thread counter
thread_counter: AtomicU32,
// Thread step counter -- how many times all threads has been actually
// stepped (note that all world/time/executor/thread have slightly different
// meaning of steps). For observability.
pub step_counter: u64,
}
impl Runtime {
/// Init new runtime, no running threads.
pub fn new(clock: Arc<Timing>) -> Self {
Self {
threads: Vec::new(),
clock,
thread_counter: AtomicU32::new(0),
step_counter: 0,
}
}
/// Spawn a new thread and register it in the runtime.
pub fn spawn<F>(&mut self, f: F) -> ExternalHandle
where
F: FnOnce() + Send + 'static,
{
let (tx, rx) = mpsc::channel();
let clock = self.clock.clone();
let tid = self.thread_counter.fetch_add(1, Ordering::SeqCst);
debug!("spawning thread-{}", tid);
let join = std::thread::spawn(move || {
let _guard = tracing::info_span!("", tid).entered();
let res = std::panic::catch_unwind(AssertUnwindSafe(|| {
with_thread_context(|ctx| {
assert!(ctx.clock.set(clock).is_ok());
ctx.id.store(tid, Ordering::SeqCst);
tx.send(ctx.clone()).expect("failed to send thread context");
// suspend thread to put it to `threads` in sleeping state
ctx.yield_me(0);
});
// start user-provided function
f();
}));
debug!("thread finished");
if let Err(e) = res {
with_thread_context(|ctx| {
if !ctx.allow_panic.load(std::sync::atomic::Ordering::SeqCst) {
error!("thread panicked, terminating the process: {:?}", e);
std::process::exit(1);
}
debug!("thread panicked: {:?}", e);
let mut result = ctx.result.lock();
if result.0 == -1 {
*result = (256, format!("thread panicked: {:?}", e));
}
});
}
with_thread_context(|ctx| {
ctx.finish_me();
});
});
let ctx = rx.recv().expect("failed to receive thread context");
let handle = ThreadHandle::new(ctx.clone(), join);
self.threads.push(handle);
ExternalHandle { ctx }
}
/// Returns true if there are any unfinished activity, such as running thread or pending events.
/// Otherwise returns false, which means all threads are blocked forever.
pub fn step(&mut self) -> bool {
trace!("runtime step");
// have we run any thread?
let mut ran = false;
self.threads.retain(|thread: &ThreadHandle| {
let res = thread.ctx.wakeup.compare_exchange(
PENDING_WAKEUP,
NO_WAKEUP,
Ordering::SeqCst,
Ordering::SeqCst,
);
if res.is_err() {
// thread has no pending wakeups, leaving as is
return true;
}
ran = true;
trace!("entering thread-{}", thread.ctx.tid());
let status = thread.step();
self.step_counter += 1;
trace!(
"out of thread-{} with status {:?}",
thread.ctx.tid(),
status
);
if status == Status::Sleep {
true
} else {
trace!("thread has finished");
// removing the thread from the list
false
}
});
if !ran {
trace!("no threads were run, stepping clock");
if let Some(ctx_to_wake) = self.clock.step() {
trace!("waking up thread-{}", ctx_to_wake.tid());
ctx_to_wake.inc_wake();
} else {
return false;
}
}
true
}
/// Kill all threads. This is done by setting a flag in each thread context and waking it up.
pub fn crash_all_threads(&mut self) {
for thread in self.threads.iter() {
thread.ctx.crash_stop();
}
// all threads should be finished after a few steps
while !self.threads.is_empty() {
self.step();
}
}
}
impl Drop for Runtime {
fn drop(&mut self) {
debug!("dropping the runtime");
self.crash_all_threads();
}
}
#[derive(Clone)]
pub struct ExternalHandle {
ctx: Arc<ThreadContext>,
}
impl ExternalHandle {
/// Returns true if thread has finished execution.
pub fn is_finished(&self) -> bool {
let status = self.ctx.mutex.lock();
*status == Status::Finished
}
/// Returns exitcode and message, which is available after thread has finished execution.
pub fn result(&self) -> (i32, String) {
let result = self.ctx.result.lock();
result.clone()
}
/// Returns thread id.
pub fn id(&self) -> u32 {
self.ctx.id.load(Ordering::SeqCst)
}
/// Sets a flag to crash thread on the next wakeup.
pub fn crash_stop(&self) {
self.ctx.crash_stop();
}
}
struct ThreadHandle {
ctx: Arc<ThreadContext>,
_join: JoinHandle<()>,
}
impl ThreadHandle {
/// Create a new [`ThreadHandle`] and wait until thread will enter [`Status::Sleep`] state.
fn new(ctx: Arc<ThreadContext>, join: JoinHandle<()>) -> Self {
let mut status = ctx.mutex.lock();
// wait until thread will go into the first yield
while *status != Status::Sleep {
ctx.condvar.wait(&mut status);
}
drop(status);
Self { ctx, _join: join }
}
/// Allows thread to execute one step of its execution.
/// Returns [`Status`] of the thread after the step.
fn step(&self) -> Status {
let mut status = self.ctx.mutex.lock();
assert!(matches!(*status, Status::Sleep));
*status = Status::Running;
self.ctx.condvar.notify_all();
while *status == Status::Running {
self.ctx.condvar.wait(&mut status);
}
*status
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Status {
/// Thread is running.
Running,
/// Waiting for event to complete, will be resumed by the executor step, once wakeup flag is set.
Sleep,
/// Thread finished execution.
Finished,
}
const NO_WAKEUP: u8 = 0;
const PENDING_WAKEUP: u8 = 1;
pub struct ThreadContext {
id: AtomicU32,
// used to block thread until it is woken up
mutex: parking_lot::Mutex<Status>,
condvar: parking_lot::Condvar,
// used as a flag to indicate runtime that thread is ready to be woken up
wakeup: AtomicU8,
clock: OnceLock<Arc<Timing>>,
// execution result, set by exit() call
result: parking_lot::Mutex<(i32, String)>,
// determines if process should be killed on receiving panic
allow_panic: AtomicBool,
// acts as a signal that thread should crash itself on the next wakeup
crash_request: AtomicBool,
}
impl ThreadContext {
pub(crate) fn new() -> Self {
Self {
id: AtomicU32::new(0),
mutex: parking_lot::Mutex::new(Status::Running),
condvar: parking_lot::Condvar::new(),
wakeup: AtomicU8::new(NO_WAKEUP),
clock: OnceLock::new(),
result: parking_lot::Mutex::new((-1, String::new())),
allow_panic: AtomicBool::new(false),
crash_request: AtomicBool::new(false),
}
}
}
// Functions for executor to control thread execution.
impl ThreadContext {
/// Set atomic flag to indicate that thread is ready to be woken up.
fn inc_wake(&self) {
self.wakeup.store(PENDING_WAKEUP, Ordering::SeqCst);
}
/// Internal function used for event queues.
pub(crate) fn schedule_wakeup(self: &Arc<Self>, after_ms: u64) {
self.clock
.get()
.unwrap()
.schedule_wakeup(after_ms, self.clone());
}
fn tid(&self) -> u32 {
self.id.load(Ordering::SeqCst)
}
fn crash_stop(&self) {
let status = self.mutex.lock();
if *status == Status::Finished {
debug!(
"trying to crash thread-{}, which is already finished",
self.tid()
);
return;
}
assert!(matches!(*status, Status::Sleep));
drop(status);
self.allow_panic.store(true, Ordering::SeqCst);
self.crash_request.store(true, Ordering::SeqCst);
// set a wakeup
self.inc_wake();
// it will panic on the next wakeup
}
}
// Internal functions.
impl ThreadContext {
/// Blocks the thread until it is woken up by the executor. If `after_ms` is 0, it will be
/// woken on the next step. If `after_ms` > 0, a wakeup is scheduled after that time.
/// If `after_ms` < 0, no wakeup is scheduled inside `yield_me`; it should be arranged
/// before calling this function.
fn yield_me(self: &Arc<Self>, after_ms: i64) {
let mut status = self.mutex.lock();
assert!(matches!(*status, Status::Running));
match after_ms.cmp(&0) {
std::cmp::Ordering::Less => {
// block until something wakes us up
}
std::cmp::Ordering::Equal => {
// tell executor that we are ready to be woken up
self.inc_wake();
}
std::cmp::Ordering::Greater => {
// schedule wakeup
self.clock
.get()
.unwrap()
.schedule_wakeup(after_ms as u64, self.clone());
}
}
*status = Status::Sleep;
self.condvar.notify_all();
// wait until executor wakes us up
while *status != Status::Running {
self.condvar.wait(&mut status);
}
if self.crash_request.load(Ordering::SeqCst) {
panic!("crashed by request");
}
}
/// Called exactly once, right before the thread finishes execution.
fn finish_me(&self) {
let mut status = self.mutex.lock();
assert!(matches!(*status, Status::Running));
*status = Status::Finished;
{
let mut result = self.result.lock();
if result.0 == -1 {
*result = (0, "finished normally".to_owned());
}
}
self.condvar.notify_all();
}
}
/// Invokes the given closure with a reference to the current thread's [`ThreadContext`].
#[inline(always)]
fn with_thread_context<T>(f: impl FnOnce(&Arc<ThreadContext>) -> T) -> T {
thread_local!(static THREAD_DATA: Arc<ThreadContext> = Arc::new(ThreadContext::new()));
THREAD_DATA.with(f)
}
/// Waker is used to wake up threads that are blocked on a condition.
/// It keeps track of contexts [`Arc<ThreadContext>`] and can mark several
/// contexts for wakeup to deliver a notification.
pub struct Waker {
// contexts that are waiting for a notification
contexts: parking_lot::Mutex<smallvec::SmallVec<[Arc<ThreadContext>; 8]>>,
}
impl Default for Waker {
fn default() -> Self {
Self::new()
}
}
impl Waker {
pub fn new() -> Self {
Self {
contexts: parking_lot::Mutex::new(smallvec::SmallVec::new()),
}
}
/// Subscribe current thread to receive a wake notification later.
pub fn wake_me_later(&self) {
with_thread_context(|ctx| {
self.contexts.lock().push(ctx.clone());
});
}
/// Wake up all threads that are waiting for a notification and clear the list.
pub fn wake_all(&self) {
let mut v = self.contexts.lock();
for ctx in v.iter() {
ctx.inc_wake();
}
v.clear();
}
}
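// A minimal usage sketch (the helper name below is made up for illustration,
// it is not part of this module): subscribe with `wake_me_later` *before*
// checking the condition so a notification sent in between is not lost, then
// block with `yield_me(-1)` until `wake_all` is called by another thread.
#[allow(dead_code)]
fn wait_until(waker: &Waker, cond: impl Fn() -> bool) {
    loop {
        waker.wake_me_later();
        if cond() {
            return;
        }
        // Block until some other thread calls `waker.wake_all()`.
        yield_me(-1);
    }
}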
/// See [`ThreadContext::yield_me`].
pub fn yield_me(after_ms: i64) {
with_thread_context(|ctx| ctx.yield_me(after_ms))
}
/// Get current time.
pub fn now() -> u64 {
with_thread_context(|ctx| ctx.clock.get().unwrap().now())
}
pub fn exit(code: i32, msg: String) {
with_thread_context(|ctx| {
ctx.allow_panic.store(true, Ordering::SeqCst);
let mut result = ctx.result.lock();
*result = (code, msg);
panic!("exit");
});
}
pub(crate) fn get_thread_ctx() -> Arc<ThreadContext> {
with_thread_context(|ctx| ctx.clone())
}
/// Trait for polling channels until they have something.
pub trait PollSome {
/// Schedule wakeup for message arrival.
fn wake_me(&self);
/// Check if channel has a ready message.
fn has_some(&self) -> bool;
}
/// Blocks the current thread until one of the channels has a ready message. Returns the
/// index of the channel that has a message. If the timeout is reached, returns None.
///
/// A negative timeout means block forever. A zero timeout means check the channels and
/// return immediately. A positive timeout means block until the timeout is reached.
pub fn epoll_chans(chans: &[Box<dyn PollSome>], timeout: i64) -> Option<usize> {
let deadline = if timeout < 0 {
0
} else {
now() + timeout as u64
};
loop {
for chan in chans {
chan.wake_me()
}
for (i, chan) in chans.iter().enumerate() {
if chan.has_some() {
return Some(i);
}
}
if timeout < 0 {
// block until wakeup
yield_me(-1);
} else {
let current_time = now();
if current_time >= deadline {
return None;
}
yield_me((deadline - current_time) as i64);
}
}
}
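// A minimal sketch of driving the executor directly, following the same pattern
// that `World::new` in the sibling `world` module uses: spawn a thread and
// single-step it until nothing is left to run. The helper name is illustrative.
#[allow(dead_code)]
fn run_to_completion() {
    let timing = Arc::new(Timing::new());
    let mut runtime = Runtime::new(timing);
    let handle = runtime.spawn(|| {
        // Give up control once, then finish normally.
        yield_me(0);
    });
    // `step()` returns false once no thread is currently runnable.
    while runtime.step() {}
    assert!(handle.is_finished());
}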

View File

@@ -1,8 +0,0 @@
pub mod chan;
pub mod executor;
pub mod network;
pub mod node_os;
pub mod options;
pub mod proto;
pub mod time;
pub mod world;

View File

@@ -1,451 +0,0 @@
use std::{
cmp::Ordering,
collections::{BinaryHeap, VecDeque},
fmt::{self, Debug},
ops::DerefMut,
sync::{mpsc, Arc},
};
use parking_lot::{
lock_api::{MappedMutexGuard, MutexGuard},
Mutex, RawMutex,
};
use rand::rngs::StdRng;
use tracing::debug;
use crate::{
executor::{self, ThreadContext},
options::NetworkOptions,
proto::NetEvent,
proto::NodeEvent,
};
use super::{chan::Chan, proto::AnyMessage};
pub struct NetworkTask {
options: Arc<NetworkOptions>,
connections: Mutex<Vec<VirtualConnection>>,
/// min-heap of connections having something to deliver.
events: Mutex<BinaryHeap<Event>>,
task_context: Arc<ThreadContext>,
}
impl NetworkTask {
pub fn start_new(options: Arc<NetworkOptions>, tx: mpsc::Sender<Arc<NetworkTask>>) {
let ctx = executor::get_thread_ctx();
let task = Arc::new(Self {
options,
connections: Mutex::new(Vec::new()),
events: Mutex::new(BinaryHeap::new()),
task_context: ctx,
});
// send the task upstream
tx.send(task.clone()).unwrap();
// start the task
task.start();
}
pub fn start_new_connection(self: &Arc<Self>, rng: StdRng, dst_accept: Chan<NodeEvent>) -> TCP {
let now = executor::now();
let connection_id = self.connections.lock().len();
let vc = VirtualConnection {
connection_id,
dst_accept,
dst_sockets: [Chan::new(), Chan::new()],
state: Mutex::new(ConnectionState {
buffers: [NetworkBuffer::new(None), NetworkBuffer::new(Some(now))],
rng,
}),
};
vc.schedule_timeout(self);
vc.send_connect(self);
let recv_chan = vc.dst_sockets[0].clone();
self.connections.lock().push(vc);
TCP {
net: self.clone(),
conn_id: connection_id,
dir: 0,
recv_chan,
}
}
}
// private functions
impl NetworkTask {
/// Schedule a wakeup of the network task (self) `after_ms` from now, to deliver
/// messages of connection `id`.
fn schedule(&self, id: usize, after_ms: u64) {
self.events.lock().push(Event {
time: executor::now() + after_ms,
conn_id: id,
});
self.task_context.schedule_wakeup(after_ms);
}
/// Get locked connection `id`.
fn get(&self, id: usize) -> MappedMutexGuard<'_, RawMutex, VirtualConnection> {
MutexGuard::map(self.connections.lock(), |connections| {
connections.get_mut(id).unwrap()
})
}
fn collect_pending_events(&self, now: u64, vec: &mut Vec<Event>) {
vec.clear();
let mut events = self.events.lock();
while let Some(event) = events.peek() {
if event.time > now {
break;
}
let event = events.pop().unwrap();
vec.push(event);
}
}
fn start(self: &Arc<Self>) {
debug!("started network task");
let mut events = Vec::new();
loop {
let now = executor::now();
self.collect_pending_events(now, &mut events);
for event in events.drain(..) {
let conn = self.get(event.conn_id);
conn.process(self);
}
// block until wakeup
executor::yield_me(-1);
}
}
}
// 0 - from node(0) to node(1)
// 1 - from node(1) to node(0)
type MessageDirection = u8;
fn sender_str(dir: MessageDirection) -> &'static str {
match dir {
0 => "client",
1 => "server",
_ => unreachable!(),
}
}
fn receiver_str(dir: MessageDirection) -> &'static str {
match dir {
0 => "server",
1 => "client",
_ => unreachable!(),
}
}
/// Virtual connection between two nodes.
/// Node 0 is the creator of the connection (client),
/// and node 1 is the acceptor (server).
struct VirtualConnection {
connection_id: usize,
/// one-off chan, used to deliver Accept message to dst
dst_accept: Chan<NodeEvent>,
/// message sinks
dst_sockets: [Chan<NetEvent>; 2],
state: Mutex<ConnectionState>,
}
struct ConnectionState {
buffers: [NetworkBuffer; 2],
rng: StdRng,
}
impl VirtualConnection {
/// Schedule a wakeup so that a possible keepalive timeout will be noticed.
fn schedule_timeout(&self, net: &NetworkTask) {
if let Some(timeout) = net.options.keepalive_timeout {
net.schedule(self.connection_id, timeout);
}
}
/// Send the handshake (Accept) to the server.
fn send_connect(&self, net: &NetworkTask) {
let now = executor::now();
let mut state = self.state.lock();
let delay = net.options.connect_delay.delay(&mut state.rng);
let buffer = &mut state.buffers[0];
assert!(buffer.buf.is_empty());
assert!(!buffer.recv_closed);
assert!(!buffer.send_closed);
assert!(buffer.last_recv.is_none());
let delay = if let Some(ms) = delay {
ms
} else {
debug!("NET: TCP #{} dropped connect", self.connection_id);
buffer.send_closed = true;
return;
};
// Send a message into the future.
buffer
.buf
.push_back((now + delay, AnyMessage::InternalConnect));
net.schedule(self.connection_id, delay);
}
/// Transmit some of the messages from the buffer to the nodes.
fn process(&self, net: &Arc<NetworkTask>) {
let now = executor::now();
let mut state = self.state.lock();
for direction in 0..2 {
self.process_direction(
net,
state.deref_mut(),
now,
direction as MessageDirection,
&self.dst_sockets[direction ^ 1],
);
}
// Close one side of the connection by timeout if the node
// has not received any messages for a long time.
if let Some(timeout) = net.options.keepalive_timeout {
let mut to_close = [false, false];
for direction in 0..2 {
let buffer = &mut state.buffers[direction];
if buffer.recv_closed {
continue;
}
if let Some(last_recv) = buffer.last_recv {
if now - last_recv >= timeout {
debug!(
"NET: connection {} timed out at {}",
self.connection_id,
receiver_str(direction as MessageDirection)
);
let node_idx = direction ^ 1;
to_close[node_idx] = true;
}
}
}
drop(state);
for (node_idx, should_close) in to_close.iter().enumerate() {
if *should_close {
self.close(node_idx);
}
}
}
}
/// Process messages in the buffer in the given direction.
fn process_direction(
&self,
net: &Arc<NetworkTask>,
state: &mut ConnectionState,
now: u64,
direction: MessageDirection,
to_socket: &Chan<NetEvent>,
) {
let buffer = &mut state.buffers[direction as usize];
if buffer.recv_closed {
assert!(buffer.buf.is_empty());
}
while !buffer.buf.is_empty() && buffer.buf.front().unwrap().0 <= now {
let msg = buffer.buf.pop_front().unwrap().1;
buffer.last_recv = Some(now);
self.schedule_timeout(net);
if let AnyMessage::InternalConnect = msg {
// TODO: assert to_socket is the server
let server_to_client = TCP {
net: net.clone(),
conn_id: self.connection_id,
dir: direction ^ 1,
recv_chan: to_socket.clone(),
};
// special case, we need to deliver new connection to a separate channel
self.dst_accept.send(NodeEvent::Accept(server_to_client));
} else {
to_socket.send(NetEvent::Message(msg));
}
}
}
/// Try to send a message to the buffer, possibly dropping it, and
/// determine the delivery timestamp.
fn send(&self, net: &NetworkTask, direction: MessageDirection, msg: AnyMessage) {
let now = executor::now();
let mut state = self.state.lock();
let (delay, close) = if let Some(ms) = net.options.send_delay.delay(&mut state.rng) {
(ms, false)
} else {
(0, true)
};
let buffer = &mut state.buffers[direction as usize];
if buffer.send_closed {
debug!(
"NET: TCP #{} dropped message {:?} (broken pipe)",
self.connection_id, msg
);
return;
}
if close {
debug!(
"NET: TCP #{} dropped message {:?} (pipe just broke)",
self.connection_id, msg
);
buffer.send_closed = true;
return;
}
if buffer.recv_closed {
debug!(
"NET: TCP #{} dropped message {:?} (recv closed)",
self.connection_id, msg
);
return;
}
// Send a message into the future.
buffer.buf.push_back((now + delay, msg));
net.schedule(self.connection_id, delay);
}
/// Close the connection. Only one side of the connection will be closed,
/// and no further messages will be delivered. The other side will not be notified.
fn close(&self, node_idx: usize) {
let mut state = self.state.lock();
let recv_buffer = &mut state.buffers[1 ^ node_idx];
if recv_buffer.recv_closed {
debug!(
"NET: TCP #{} closed twice at {}",
self.connection_id,
sender_str(node_idx as MessageDirection),
);
return;
}
debug!(
"NET: TCP #{} closed at {}",
self.connection_id,
sender_str(node_idx as MessageDirection),
);
recv_buffer.recv_closed = true;
for msg in recv_buffer.buf.drain(..) {
debug!(
"NET: TCP #{} dropped message {:?} (closed)",
self.connection_id, msg
);
}
let send_buffer = &mut state.buffers[node_idx];
send_buffer.send_closed = true;
drop(state);
// TODO: notify the other side?
self.dst_sockets[node_idx].send(NetEvent::Closed);
}
}
struct NetworkBuffer {
/// Messages paired with time of delivery
buf: VecDeque<(u64, AnyMessage)>,
/// True if the connection is closed on the receiving side,
/// i.e. no more messages from the buffer will be delivered.
recv_closed: bool,
/// True if the connection is closed on the sending side,
/// i.e. no more messages will be added to the buffer.
send_closed: bool,
/// Last time a message was delivered from the buffer.
/// If None, it means that the server is the receiver and
/// it is not yet aware of this connection (i.e. has not
/// received the Accept).
last_recv: Option<u64>,
}
impl NetworkBuffer {
fn new(last_recv: Option<u64>) -> Self {
Self {
buf: VecDeque::new(),
recv_closed: false,
send_closed: false,
last_recv,
}
}
}
/// Single end of a bidirectional network stream without reordering (TCP-like).
/// Reads are implemented using channels, writes go to the buffer inside VirtualConnection.
pub struct TCP {
net: Arc<NetworkTask>,
conn_id: usize,
dir: MessageDirection,
recv_chan: Chan<NetEvent>,
}
impl Debug for TCP {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "TCP #{} ({})", self.conn_id, sender_str(self.dir),)
}
}
impl TCP {
/// Send a message to the other side. It's guaranteed that it will not arrive
/// before the arrival of all messages sent earlier.
pub fn send(&self, msg: AnyMessage) {
let conn = self.net.get(self.conn_id);
conn.send(&self.net, self.dir, msg);
}
/// Get a channel to receive incoming messages.
pub fn recv_chan(&self) -> Chan<NetEvent> {
self.recv_chan.clone()
}
pub fn connection_id(&self) -> usize {
self.conn_id
}
pub fn close(&self) {
let conn = self.net.get(self.conn_id);
conn.close(self.dir as usize);
}
}
struct Event {
time: u64,
conn_id: usize,
}
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
// to get that.
impl PartialOrd for Event {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Event {
fn cmp(&self, other: &Self) -> Ordering {
(other.time, other.conn_id).cmp(&(self.time, self.conn_id))
}
}
impl PartialEq for Event {
fn eq(&self, other: &Self) -> bool {
(other.time, other.conn_id) == (self.time, self.conn_id)
}
}
impl Eq for Event {}

View File

@@ -1,54 +0,0 @@
use std::sync::Arc;
use rand::Rng;
use crate::proto::NodeEvent;
use super::{
chan::Chan,
network::TCP,
world::{Node, NodeId, World},
};
/// Abstraction with all functions (aka syscalls) available to the node.
#[derive(Clone)]
pub struct NodeOs {
world: Arc<World>,
internal: Arc<Node>,
}
impl NodeOs {
pub fn new(world: Arc<World>, internal: Arc<Node>) -> NodeOs {
NodeOs { world, internal }
}
/// Get the node id.
pub fn id(&self) -> NodeId {
self.internal.id
}
/// Opens a bidirectional connection to another node. Always succeeds.
pub fn open_tcp(&self, dst: NodeId) -> TCP {
self.world.open_tcp(dst)
}
/// Returns a channel to receive node events (socket Accept and internal messages).
pub fn node_events(&self) -> Chan<NodeEvent> {
self.internal.node_events()
}
/// Get current time.
pub fn now(&self) -> u64 {
self.world.now()
}
/// Generate a random number in range [0, max).
pub fn random(&self, max: u64) -> u64 {
self.internal.rng.lock().gen_range(0..max)
}
/// Append a new event to the world event log.
pub fn log_event(&self, data: String) {
self.internal.log_event(data)
}
}

View File

@@ -1,50 +0,0 @@
use rand::{rngs::StdRng, Rng};
/// Describes random delays and failures. The delay is uniformly distributed in [min, max].
/// A connection failure occurs with probability `fail_prob`.
#[derive(Clone, Debug)]
pub struct Delay {
pub min: u64,
pub max: u64,
pub fail_prob: f64, // [0; 1]
}
impl Delay {
/// Create a struct with no delay, no failures.
pub fn empty() -> Delay {
Delay {
min: 0,
max: 0,
fail_prob: 0.0,
}
}
/// Create a struct with a fixed delay.
pub fn fixed(ms: u64) -> Delay {
Delay {
min: ms,
max: ms,
fail_prob: 0.0,
}
}
/// Generate a random delay in range [min, max]. Return None if the
/// message should be dropped.
pub fn delay(&self, rng: &mut StdRng) -> Option<u64> {
if rng.gen_bool(self.fail_prob) {
return None;
}
Some(rng.gen_range(self.min..=self.max))
}
}
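// A small usage sketch (illustrative helper): sampling the delay for one message.
// With the settings below, roughly 40% of samples come back as `None`
// ("drop the message") and the rest are uniform in [1, 60] ms.
#[allow(dead_code)]
fn sample_delay_example() {
    use rand::SeedableRng;
    let mut rng = StdRng::seed_from_u64(42);
    let delay = Delay {
        min: 1,
        max: 60,
        fail_prob: 0.4,
    };
    match delay.delay(&mut rng) {
        Some(ms) => println!("deliver after {} ms", ms),
        None => println!("drop the message"),
    }
}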
/// Describes network settings. All network packets will be subjected to the same delays and failures.
#[derive(Clone, Debug)]
pub struct NetworkOptions {
/// Connection will be automatically closed after this timeout if no data is received.
pub keepalive_timeout: Option<u64>,
/// New connections will be delayed by this amount of time.
pub connect_delay: Delay,
/// Each message will be delayed by this amount of time.
pub send_delay: Delay,
}

View File

@@ -1,63 +0,0 @@
use std::fmt::Debug;
use bytes::Bytes;
use utils::lsn::Lsn;
use crate::{network::TCP, world::NodeId};
/// Internal node events.
#[derive(Debug)]
pub enum NodeEvent {
Accept(TCP),
Internal(AnyMessage),
}
/// Events that are coming from a network socket.
#[derive(Clone, Debug)]
pub enum NetEvent {
Message(AnyMessage),
Closed,
}
/// Custom events generated throughout the simulation. Can be used by the test to verify the correctness.
#[derive(Debug)]
pub struct SimEvent {
pub time: u64,
pub node: NodeId,
pub data: String,
}
/// Umbrella type for all possible flavours of messages. These events can be sent over network
/// or to an internal node events channel.
#[derive(Clone)]
pub enum AnyMessage {
/// Not used, empty placeholder.
None,
/// Used internally for notifying node about new incoming connection.
InternalConnect,
Just32(u32),
ReplCell(ReplCell),
Bytes(Bytes),
LSN(u64),
}
impl Debug for AnyMessage {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
AnyMessage::None => write!(f, "None"),
AnyMessage::InternalConnect => write!(f, "InternalConnect"),
AnyMessage::Just32(v) => write!(f, "Just32({})", v),
AnyMessage::ReplCell(v) => write!(f, "ReplCell({:?})", v),
AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)),
AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)),
}
}
}
/// Used in reliable_copy_test.rs
#[derive(Clone, Debug)]
pub struct ReplCell {
pub value: u32,
pub client_id: u32,
pub seqno: u32,
}

View File

@@ -1,129 +0,0 @@
use std::{
cmp::Ordering,
collections::BinaryHeap,
ops::DerefMut,
sync::{
atomic::{AtomicU32, AtomicU64},
Arc,
},
};
use parking_lot::Mutex;
use tracing::trace;
use crate::executor::ThreadContext;
/// Holds current time and all pending wakeup events.
pub struct Timing {
/// Current world's time.
current_time: AtomicU64,
/// Pending timers.
queue: Mutex<BinaryHeap<Pending>>,
/// Global nonce. Makes picking events from the binary-heap queue deterministic
/// by attaching a unique number to events with the same timestamp.
nonce: AtomicU32,
/// Used to schedule fake events.
fake_context: Arc<ThreadContext>,
}
impl Default for Timing {
fn default() -> Self {
Self::new()
}
}
impl Timing {
/// Create a new empty clock with time set to 0.
pub fn new() -> Timing {
Timing {
current_time: AtomicU64::new(0),
queue: Mutex::new(BinaryHeap::new()),
nonce: AtomicU32::new(0),
fake_context: Arc::new(ThreadContext::new()),
}
}
/// Return the current world's time.
pub fn now(&self) -> u64 {
self.current_time.load(std::sync::atomic::Ordering::SeqCst)
}
/// Tick-tock the global clock. Return the context of a thread whose wakeup is due,
/// moving the clock forward to the next pending event first if necessary.
pub(crate) fn step(&self) -> Option<Arc<ThreadContext>> {
let mut queue = self.queue.lock();
if queue.is_empty() {
// no future events
return None;
}
if !self.is_event_ready(queue.deref_mut()) {
let next_time = queue.peek().unwrap().time;
self.current_time
.store(next_time, std::sync::atomic::Ordering::SeqCst);
trace!("rewind time to {}", next_time);
assert!(self.is_event_ready(queue.deref_mut()));
}
Some(queue.pop().unwrap().wake_context)
}
/// Append an event to the queue, to wake up the thread in `ms` milliseconds.
pub(crate) fn schedule_wakeup(&self, ms: u64, wake_context: Arc<ThreadContext>) {
self.nonce.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
let nonce = self.nonce.load(std::sync::atomic::Ordering::SeqCst);
self.queue.lock().push(Pending {
time: self.now() + ms,
nonce,
wake_context,
})
}
/// Append a fake event to the queue, to prevent the clock from skipping over this point in time.
pub fn schedule_fake(&self, ms: u64) {
self.queue.lock().push(Pending {
time: self.now() + ms,
nonce: 0,
wake_context: self.fake_context.clone(),
});
}
/// Return true if there is a ready event.
fn is_event_ready(&self, queue: &mut BinaryHeap<Pending>) -> bool {
queue.peek().map_or(false, |x| x.time <= self.now())
}
/// Clear all pending events.
pub(crate) fn clear(&self) {
self.queue.lock().clear();
}
}
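// Usage sketch (illustrative helper): outside the executor the clock is
// effectively read-only, but a fake event can be scheduled so that `step()`
// will not jump the clock past that point when it rewinds to the next wakeup.
#[allow(dead_code)]
fn clock_example(timing: &Timing) {
    let start = timing.now();
    timing.schedule_fake(10);
    // The clock only moves when the executor steps, and it never goes backwards.
    assert!(timing.now() >= start);
}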
struct Pending {
time: u64,
nonce: u32,
wake_context: Arc<ThreadContext>,
}
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
// to get that.
impl PartialOrd for Pending {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Pending {
fn cmp(&self, other: &Self) -> Ordering {
(other.time, other.nonce).cmp(&(self.time, self.nonce))
}
}
impl PartialEq for Pending {
fn eq(&self, other: &Self) -> bool {
(other.time, other.nonce) == (self.time, self.nonce)
}
}
impl Eq for Pending {}

View File

@@ -1,180 +0,0 @@
use parking_lot::Mutex;
use rand::{rngs::StdRng, SeedableRng};
use std::{
ops::DerefMut,
sync::{mpsc, Arc},
};
use crate::{
executor::{ExternalHandle, Runtime},
network::NetworkTask,
options::NetworkOptions,
proto::{NodeEvent, SimEvent},
time::Timing,
};
use super::{chan::Chan, network::TCP, node_os::NodeOs};
pub type NodeId = u32;
/// World contains simulation state.
pub struct World {
nodes: Mutex<Vec<Arc<Node>>>,
/// Random number generator.
rng: Mutex<StdRng>,
/// Internal event log.
events: Mutex<Vec<SimEvent>>,
/// Separate task that processes all network messages.
network_task: Arc<NetworkTask>,
/// Runtime for running threads and moving time.
runtime: Mutex<Runtime>,
/// To get current time.
timing: Arc<Timing>,
}
impl World {
pub fn new(seed: u64, options: Arc<NetworkOptions>) -> World {
let timing = Arc::new(Timing::new());
let mut runtime = Runtime::new(timing.clone());
let (tx, rx) = mpsc::channel();
runtime.spawn(move || {
// create and start network background thread, and send it back via the channel
NetworkTask::start_new(options, tx)
});
// wait for the network task to start
while runtime.step() {}
let network_task = rx.recv().unwrap();
World {
nodes: Mutex::new(Vec::new()),
rng: Mutex::new(StdRng::seed_from_u64(seed)),
events: Mutex::new(Vec::new()),
network_task,
runtime: Mutex::new(runtime),
timing,
}
}
pub fn step(&self) -> bool {
self.runtime.lock().step()
}
pub fn get_thread_step_count(&self) -> u64 {
self.runtime.lock().step_counter
}
/// Create a new random number generator.
pub fn new_rng(&self) -> StdRng {
let mut rng = self.rng.lock();
StdRng::from_rng(rng.deref_mut()).unwrap()
}
/// Create a new node.
pub fn new_node(self: &Arc<Self>) -> Arc<Node> {
let mut nodes = self.nodes.lock();
let id = nodes.len() as NodeId;
let node = Arc::new(Node::new(id, self.clone(), self.new_rng()));
nodes.push(node.clone());
node
}
/// Get an internal node state by id.
fn get_node(&self, id: NodeId) -> Option<Arc<Node>> {
let nodes = self.nodes.lock();
let num = id as usize;
if num < nodes.len() {
Some(nodes[num].clone())
} else {
None
}
}
pub fn stop_all(&self) {
self.runtime.lock().crash_all_threads();
}
/// Returns a writable end of a TCP connection, to send src->dst messages.
pub fn open_tcp(self: &Arc<World>, dst: NodeId) -> TCP {
// TODO: replace unwrap() with /dev/null socket.
let dst = self.get_node(dst).unwrap();
let dst_accept = dst.node_events.lock().clone();
let rng = self.new_rng();
self.network_task.start_new_connection(rng, dst_accept)
}
/// Get current time.
pub fn now(&self) -> u64 {
self.timing.now()
}
/// Get a copy of the internal clock.
pub fn clock(&self) -> Arc<Timing> {
self.timing.clone()
}
pub fn add_event(&self, node: NodeId, data: String) {
let time = self.now();
self.events.lock().push(SimEvent { time, node, data });
}
pub fn take_events(&self) -> Vec<SimEvent> {
let mut events = self.events.lock();
let mut res = Vec::new();
std::mem::swap(&mut res, &mut events);
res
}
pub fn deallocate(&self) {
self.stop_all();
self.timing.clear();
self.nodes.lock().clear();
}
}
/// Internal node state.
pub struct Node {
pub id: NodeId,
node_events: Mutex<Chan<NodeEvent>>,
world: Arc<World>,
pub(crate) rng: Mutex<StdRng>,
}
impl Node {
pub fn new(id: NodeId, world: Arc<World>, rng: StdRng) -> Node {
Node {
id,
node_events: Mutex::new(Chan::new()),
world,
rng: Mutex::new(rng),
}
}
/// Spawn a new thread with this node context.
pub fn launch(self: &Arc<Self>, f: impl FnOnce(NodeOs) + Send + 'static) -> ExternalHandle {
let node = self.clone();
let world = self.world.clone();
self.world.runtime.lock().spawn(move || {
f(NodeOs::new(world, node.clone()));
})
}
/// Returns a channel to receive Accepts and internal messages.
pub fn node_events(&self) -> Chan<NodeEvent> {
self.node_events.lock().clone()
}
/// This will drop all in-flight Accept messages.
pub fn replug_node_events(&self, chan: Chan<NodeEvent>) {
*self.node_events.lock() = chan;
}
/// Append event to the world's log.
pub fn log_event(&self, data: String) {
self.world.add_event(self.id, data)
}
}
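// Sketch of the typical driving loop (the tests in this repository follow the
// same shape): create a node, launch its thread, then step the world until a
// virtual-time limit is reached. The helper name is made up for illustration.
#[allow(dead_code)]
fn drive_world(world: Arc<World>, time_limit: u64) {
    let node = world.new_node();
    node.launch(|os| {
        os.log_event("node started".to_owned());
    });
    while world.step() && world.now() < time_limit {}
}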

View File

@@ -1,244 +0,0 @@
//! Simple test to verify that simulator is working.
#[cfg(test)]
mod reliable_copy_test {
use anyhow::Result;
use desim::executor::{self, PollSome};
use desim::options::{Delay, NetworkOptions};
use desim::proto::{NetEvent, NodeEvent, ReplCell};
use desim::world::{NodeId, World};
use desim::{node_os::NodeOs, proto::AnyMessage};
use parking_lot::Mutex;
use std::sync::Arc;
use tracing::info;
/// Disk storage trait and implementation.
pub trait Storage<T> {
fn flush_pos(&self) -> u32;
fn flush(&mut self) -> Result<()>;
fn write(&mut self, t: T);
}
#[derive(Clone)]
pub struct SharedStorage<T> {
pub state: Arc<Mutex<InMemoryStorage<T>>>,
}
impl<T> SharedStorage<T> {
pub fn new() -> Self {
Self {
state: Arc::new(Mutex::new(InMemoryStorage::new())),
}
}
}
impl<T> Storage<T> for SharedStorage<T> {
fn flush_pos(&self) -> u32 {
self.state.lock().flush_pos
}
fn flush(&mut self) -> Result<()> {
executor::yield_me(0);
self.state.lock().flush()
}
fn write(&mut self, t: T) {
executor::yield_me(0);
self.state.lock().write(t);
}
}
pub struct InMemoryStorage<T> {
pub data: Vec<T>,
pub flush_pos: u32,
}
impl<T> InMemoryStorage<T> {
pub fn new() -> Self {
Self {
data: Vec::new(),
flush_pos: 0,
}
}
pub fn flush(&mut self) -> Result<()> {
self.flush_pos = self.data.len() as u32;
Ok(())
}
pub fn write(&mut self, t: T) {
self.data.push(t);
}
}
/// Server implementation.
pub fn run_server(os: NodeOs, mut storage: Box<dyn Storage<u32>>) {
info!("started server");
let node_events = os.node_events();
let mut epoll_vec: Vec<Box<dyn PollSome>> = vec![Box::new(node_events.clone())];
let mut sockets = vec![];
loop {
let index = executor::epoll_chans(&epoll_vec, -1).unwrap();
if index == 0 {
let node_event = node_events.must_recv();
info!("got node event: {:?}", node_event);
if let NodeEvent::Accept(tcp) = node_event {
tcp.send(AnyMessage::Just32(storage.flush_pos()));
epoll_vec.push(Box::new(tcp.recv_chan()));
sockets.push(tcp);
}
continue;
}
let recv_chan = sockets[index - 1].recv_chan();
let socket = &sockets[index - 1];
let event = recv_chan.must_recv();
info!("got event: {:?}", event);
if let NetEvent::Message(AnyMessage::ReplCell(cell)) = event {
if cell.seqno != storage.flush_pos() {
info!("got out of order data: {:?}", cell);
continue;
}
storage.write(cell.value);
storage.flush().unwrap();
socket.send(AnyMessage::Just32(storage.flush_pos()));
}
}
}
/// Client copies all data from the array to the remote node.
pub fn run_client(os: NodeOs, data: &[ReplCell], dst: NodeId) {
info!("started client");
let mut delivered = 0;
let mut sock = os.open_tcp(dst);
let mut recv_chan = sock.recv_chan();
while delivered < data.len() {
let num = &data[delivered];
info!("sending data: {:?}", num.clone());
sock.send(AnyMessage::ReplCell(num.clone()));
// loop {
let event = recv_chan.recv();
match event {
NetEvent::Message(AnyMessage::Just32(flush_pos)) => {
if flush_pos == 1 + delivered as u32 {
delivered += 1;
}
}
NetEvent::Closed => {
info!("connection closed, reestablishing");
sock = os.open_tcp(dst);
recv_chan = sock.recv_chan();
}
_ => {}
}
// }
}
let sock = os.open_tcp(dst);
for num in data {
info!("sending data: {:?}", num.clone());
sock.send(AnyMessage::ReplCell(num.clone()));
}
info!("sent all data and finished client");
}
/// Run test simulations.
#[test]
fn sim_example_reliable_copy() {
utils::logging::init(
utils::logging::LogFormat::Test,
utils::logging::TracingErrorLayerEnablement::Disabled,
utils::logging::Output::Stdout,
)
.expect("logging init failed");
let delay = Delay {
min: 1,
max: 60,
fail_prob: 0.4,
};
let network = NetworkOptions {
keepalive_timeout: Some(50),
connect_delay: delay.clone(),
send_delay: delay.clone(),
};
for seed in 0..20 {
let u32_data: [u32; 5] = [1, 2, 3, 4, 5];
let data = u32_to_cells(&u32_data, 1);
let world = Arc::new(World::new(seed, Arc::new(network.clone())));
start_simulation(Options {
world,
time_limit: 1_000_000,
client_fn: Box::new(move |os, server_id| run_client(os, &data, server_id)),
u32_data,
});
}
}
pub struct Options {
pub world: Arc<World>,
pub time_limit: u64,
pub u32_data: [u32; 5],
pub client_fn: Box<dyn FnOnce(NodeOs, u32) + Send + 'static>,
}
pub fn start_simulation(options: Options) {
let world = options.world;
let client_node = world.new_node();
let server_node = world.new_node();
let server_id = server_node.id;
// start the client thread
client_node.launch(move |os| {
let client_fn = options.client_fn;
client_fn(os, server_id);
});
// start the server thread
let shared_storage = SharedStorage::new();
let server_storage = shared_storage.clone();
server_node.launch(move |os| run_server(os, Box::new(server_storage)));
while world.step() && world.now() < options.time_limit {}
let disk_data = shared_storage.state.lock().data.clone();
assert!(verify_data(&disk_data, &options.u32_data[..]));
}
pub fn u32_to_cells(data: &[u32], client_id: u32) -> Vec<ReplCell> {
let mut res = Vec::new();
for (i, _) in data.iter().enumerate() {
res.push(ReplCell {
client_id,
seqno: i as u32,
value: data[i],
});
}
res
}
fn verify_data(disk_data: &[u32], data: &[u32]) -> bool {
if disk_data.len() != data.len() {
return false;
}
for i in 0..data.len() {
if disk_data[i] != data[i] {
return false;
}
}
true
}
}

View File

@@ -9,13 +9,5 @@ prometheus.workspace = true
libc.workspace = true
once_cell.workspace = true
chrono.workspace = true
twox-hash.workspace = true
workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
[dev-dependencies]
rand = "0.8"
rand_distr = "0.4.3"

View File

@@ -1,523 +0,0 @@
//! HyperLogLog is an algorithm for the count-distinct problem,
//! approximating the number of distinct elements in a multiset.
//! Calculating the exact cardinality of the distinct elements
//! of a multiset requires an amount of memory proportional to
//! the cardinality, which is impractical for very large data sets.
//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm,
//! use significantly less memory than this, but can only approximate the cardinality.
use std::{
collections::HashMap,
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
sync::{atomic::AtomicU8, Arc, RwLock},
};
use prometheus::{
core::{self, Describer},
proto, Opts,
};
use twox_hash::xxh3;
/// Create an [`HyperLogLogVec`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_hll_vec {
($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{
let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap();
$crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec)
}};
($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
$crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
}};
}
/// Create an [`HyperLogLog`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_hll {
($N:literal, $OPTS:expr $(,)?) => {{
let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap();
$crate::register(Box::new(hll.clone())).map(|_| hll)
}};
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
$crate::register_hll!($N, $crate::opts!($NAME, $HELP))
}};
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLogVec<const N: usize> {
core: Arc<HyperLogLogVecCore<N>>,
}
struct HyperLogLogVecCore<const N: usize> {
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
pub desc: core::Desc,
pub opts: Opts,
}
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
for child in self.core.children.read().unwrap().values() {
child.core.collect_into(&mut metrics);
}
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogVec<N> {
/// Create a new [`HyperLogLogVec`] based on the provided
/// [`Opts`] and partitioned by the given label names. At least one label name must be
/// provided.
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
let opts = opts.variable_labels(variable_names);
let desc = opts.describe()?;
let v = HyperLogLogVecCore {
children: RwLock::new(HashMap::default()),
desc,
opts,
};
Ok(Self { core: Arc::new(v) })
}
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
self.core.get_metric_with_label_values(vals)
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
self.get_metric_with_label_values(vals).unwrap()
}
}
impl<const N: usize> HyperLogLogVecCore<N> {
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let h = self.hash_label_values(vals)?;
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
return Ok(metric);
}
self.get_or_create_metric(h, vals)
}
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
if vals.len() != self.desc.variable_labels.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: self.desc.variable_labels.len(),
got: vals.len(),
});
}
let mut h = xxh3::Hash64::default();
for val in vals {
h.write(val.as_bytes());
}
Ok(h.finish())
}
fn get_or_create_metric(
&self,
hash: u64,
label_values: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let mut children = self.children.write().unwrap();
// Check exist first.
if let Some(metric) = children.get(&hash).cloned() {
return Ok(metric);
}
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
children.insert(hash, metric.clone());
Ok(metric)
}
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLog<const N: usize> {
core: Arc<HyperLogLogCore<N>>,
}
impl<const N: usize> HyperLogLog<N> {
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let opts = Opts::new(name, help);
Self::with_opts(opts)
}
/// Create a [`HyperLogLog`] with the `opts` options.
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
Self::with_opts_and_label_values(&opts, &[])
}
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
let desc = opts.describe()?;
let labels = make_label_pairs(&desc, label_values)?;
let v = HyperLogLogCore {
shards: [0; N].map(AtomicU8::new),
desc,
labels,
};
Ok(Self { core: Arc::new(v) })
}
pub fn measure(&self, item: &impl Hash) {
// changing the hasher will break compatibility with previous measurements.
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
}
fn record(&self, hash: u64) {
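// Standard HLL update: the low p = log2(N) bits of the hash select the shard
// (register) j, and rho is the 1-based position of the first set bit among the
// remaining high bits. `hash >> p` introduces p artificial leading zeros, hence
// the `- p` correction after `leading_zeros()`. Each shard keeps the maximum
// rho observed so far (see `fetch_max` below).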
let p = N.ilog2() as u8;
let j = hash & (N as u64 - 1);
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
}
}
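// A small usage sketch: the estimator only needs `measure` at write time; the
// cardinality itself is reconstructed on the PromQL side as described in the
// type-level docs above. The metric name below is made up for the example.
#[allow(dead_code)]
fn hll_usage_example() -> prometheus::Result<()> {
    let endpoints = HyperLogLog::<32>::new("my_endpoints_total_hll", "distinct endpoints seen")?;
    for name in ["ep-a", "ep-b", "ep-a"] {
        // Duplicates hash to the same shard and rho, so they do not grow the estimate.
        endpoints.measure(&name);
    }
    Ok(())
}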
struct HyperLogLogCore<const N: usize> {
shards: [AtomicU8; N],
desc: core::Desc,
labels: Vec<proto::LabelPair>,
}
impl<const N: usize> core::Collector for HyperLogLog<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
self.core.collect_into(&mut metrics);
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogCore<N> {
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
self.shards.iter().enumerate().for_each(|(i, x)| {
let mut shard_label = proto::LabelPair::default();
shard_label.set_name("hll_shard".to_owned());
shard_label.set_value(format!("{i}"));
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
// This seems like it would be a race condition,
// but HLL is not impacted by a write in one shard happening in between.
// This is because in PromQL we will be implementing a harmonic mean of all buckets.
// we will also merge samples in a time series using `max by (hll_shard)`.
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
let mut m = proto::Metric::default();
let mut c = proto::Gauge::default();
c.set_value(v as f64);
m.set_gauge(c);
let mut labels = Vec::with_capacity(self.labels.len() + 1);
labels.extend_from_slice(&self.labels);
labels.push(shard_label);
m.set_label(labels);
metrics.push(m);
})
}
}
fn make_label_pairs(
desc: &core::Desc,
label_values: &[&str],
) -> prometheus::Result<Vec<proto::LabelPair>> {
if desc.variable_labels.len() != label_values.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: desc.variable_labels.len(),
got: label_values.len(),
});
}
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
if total_len == 0 {
return Ok(vec![]);
}
if desc.variable_labels.is_empty() {
return Ok(desc.const_label_pairs.clone());
}
let mut label_pairs = Vec::with_capacity(total_len);
for (i, n) in desc.variable_labels.iter().enumerate() {
let mut label_pair = proto::LabelPair::default();
label_pair.set_name(n.clone());
label_pair.set_value(label_values[i].to_owned());
label_pairs.push(label_pair);
}
for label_pair in &desc.const_label_pairs {
label_pairs.push(label_pair.clone());
}
label_pairs.sort();
Ok(label_pairs)
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use prometheus::{proto, Opts};
use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::{Distribution, Zipf};
use crate::HyperLogLogVec;
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
let mut metrics = vec![];
hll.core
.children
.read()
.unwrap()
.values()
.for_each(|c| c.core.collect_into(&mut metrics));
metrics
}
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
let mut buckets = [0.0; 32];
for metric in metrics.chunks_exact(32) {
if filter(&metric[0]) {
for (i, m) in metric.iter().enumerate() {
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
}
}
}
buckets
.into_iter()
.map(|f| 2.0f64.powf(-f))
.sum::<f64>()
.recip()
* 0.697
* 32.0
* 32.0
}
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
let mut set_a = HashSet::new();
let mut set_b = HashSet::new();
for x in iter.by_ref().take(n) {
set_a.insert(x.to_bits());
hll.with_label_values(&["a"]).measure(&x.to_bits());
}
for x in iter.by_ref().take(n) {
set_b.insert(x.to_bits());
hll.with_label_values(&["b"]).measure(&x.to_bits());
}
let merge = &set_a | &set_b;
let metrics = collect(&hll);
let len = get_cardinality(&metrics, |_| true);
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
}
#[test]
fn test_cardinality_small() {
let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap());
assert_eq!(actual, [46, 30, 32]);
assert!(51.3 < estimate[0] && estimate[0] < 51.4);
assert!(44.0 < estimate[1] && estimate[1] < 44.1);
assert!(39.0 < estimate[2] && estimate[2] < 39.1);
}
#[test]
fn test_cardinality_medium() {
let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap());
assert_eq!(actual, [2529, 1618, 1629]);
assert!(2309.1 < estimate[0] && estimate[0] < 2309.2);
assert!(1566.6 < estimate[1] && estimate[1] < 1566.7);
assert!(1629.5 < estimate[2] && estimate[2] < 1629.6);
}
#[test]
fn test_cardinality_large() {
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap());
assert_eq!(actual, [129077, 79579, 79630]);
assert!(126067.2 < estimate[0] && estimate[0] < 126067.3);
assert!(83076.8 < estimate[1] && estimate[1] < 83076.9);
assert!(64251.2 < estimate[2] && estimate[2] < 64251.3);
}
#[test]
fn test_cardinality_small2() {
let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap());
assert_eq!(actual, [92, 58, 60]);
assert!(116.1 < estimate[0] && estimate[0] < 116.2);
assert!(81.7 < estimate[1] && estimate[1] < 81.8);
assert!(69.3 < estimate[2] && estimate[2] < 69.4);
}
#[test]
fn test_cardinality_medium2() {
let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap());
assert_eq!(actual, [8201, 5131, 5051]);
assert!(6846.4 < estimate[0] && estimate[0] < 6846.5);
assert!(5239.1 < estimate[1] && estimate[1] < 5239.2);
assert!(4292.8 < estimate[2] && estimate[2] < 4292.9);
}
#[test]
fn test_cardinality_large2() {
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap());
assert_eq!(actual, [777847, 482069, 482246]);
assert!(699437.4 < estimate[0] && estimate[0] < 699437.5);
assert!(374948.9 < estimate[1] && estimate[1] < 374949.0);
assert!(434609.7 < estimate[2] && estimate[2] < 434609.8);
}
}

View File

@@ -28,11 +28,7 @@ use prometheus::{Registry, Result};
pub mod launch_timestamp;
mod wrappers;
pub use wrappers::{CountedReader, CountedWriter};
mod hll;
pub mod metric_vec_duration;
pub use hll::{HyperLogLog, HyperLogLogVec};
#[cfg(target_os = "linux")]
pub mod more_process_metrics;
pub type UIntGauge = GenericGauge<AtomicU64>;
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
@@ -115,6 +111,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
// performed by the process.
// We know the size of the block, so we can determine the I/O bytes out of it.
// The value might not be 100% exact, but should be fine for Prometheus metrics in this case.
#[allow(clippy::unnecessary_cast)]
fn update_rusage_metrics() {
let rusage_stats = get_rusage_stats();

View File

@@ -1,54 +0,0 @@
//! process metrics that the [`::prometheus`] crate doesn't provide.
// This module has heavy inspiration from the prometheus crate's `process_collector.rs`.
use crate::UIntGauge;
pub struct Collector {
descs: Vec<prometheus::core::Desc>,
vmlck: crate::UIntGauge,
}
const NMETRICS: usize = 1;
impl prometheus::core::Collector for Collector {
fn desc(&self) -> Vec<&prometheus::core::Desc> {
self.descs.iter().collect()
}
fn collect(&self) -> Vec<prometheus::proto::MetricFamily> {
let Ok(myself) = procfs::process::Process::myself() else {
return vec![];
};
let mut mfs = Vec::with_capacity(NMETRICS);
if let Ok(status) = myself.status() {
if let Some(vmlck) = status.vmlck {
self.vmlck.set(vmlck);
mfs.extend(self.vmlck.collect())
}
}
mfs
}
}
impl Collector {
pub fn new() -> Self {
let mut descs = Vec::new();
let vmlck =
UIntGauge::new("libmetrics_process_status_vmlck", "/proc/self/status vmlck").unwrap();
descs.extend(
prometheus::core::Collector::desc(&vmlck)
.into_iter()
.cloned(),
);
Self { descs, vmlck }
}
}
impl Default for Collector {
fn default() -> Self {
Self::new()
}
}
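// Usage sketch: the collector is not installed automatically; it has to be
// registered with the default registry, e.g. via the `register` re-export that
// the macros in this crate use. The helper name is illustrative.
#[allow(dead_code)]
fn register_example() -> prometheus::Result<()> {
    crate::register(Box::new(Collector::new()))
}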

View File

@@ -20,7 +20,6 @@ strum_macros.workspace = true
hex.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
chrono.workspace = true
workspace_hack.workspace = true

View File

@@ -1,11 +1,9 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
use std::{fmt, ops::Range};
use std::fmt;
use crate::reltag::{BlockNumber, RelTag, SlruKind};
use crate::reltag::{BlockNumber, RelTag};
/// Key used in the Repository kv-store.
///
@@ -145,390 +143,12 @@ impl Key {
}
}
// Layout of the Key address space
//
// The Key struct, used to address the underlying key-value store, consists of
// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
// all the data and metadata keys into those 18 bytes.
//
// Principles for the mapping:
//
// - Things that are often accessed or modified together should be close to
// each other in the key space. For example, if a relation is extended by one
// block, we create a new key-value pair for the block data, and update the
// relation size entry. Because of that, the RelSize key comes after all the
// RelBlocks of a relation: the RelSize and the last RelBlock are always next
// to each other.
//
// The key space is divided into four major sections, identified by the first
// byte, and they form a hierarchy:
//
// 00 Relation data and metadata
//
// DbDir () -> (dbnode, spcnode)
// Filenodemap
// RelDir -> relnode forknum
// RelBlocks
// RelSize
//
// 01 SLRUs
//
// SlruDir kind
// SlruSegBlocks segno
// SlruSegSize
//
// 02 pg_twophase
//
// 03 misc
// Controlfile
// checkpoint
// pg_version
//
// 04 aux files
//
// Below is a full list of the keyspace allocation:
//
// DbDir:
// 00 00000000 00000000 00000000 00 00000000
//
// Filenodemap:
// 00 SPCNODE DBNODE 00000000 00 00000000
//
// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
//
// RelBlock:
// 00 SPCNODE DBNODE RELNODE FORK BLKNUM
//
// RelSize:
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
//
// SlruDir:
// 01 kind 00000000 00000000 00 00000000
//
// SlruSegBlock:
// 01 kind 00000001 SEGNO 00 BLKNUM
//
// SlruSegSize:
// 01 kind 00000001 SEGNO 00 FFFFFFFF
//
// TwoPhaseDir:
// 02 00000000 00000000 00000000 00 00000000
//
// TwoPhaseFile:
// 02 00000000 00000000 00000000 00 XID
//
// ControlFile:
// 03 00000000 00000000 00000000 00 00000000
//
// Checkpoint:
// 03 00000000 00000000 00000000 00 00000001
//
// AuxFiles:
// 03 00000000 00000000 00000000 00 00000002
//
//-- Section 01: relation data and metadata
pub const DBDIR_KEY: Key = Key {
field1: 0x00,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
#[inline(always)]
pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0xffffffff,
field5: 0xff,
field6: 0xffffffff,
}
}
#[inline(always)]
pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}
}
#[inline(always)]
pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 1,
}
}
#[inline(always)]
pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: blknum,
}
}
#[inline(always)]
pub fn rel_size_to_key(rel: RelTag) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0xffffffff,
}
}
#[inline(always)]
pub fn rel_key_range(rel: RelTag) -> Range<Key> {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0,
}..Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum + 1,
field6: 0,
}
}
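// A worked example of the mapping above (the relation numbers are made up, and
// RelTag field names and `Copy` are assumed to be as defined in reltag.rs):
// block 7 of the main fork of relation (spcnode=1663, dbnode=16384, relnode=16385)
// lands at `00 SPCNODE DBNODE RELNODE 00 00000007`, while the RelSize key for the
// same fork uses the sentinel block number 0xFFFFFFFF and therefore sorts right
// after the last possible RelBlock key.
#[allow(dead_code)]
fn rel_key_layout_example() {
    let rel = RelTag {
        spcnode: 1663,
        dbnode: 16384,
        relnode: 16385,
        forknum: 0,
    };
    let block_key = rel_block_to_key(rel, 7);
    assert_eq!((block_key.field1, block_key.field4, block_key.field6), (0x00, 16385, 7));
    assert_eq!(rel_size_to_key(rel).field6, 0xffffffff);
}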
//-- Section 02: SLRUs
#[inline(always)]
pub fn slru_dir_to_key(kind: SlruKind) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
#[inline(always)]
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 1,
field4: segno,
field5: 0,
field6: blknum,
}
}
#[inline(always)]
pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 1,
field4: segno,
field5: 0,
field6: 0xffffffff,
}
}
#[inline(always)]
pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
let field2 = match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
};
Key {
field1: 0x01,
field2,
field3: 1,
field4: segno,
field5: 0,
field6: 0,
}..Key {
field1: 0x01,
field2,
field3: 1,
field4: segno,
field5: 1,
field6: 0,
}
}
//-- Section 03: pg_twophase
pub const TWOPHASEDIR_KEY: Key = Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
#[inline(always)]
pub fn twophase_file_key(xid: TransactionId) -> Key {
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: xid,
}
}
#[inline(always)]
pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
let (next_xid, overflowed) = xid.overflowing_add(1);
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: xid,
}..Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: u8::from(overflowed),
field6: next_xid,
}
}
//-- Section 03: Control file
pub const CONTROLFILE_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
pub const CHECKPOINT_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 1,
};
pub const AUX_FILES_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 2,
};
// Reverse mappings for a few Keys.
// These are needed by the WAL redo manager.
// AUX_FILES currently stores only data for logical replication (slots etc.), and
// we don't preserve these on a branch because safekeepers can't follow a timeline
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(key: Key) -> bool {
key != AUX_FILES_KEY
}
#[inline(always)]
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_vm_block_key(key: Key) -> bool {
key.field1 == 0x00
&& key.field4 != 0
&& key.field5 == VISIBILITYMAP_FORKNUM
&& key.field6 != 0xffffffff
}
#[inline(always)]
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
let kind = match key.field2 {
0x00 => SlruKind::Clog,
0x01 => SlruKind::MultiXactMembers,
0x02 => SlruKind::MultiXactOffsets,
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
};
let segno = key.field4;
let blknum = key.field6;
(kind, segno, blknum)
}
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
#[inline(always)]
pub fn is_slru_block_key(key: Key) -> bool {
key.field1 == 0x01 // SLRU-related
&& key.field3 == 0x00000001 // but not SlruDir
&& key.field6 != 0xffffffff // and not SlruSegSize
}
#[inline(always)]
pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
}
/// Guaranteed to return `Ok(_)` if [`is_rel_block_key`] returns `true` for `key`.
#[inline(always)]
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (

View File

@@ -63,84 +63,16 @@ impl KeySpace {
KeyPartitioning { parts }
}
/// Update the keyspace so that it doesn't contain any range
/// overlapping with `other`. This can involve splitting or
/// removing existing ranges.
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
let (self_start, self_end) = match (self.start(), self.end()) {
(Some(start), Some(end)) => (start, end),
_ => {
// self is empty
return;
}
};
// Key spaces are sorted by definition, so skip ahead to the first
// potentially intersecting range. Similarly, ignore ranges that start
// after the current keyspace ends.
let other_ranges = other
.ranges
.iter()
.skip_while(|range| self_start >= range.end)
.take_while(|range| self_end > range.start);
for range in other_ranges {
while let Some(overlap_at) = self.overlaps_at(range) {
let overlapped = self.ranges[overlap_at].clone();
if overlapped.start < range.start && overlapped.end <= range.end {
// Higher part of the range is completely overlapped.
self.ranges[overlap_at].end = range.start;
}
if overlapped.start >= range.start && overlapped.end > range.end {
// Lower part of the range is completely overlapped.
self.ranges[overlap_at].start = range.end;
}
if overlapped.start < range.start && overlapped.end > range.end {
// Middle part of the range is overlapped.
self.ranges[overlap_at].end = range.start;
self.ranges
.insert(overlap_at + 1, range.end..overlapped.end);
}
if overlapped.start >= range.start && overlapped.end <= range.end {
// Whole range is overlapped
self.ranges.remove(overlap_at);
}
}
}
}
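    // Worked example (added commentary, not in the original source): with
    // self.ranges = [1..4, 5..8, 10..12] and other = {2..3, 6..7, 11..13},
    // the middle of 1..4 is cut out (-> 1..2 and 3..4), the middle of 5..8 is
    // cut out (-> 5..6 and 7..8), and 10..12 loses its upper part (-> 10..11);
    // see the full-overlap removal test at the bottom of this file.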
pub fn start(&self) -> Option<Key> {
self.ranges.first().map(|range| range.start)
}
pub fn end(&self) -> Option<Key> {
self.ranges.last().map(|range| range.end)
}
#[allow(unused)]
pub fn total_size(&self) -> usize {
self.ranges
.iter()
.map(|range| key_range_size(range) as usize)
.sum()
}
fn overlaps_at(&self, range: &Range<Key>) -> Option<usize> {
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
Ok(0) => None,
Err(0) => None,
Ok(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
Err(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
_ => None,
}
}
///
/// Check if the key space contains any range overlapping with `range`
///
pub fn overlaps(&self, range: &Range<Key>) -> bool {
self.overlaps_at(range).is_some()
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
Ok(0) => false,
Err(0) => false,
Ok(index) => self.ranges[index - 1].end > range.start,
Err(index) => self.ranges[index - 1].end > range.start,
}
}
}
@@ -172,7 +104,6 @@ pub struct KeySpaceAccum {
accum: Option<Range<Key>>,
ranges: Vec<Range<Key>>,
size: u64,
}
impl KeySpaceAccum {
@@ -180,7 +111,6 @@ impl KeySpaceAccum {
Self {
accum: None,
ranges: Vec::new(),
size: 0,
}
}
@@ -191,8 +121,6 @@ impl KeySpaceAccum {
#[inline(always)]
pub fn add_range(&mut self, range: Range<Key>) {
self.size += key_range_size(&range) as u64;
match self.accum.as_mut() {
Some(accum) => {
if range.start == accum.end {
@@ -218,23 +146,6 @@ impl KeySpaceAccum {
ranges: self.ranges,
}
}
pub fn consume_keyspace(&mut self) -> KeySpace {
if let Some(accum) = self.accum.take() {
self.ranges.push(accum);
}
let mut prev_accum = KeySpaceAccum::new();
std::mem::swap(self, &mut prev_accum);
KeySpace {
ranges: prev_accum.ranges,
}
}
pub fn size(&self) -> u64 {
self.size
}
}
///
@@ -343,30 +254,6 @@ mod tests {
}
}
#[test]
fn keyspace_consume() {
let ranges = vec![kr(0..10), kr(20..35), kr(40..45)];
let mut accum = KeySpaceAccum::new();
for range in &ranges {
accum.add_range(range.clone());
}
let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
assert_eq!(accum.size(), expected_size);
assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
assert_eq!(accum.size(), 0);
assert_ks_eq(&accum.consume_keyspace(), vec![]);
assert_eq!(accum.size(), 0);
for range in &ranges {
accum.add_range(range.clone());
}
assert_ks_eq(&accum.to_keyspace(), ranges);
}
#[test]
fn keyspace_add_range() {
// two separate ranges
@@ -509,118 +396,4 @@ mod tests {
// xxxxxxxxxxx
assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently!
}
#[test]
fn test_remove_full_overlaps() {
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(4),
Key::from_i128(5)..Key::from_i128(8),
Key::from_i128(10)..Key::from_i128(12),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(2)..Key::from_i128(3),
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(13),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(2),
Key::from_i128(3)..Key::from_i128(4),
Key::from_i128(5)..Key::from_i128(6),
Key::from_i128(7)..Key::from_i128(8),
Key::from_i128(10)..Key::from_i128(11)
]
);
}
#[test]
fn test_remove_partial_overlaps() {
// Test partial overlaps
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(3)..Key::from_i128(6),
Key::from_i128(8)..Key::from_i128(11),
Key::from_i128(14)..Key::from_i128(17),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(3),
Key::from_i128(7)..Key::from_i128(8),
Key::from_i128(12)..Key::from_i128(14),
]
);
}
#[test]
fn test_remove_no_overlaps() {
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(12),
Key::from_i128(15)..Key::from_i128(17),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
]
);
}
#[test]
fn test_remove_one_range_overlaps_multiple() {
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(3),
Key::from_i128(3)..Key::from_i128(6),
Key::from_i128(6)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
Key::from_i128(17)..Key::from_i128(20),
Key::from_i128(20)..Key::from_i128(30),
Key::from_i128(30)..Key::from_i128(40),
],
};
let key_space2 = KeySpace {
ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(3),
Key::from_i128(3)..Key::from_i128(6),
Key::from_i128(6)..Key::from_i128(9),
Key::from_i128(19)..Key::from_i128(20),
Key::from_i128(20)..Key::from_i128(30),
Key::from_i128(30)..Key::from_i128(40),
]
);
}
}

View File

@@ -8,7 +8,6 @@ use std::{
};
use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use strum_macros;
@@ -192,16 +191,6 @@ pub struct TimelineCreateRequest {
pub pg_version: Option<u32>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitRequest {
pub new_shard_count: u8,
}
#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitResponse {
pub new_shards: Vec<TenantShardId>,
}
/// Parameters that apply to all shards in a tenant. Used during tenant creation.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
@@ -214,14 +203,14 @@ impl ShardParameters {
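// Added note (not in the original source): the stripe size is expressed in
// 8 KiB pages here, so 256 * 1024 / 8 = 32768 pages, i.e. a 256 MiB stripe,
// assuming the standard Postgres block size.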
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
pub fn is_unsharded(&self) -> bool {
self.count.is_unsharded()
self.count == ShardCount(0)
}
}
impl Default for ShardParameters {
fn default() -> Self {
Self {
count: ShardCount::new(0),
count: ShardCount(0),
stripe_size: Self::DEFAULT_STRIPE_SIZE,
}
}
@@ -282,8 +271,6 @@ pub struct TenantConfig {
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -310,35 +297,6 @@ pub struct EvictionPolicyLayerAccessThreshold {
pub threshold: Duration,
}
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ThrottleConfig {
pub task_kinds: Vec<String>, // TaskKind
pub initial: usize,
#[serde(with = "humantime_serde")]
pub refill_interval: Duration,
pub refill_amount: NonZeroUsize,
pub max: usize,
pub fair: bool,
}
impl ThrottleConfig {
pub fn disabled() -> Self {
Self {
task_kinds: vec![], // effectively disables the throttle
// other values don't matter with empty `task_kinds`.
initial: 0,
refill_interval: Duration::from_millis(1),
refill_amount: NonZeroUsize::new(1).unwrap(),
max: 1,
fair: true,
}
}
/// The requests per second allowed by the given config.
pub fn steady_rps(&self) -> f64 {
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3
}
}
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
@@ -406,19 +364,6 @@ pub struct TenantLocationConfigRequest {
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantShardLocation {
pub shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigResponse {
pub shards: Vec<TenantShardLocation>,
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantConfigRequest {
@@ -494,8 +439,6 @@ pub struct TenantDetails {
#[serde(flatten)]
pub tenant_info: TenantInfo,
pub walredo: Option<WalRedoManagerStatus>,
pub timelines: Vec<TimelineId>,
}
@@ -524,8 +467,6 @@ pub struct TimelineInfo {
pub current_logical_size: u64,
pub current_logical_size_is_accurate: bool,
pub directory_entries_counts: Vec<u64>,
/// Sum of the size of all layer files.
/// If a layer is present in both local FS and S3, it counts only once.
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
@@ -685,33 +626,6 @@ pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus {
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
pub pid: Option<u32>,
}
pub mod virtual_file {
#[derive(
Copy,
Clone,
PartialEq,
Eq,
Hash,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
Debug,
)]
#[strum(serialize_all = "kebab-case")]
pub enum IoEngineKind {
StdFs,
#[cfg(target_os = "linux")]
TokioEpollUring,
}
}
// Wrapped in libpq CopyData
#[derive(PartialEq, Eq, Debug)]
pub enum PagestreamFeMessage {
@@ -719,7 +633,6 @@ pub enum PagestreamFeMessage {
Nblocks(PagestreamNblocksRequest),
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
GetSlruSegment(PagestreamGetSlruSegmentRequest),
}
// Wrapped in libpq CopyData
@@ -730,7 +643,6 @@ pub enum PagestreamBeMessage {
GetPage(PagestreamGetPageResponse),
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
GetSlruSegment(PagestreamGetSlruSegmentResponse),
}
// Keep in sync with `pagestore_client.h`
@@ -741,7 +653,6 @@ enum PagestreamBeMessageTag {
GetPage = 102,
Error = 103,
DbSize = 104,
GetSlruSegment = 105,
}
impl TryFrom<u8> for PagestreamBeMessageTag {
type Error = u8;
@@ -752,7 +663,6 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
102 => Ok(PagestreamBeMessageTag::GetPage),
103 => Ok(PagestreamBeMessageTag::Error),
104 => Ok(PagestreamBeMessageTag::DbSize),
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
_ => Err(value),
}
}
@@ -787,14 +697,6 @@ pub struct PagestreamDbSizeRequest {
pub dbnode: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetSlruSegmentRequest {
pub latest: bool,
pub lsn: Lsn,
pub kind: u8,
pub segno: u32,
}
#[derive(Debug)]
pub struct PagestreamExistsResponse {
pub exists: bool,
@@ -810,11 +712,6 @@ pub struct PagestreamGetPageResponse {
pub page: Bytes,
}
#[derive(Debug)]
pub struct PagestreamGetSlruSegmentResponse {
pub segment: Bytes,
}
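// Added commentary (not in the original source): from the serialize/parse arms
// below, the GetSlruSegment request appears on the wire as
//   tag(0x04) | latest: u8 | lsn: u64 BE | kind: u8 | segno: u32 BE
// (15 bytes in total), and the matching response carries
//   n_blocks: u32 BE | payload of n_blocks * BLCKSZ bytes.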
#[derive(Debug)]
pub struct PagestreamErrorResponse {
pub message: String,
@@ -878,14 +775,6 @@ impl PagestreamFeMessage {
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.dbnode);
}
Self::GetSlruSegment(req) => {
bytes.put_u8(4);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
}
bytes.into()
@@ -936,14 +825,6 @@ impl PagestreamFeMessage {
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
dbnode: body.read_u32::<BigEndian>()?,
})),
4 => Ok(PagestreamFeMessage::GetSlruSegment(
PagestreamGetSlruSegmentRequest {
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
kind: body.read_u8()?,
segno: body.read_u32::<BigEndian>()?,
},
)),
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
}
}
@@ -979,12 +860,6 @@ impl PagestreamBeMessage {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_i64(resp.db_size);
}
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
}
bytes.into()
@@ -1025,14 +900,6 @@ impl PagestreamBeMessage {
let db_size = buf.read_i64::<BigEndian>()?;
Self::DbSize(PagestreamDbSizeResponse { db_size })
}
Tag::GetSlruSegment => {
let n_blocks = buf.read_u32::<BigEndian>()?;
let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
buf.read_exact(&mut segment)?;
Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
segment: segment.into(),
})
}
};
let remaining = buf.into_inner();
if !remaining.is_empty() {
@@ -1051,7 +918,6 @@ impl PagestreamBeMessage {
Self::GetPage(_) => "GetPage",
Self::Error(_) => "Error",
Self::DbSize(_) => "DbSize",
Self::GetSlruSegment(_) => "GetSlruSegment",
}
}
}

View File

@@ -111,24 +111,9 @@ impl RelTag {
/// These files are divided into segments, which are divided into
/// pages of the same BLCKSZ as used for relation files.
///
#[derive(
Debug,
Clone,
Copy,
Hash,
Serialize,
Deserialize,
PartialEq,
Eq,
PartialOrd,
Ord,
strum_macros::EnumIter,
strum_macros::FromRepr,
enum_map::Enum,
)]
#[repr(u8)]
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum SlruKind {
Clog = 0,
Clog,
MultiXactMembers,
MultiXactOffsets,
}

View File

@@ -13,41 +13,10 @@ use utils::id::TenantId;
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(u8);
pub struct ShardCount(pub u8);
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known
/// as `TenantShardId::unsharded`.
///
/// This method returns the actual number of shards, i.e. if our internal value is
/// zero, we return 1 (unsharded tenants have 1 shard).
pub fn count(&self) -> u8 {
if self.0 > 0 {
self.0
} else {
1
}
}
/// The literal internal value: this is **not** the number of shards in the
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
/// [`Self::count`] if you want to know the cardinality of shards.
pub fn literal(&self) -> u8 {
self.0
}
pub fn is_unsharded(&self) -> bool {
self.0 == 0
}
/// `val` may be zero, or the number of shards in the tenant; it is what
/// [`Self::literal`] would return.
pub fn new(val: u8) -> Self {
Self(val)
}
}
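// Hedged illustration (added, not in the original file) of the zero-vs-count
// convention documented above, using only the accessors shown in this impl:
#[cfg(test)]
#[test]
fn shard_count_zero_means_one_sketch() {
    let legacy = ShardCount::new(0);
    assert!(legacy.is_unsharded());
    assert_eq!(legacy.count(), 1); // unsharded tenants still have one shard
    assert_eq!(legacy.literal(), 0); // but keep the legacy zero on the wire

    let sharded = ShardCount::new(4);
    assert!(!sharded.is_unsharded());
    assert_eq!((sharded.count(), sharded.literal()), (4, 4));
}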
impl ShardNumber {
@@ -117,38 +86,14 @@ impl TenantShardId {
}
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
/// is useful when logging from code that is already in a span that includes tenant ID, to
/// keep messages reasonably terse.
pub fn to_index(&self) -> ShardIndex {
ShardIndex {
shard_number: self.shard_number,
shard_count: self.shard_count,
}
}
/// Calculate the children of this TenantShardId when splitting the overall tenant into
/// the given number of shards.
pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
let mut child_shards = Vec::new();
for shard_number in 0..ShardNumber(new_shard_count.0).0 {
// Key mapping is based on the key hash modulo the shard count (round robin),
// so our child shards are the ones that the same keys would map to.
if shard_number % effective_old_shard_count == self.shard_number.0 {
child_shards.push(TenantShardId {
tenant_id: self.tenant_id,
shard_number: ShardNumber(shard_number),
shard_count: new_shard_count,
})
}
}
child_shards
}
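    // Worked example (added commentary, not in the original source): a parent
    // with shard_number 1 of count 2 splitting to count 8 keeps exactly the
    // children whose number maps back to it under `n % 2 == 1`, i.e. shards
    // 1, 3, 5, and 7; see `shard_id_split` in the tests below.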
}
/// Formatting helper
@@ -502,12 +447,10 @@ impl ShardIdentity {
pub fn is_key_disposable(&self, key: &Key) -> bool {
if key_is_shard0(key) {
// Q: Why can't we dispose of shard0 content if we're not shard 0?
// A1: because the WAL ingestion logic currently ingests some shard 0
// content on all shards, even though it's only read on shard 0. If we
// dropped it, then subsequent WAL ingest to these keys would encounter
// an error.
// A2: because key_is_shard0 also covers relation size keys, which are written
// on all shards even though they're only maintained accurately on shard 0.
// A: because the WAL ingestion logic currently ingests some shard 0
// content on all shards, even though it's only read on shard 0. If we
// dropped it, then subsequent WAL ingest to these keys would encounter
// an error.
false
} else {
!self.is_key_local(key)
@@ -850,108 +793,4 @@ mod tests {
let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
assert_eq!(shard, ShardNumber(8));
}
#[test]
fn shard_id_split() {
let tenant_id = TenantId::generate();
let parent = TenantShardId::unsharded(tenant_id);
// Unsharded into 2
assert_eq!(
parent.split(ShardCount(2)),
vec![
TenantShardId {
tenant_id,
shard_count: ShardCount(2),
shard_number: ShardNumber(0)
},
TenantShardId {
tenant_id,
shard_count: ShardCount(2),
shard_number: ShardNumber(1)
}
]
);
// Unsharded into 4
assert_eq!(
parent.split(ShardCount(4)),
vec![
TenantShardId {
tenant_id,
shard_count: ShardCount(4),
shard_number: ShardNumber(0)
},
TenantShardId {
tenant_id,
shard_count: ShardCount(4),
shard_number: ShardNumber(1)
},
TenantShardId {
tenant_id,
shard_count: ShardCount(4),
shard_number: ShardNumber(2)
},
TenantShardId {
tenant_id,
shard_count: ShardCount(4),
shard_number: ShardNumber(3)
}
]
);
// count=1 into 2 (check this works the same as unsharded.)
let parent = TenantShardId {
tenant_id,
shard_count: ShardCount(1),
shard_number: ShardNumber(0),
};
assert_eq!(
parent.split(ShardCount(2)),
vec![
TenantShardId {
tenant_id,
shard_count: ShardCount(2),
shard_number: ShardNumber(0)
},
TenantShardId {
tenant_id,
shard_count: ShardCount(2),
shard_number: ShardNumber(1)
}
]
);
// count=2 into count=8
let parent = TenantShardId {
tenant_id,
shard_count: ShardCount(2),
shard_number: ShardNumber(1),
};
assert_eq!(
parent.split(ShardCount(8)),
vec![
TenantShardId {
tenant_id,
shard_count: ShardCount(8),
shard_number: ShardNumber(1)
},
TenantShardId {
tenant_id,
shard_count: ShardCount(8),
shard_number: ShardNumber(3)
},
TenantShardId {
tenant_id,
shard_count: ShardCount(8),
shard_number: ShardNumber(5)
},
TenantShardId {
tenant_id,
shard_count: ShardCount(8),
shard_number: ShardNumber(7)
},
]
);
}
}

Some files were not shown because too many files have changed in this diff.