Compare commits

..

1 Commits

Author SHA1 Message Date
Anna Khanova
136ed19387 Test 2024-03-27 13:42:33 +01:00
427 changed files with 13210 additions and 32571 deletions

View File

@@ -1,2 +1,2 @@
[profile.default] [profile.default]
slow-timeout = { period = "60s", terminate-after = 3 } slow-timeout = { period = "20s", terminate-after = 3 }

View File

@@ -22,7 +22,6 @@
!s3_scrubber/ !s3_scrubber/
!safekeeper/ !safekeeper/
!storage_broker/ !storage_broker/
!storage_controller/
!trace/ !trace/
!vendor/postgres-*/ !vendor/postgres-*/
!workspace_hack/ !workspace_hack/

View File

@@ -1,11 +1,12 @@
self-hosted-runner: self-hosted-runner:
labels: labels:
- arm64 - arm64
- dev
- gen3 - gen3
- large - large
- large-arm64 # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
- macos-14
- small - small
- small-arm64
- us-east-2 - us-east-2
config-variables: config-variables:
- REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_CONTAINER

View File

@@ -150,7 +150,7 @@ runs:
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
# and to keep files on the host to upload them to the database # and to keep files on the host to upload them to the database
time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
# Generate redirect # Generate redirect
cat <<EOF > ${WORKDIR}/index.html cat <<EOF > ${WORKDIR}/index.html

View File

@@ -10,7 +10,7 @@ inputs:
required: true required: true
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
outputs: outputs:
dsn: dsn:
description: 'Created Branch DSN (for main database)' description: 'Created Branch DSN (for main database)'

View File

@@ -13,7 +13,7 @@ inputs:
required: true required: true
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
runs: runs:
using: "composite" using: "composite"

View File

@@ -13,7 +13,7 @@ inputs:
default: 15 default: 15
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
provisioner: provisioner:
desctiption: 'k8s-pod or k8s-neonvm' desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod' default: 'k8s-pod'

View File

@@ -10,7 +10,7 @@ inputs:
required: true required: true
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
runs: runs:
using: "composite" using: "composite"

View File

@@ -18,7 +18,6 @@ on:
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }} group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
cancel-in-progress: false
env: env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -147,16 +147,15 @@ jobs:
"neonvm-captest-new" "neonvm-captest-new"
], ],
"db_size": [ "10gb" ], "db_size": [ "10gb" ],
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" }, { "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" }, { "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" }, { "platform": "neonvm-captest-new", "db_size": "50gb" }]
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
}' }'
if [ "$(date +%A)" = "Saturday" ]; then if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]') { "platform": "rds-aurora", "db_size": "50gb"}]')
fi fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -172,7 +171,7 @@ jobs:
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
{ "platform": "rds-aurora" }]') { "platform": "rds-aurora" }]')
fi fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -191,7 +190,7 @@ jobs:
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
{ "platform": "rds-aurora", "scale": "10" }]') { "platform": "rds-aurora", "scale": "10" }]')
fi fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -254,9 +253,6 @@ jobs:
neon-captest-reuse) neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;; ;;
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }} CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;; ;;
@@ -274,15 +270,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()") QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id") QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
QUERIES+=("SHOW neon.timeline_id")
fi fi
psql ${CONNSTR} -c "${QUERY}"
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Benchmark init - name: Benchmark init
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set
@@ -409,15 +401,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()") QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id") QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
QUERIES+=("SHOW neon.timeline_id")
fi fi
psql ${CONNSTR} -c "${QUERY}"
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: ClickBench benchmark - name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set
@@ -519,15 +507,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()") QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id") QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
QUERIES+=("SHOW neon.timeline_id")
fi fi
psql ${CONNSTR} -c "${QUERY}"
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Run TPC-H benchmark - name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set
@@ -613,15 +597,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()") QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id") QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
QUERIES+=("SHOW neon.timeline_id")
fi fi
psql ${CONNSTR} -c "${QUERY}"
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Run user examples - name: Run user examples
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set

View File

@@ -21,7 +21,6 @@ defaults:
concurrency: concurrency:
group: build-build-tools-image-${{ inputs.image-tag }} group: build-build-tools-image-${{ inputs.image-tag }}
cancel-in-progress: false
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {} permissions: {}
@@ -39,7 +38,7 @@ jobs:
matrix: matrix:
arch: [ x64, arm64 ] arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
env: env:
IMAGE_TAG: ${{ inputs.image-tag }} IMAGE_TAG: ${{ inputs.image-tag }}

View File

@@ -236,6 +236,27 @@ jobs:
submodules: true submodules: true
fetch-depth: 1 fetch-depth: 1
- name: Check Postgres submodules revision
shell: bash -euo pipefail {0}
run: |
# This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally).
# Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
FAILED=false
for postgres in postgres-v14 postgres-v15 postgres-v16; do
expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
actual=$(git rev-parse "HEAD:vendor/${postgres}")
if [ "${expected}" != "${actual}" ]; then
echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
FAILED=true
fi
done
if [ "${FAILED}" = "true" ]; then
echo >&2 "Please update vendor/revisions.json if these changes are intentional"
exit 1
fi
- name: Set pg 14 revision for caching - name: Set pg 14 revision for caching
id: pg_v14_rev id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
@@ -341,9 +362,6 @@ jobs:
env: env:
NEXTEST_RETRIES: 3 NEXTEST_RETRIES: 3
run: | run: |
#nextest does not yet support running doctests
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
for io_engine in std-fs tokio-epoll-uring ; do for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done done
@@ -459,8 +477,6 @@ jobs:
BUILD_TAG: ${{ needs.tag.outputs.build-tag }} BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: true
# Temporary disable this step until we figure out why it's so flaky # Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540 # Ref https://github.com/neondatabase/neon/issues/4540
@@ -540,33 +556,12 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: false
# XXX: no coverage data handling here, since benchmarks are run on release builds, # XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones # while coverage is currently collected for the debug ones
report-benchmarks-failures:
needs: [ benchmarks, create-test-report ]
if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
runs-on: ubuntu-latest
steps:
- uses: slackapi/slack-github-action@v1
with:
channel-id: C060CNA47S9 # on-call-staging-storage-stream
slack-message: |
Benchmarks failed on main: ${{ github.event.head_commit.url }}
Allure report: ${{ needs.create-test-report.outputs.report-url }}
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
create-test-report: create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ] needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
outputs:
report-url: ${{ steps.create-allure-report.outputs.report-url }}
runs-on: [ self-hosted, gen3, small ] runs-on: [ self-hosted, gen3, small ]
container: container:
@@ -740,7 +735,7 @@ jobs:
run: | run: |
mkdir -p .docker-custom mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2 - uses: docker/setup-buildx-action@v3
- uses: docker/login-action@v3 - uses: docker/login-action@v3
with: with:
@@ -797,7 +792,7 @@ jobs:
run: | run: |
mkdir -p .docker-custom mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2 - uses: docker/setup-buildx-action@v3
with: with:
# Disable parallelism for docker buildkit. # Disable parallelism for docker buildkit.
# As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
@@ -870,7 +865,7 @@ jobs:
run: run:
shell: sh -eu {0} shell: sh -eu {0}
env: env:
VM_BUILDER_VERSION: v0.28.1 VM_BUILDER_VERSION: v0.23.2
steps: steps:
- name: Checkout - name: Checkout
@@ -1132,15 +1127,15 @@ jobs:
-f deployProxy=false \ -f deployProxy=false \
-f deployStorage=true \ -f deployStorage=true \
-f deployStorageBroker=true \ -f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \ -f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \ -f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true -f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \ -f deployStorage=true \
-f deployStorageBroker=true \ -f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \ -f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} -f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
@@ -1149,7 +1144,6 @@ jobs:
-f deployProxy=true \ -f deployProxy=true \
-f deployStorage=false \ -f deployStorage=false \
-f deployStorageBroker=false \ -f deployStorageBroker=false \
-f deployStorageController=false \
-f branch=main \ -f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \ -f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true -f deployPreprodRegion=true

View File

@@ -28,9 +28,7 @@ jobs:
- name: Get build-tools image tag for the current commit - name: Get build-tools image tag for the current commit
id: get-build-tools-tag id: get-build-tools-tag
env: env:
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs, COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
COMMIT_SHA: ${{ github.sha }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: | run: |
LAST_BUILD_TOOLS_SHA=$( LAST_BUILD_TOOLS_SHA=$(

View File

@@ -136,7 +136,7 @@ jobs:
check-linux-arm-build: check-linux-arm-build:
needs: [ check-permissions, build-build-tools-image ] needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90 timeout-minutes: 90
runs-on: [ self-hosted, small-arm64 ] runs-on: [ self-hosted, dev, arm64 ]
env: env:
# Use release build only, to have less debug info around # Use release build only, to have less debug info around
@@ -232,20 +232,20 @@ jobs:
- name: Run cargo build - name: Run cargo build
run: | run: |
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run cargo test - name: Run cargo test
env: env:
NEXTEST_RETRIES: 3 NEXTEST_RETRIES: 3
run: | run: |
cargo nextest run $CARGO_FEATURES -j$(nproc) cargo nextest run $CARGO_FEATURES
# Run separate tests for real S3 # Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1 export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc) cargo nextest run --package remote_storage --test test_real_s3
# Run separate tests for real Azure Blob Storage # Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region # XXX: replace region with `eu-central-1`-like region
@@ -255,12 +255,12 @@ jobs:
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_azure -j$(nproc) cargo nextest run --package remote_storage --test test_real_azure
check-codestyle-rust-arm: check-codestyle-rust-arm:
needs: [ check-permissions, build-build-tools-image ] needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90 timeout-minutes: 90
runs-on: [ self-hosted, small-arm64 ] runs-on: [ self-hosted, dev, arm64 ]
container: container:
image: ${{ needs.build-build-tools-image.outputs.image }} image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -269,11 +269,6 @@ jobs:
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init options: --init
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
steps: steps:
- name: Fix git ownership - name: Fix git ownership
run: | run: |
@@ -310,35 +305,31 @@ jobs:
exit 1 exit 1
fi fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug) - name: Run cargo clippy (debug)
if: matrix.build_type == 'debug'
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Run cargo clippy (release) - name: Run cargo clippy (release)
if: matrix.build_type == 'release'
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation - name: Check documentation generation
if: matrix.build_type == 'release' run: cargo doc --workspace --no-deps --document-private-items
run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
env: env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting - name: Check formatting
if: ${{ !cancelled() && matrix.build_type == 'release' }} if: ${{ !cancelled() }}
run: cargo fmt --all -- --check run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies - name: Check rust dependencies
if: ${{ !cancelled() && matrix.build_type == 'release' }} if: ${{ !cancelled() }}
run: | run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny # https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources - name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() && matrix.build_type == 'release' }} if: ${{ !cancelled() }}
run: cargo deny check run: cargo deny check
gather-rust-build-stats: gather-rust-build-stats:
@@ -347,7 +338,7 @@ jobs:
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main' github.ref_name == 'main'
runs-on: [ self-hosted, large ] runs-on: [ self-hosted, gen3, large ]
container: container:
image: ${{ needs.build-build-tools-image.outputs.image }} image: ${{ needs.build-build-tools-image.outputs.image }}
credentials: credentials:
@@ -378,7 +369,7 @@ jobs:
run: make walproposer-lib -j$(nproc) run: make walproposer-lib -j$(nproc)
- name: Produce the build stats - name: Produce the build stats
run: cargo build --all --release --timings -j$(nproc) run: cargo build --all --release --timings
- name: Upload the build stats - name: Upload the build stats
id: upload-stats id: upload-stats

View File

@@ -20,7 +20,6 @@ defaults:
concurrency: concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }} group: pin-build-tools-image-${{ inputs.from-tag }}
cancel-in-progress: false
permissions: {} permissions: {}

View File

@@ -62,14 +62,14 @@ jobs:
trigger-e2e-tests: trigger-e2e-tests:
needs: [ tag ] needs: [ tag ]
runs-on: ubuntu-latest runs-on: [ self-hosted, gen3, small ]
env: env:
TAG: ${{ needs.tag.outputs.build-tag }} TAG: ${{ needs.tag.outputs.build-tag }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
steps: steps:
- name: check if ecr image are present - name: check if ecr image are present
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
run: | run: |
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
@@ -79,55 +79,41 @@ jobs:
fi fi
done done
- name: Set e2e-platforms
id: e2e-platforms
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Default set of platforms to run e2e tests on
platforms='["docker", "k8s"]'
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
# If the workflow run is not a pull request, add k8s-neonvm to the list.
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
case "$f" in
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
;;
*)
# no-op
;;
esac
done
else
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
fi
echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
- name: Set PR's status to pending and request a remote CI test - name: Set PR's status to pending and request a remote CI test
env:
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: | run: |
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud" # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \ REMOTE_REPO="${{ github.repository_owner }}/cloud"
--method POST \
--raw-field "state=pending" \
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
--raw-field "context=neon-cloud-e2e"
gh workflow --repo ${REMOTE_REPO} \ curl -f -X POST \
run testing.yml \ https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
--ref "main" \ -H "Accept: application/vnd.github.v3+json" \
--raw-field "ci_job_name=neon-cloud-e2e" \ --user "${{ secrets.CI_ACCESS_TOKEN }}" \
--raw-field "commit_hash=$COMMIT_SHA" \ --data \
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \ "{
--raw-field "storage_image_tag=${TAG}" \ \"state\": \"pending\",
--raw-field "compute_image_tag=${TAG}" \ \"context\": \"neon-cloud-e2e\",
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \ \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
--raw-field "e2e-platforms=${E2E_PLATFORMS}" }"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${TAG}\",
\"compute_image_tag\": \"${TAG}\",
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
}
}"

View File

@@ -1,5 +1,5 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute /compute_tools/ @neondatabase/control-plane @neondatabase/compute
/storage_controller @neondatabase/storage /control_plane/attachment_service @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers /libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
/libs/remote_storage/ @neondatabase/storage /libs/remote_storage/ @neondatabase/storage

1034
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -3,7 +3,7 @@ resolver = "2"
members = [ members = [
"compute_tools", "compute_tools",
"control_plane", "control_plane",
"control_plane/storcon_cli", "control_plane/attachment_service",
"pageserver", "pageserver",
"pageserver/compaction", "pageserver/compaction",
"pageserver/ctl", "pageserver/ctl",
@@ -12,7 +12,6 @@ members = [
"proxy", "proxy",
"safekeeper", "safekeeper",
"storage_broker", "storage_broker",
"storage_controller",
"s3_scrubber", "s3_scrubber",
"workspace_hack", "workspace_hack",
"trace", "trace",
@@ -44,22 +43,21 @@ license = "Apache-2.0"
anyhow = { version = "1.0", features = ["backtrace"] } anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6" arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0" azure_core = "0.18"
azure_core = "0.19" azure_identity = "0.18"
azure_identity = "0.19" azure_storage = "0.18"
azure_storage = "0.19" azure_storage_blobs = "0.18"
azure_storage_blobs = "0.19"
flate2 = "1.0.26" flate2 = "1.0.26"
async-stream = "0.3" async-stream = "0.3"
async-trait = "0.1" async-trait = "0.1"
aws-config = { version = "1.3", default-features = false, features=["rustls"] } aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.26" aws-sdk-s3 = "1.14"
aws-sdk-iam = "1.15.0" aws-sdk-iam = "1.15.0"
aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.9" aws-smithy-types = "1.1.4"
aws-credential-types = "1.2.0" aws-credential-types = "1.1.4"
aws-sigv4 = { version = "1.2.1", features = ["sign-http"] } aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
aws-types = "1.2.0" aws-types = "1.1.7"
axum = { version = "0.6.20", features = ["ws"] } axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0" base64 = "0.13.0"
bincode = "1.3" bincode = "1.3"
@@ -81,14 +79,13 @@ enum-map = "2.4.2"
enumset = "1.0.12" enumset = "1.0.12"
fail = "0.5.0" fail = "0.5.0"
fallible-iterator = "0.2" fallible-iterator = "0.2"
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
fs2 = "0.4.3" fs2 = "0.4.3"
futures = "0.3" futures = "0.3"
futures-core = "0.3" futures-core = "0.3"
futures-util = "0.3" futures-util = "0.3"
git-version = "0.3" git-version = "0.3"
hashbrown = "0.14" hashbrown = "0.13"
hashlink = "0.9.1" hashlink = "0.8.4"
hdrhistogram = "7.5.2" hdrhistogram = "7.5.2"
hex = "0.4" hex = "0.4"
hex-literal = "0.4" hex-literal = "0.4"
@@ -99,8 +96,7 @@ http-types = { version = "2", default-features = false }
humantime = "2.1" humantime = "2.1"
humantime-serde = "1.1.1" humantime-serde = "1.1.1"
hyper = "0.14" hyper = "0.14"
tokio-tungstenite = "0.20.0" hyper-tungstenite = "0.11"
indexmap = "2"
inotify = "0.10.2" inotify = "0.10.2"
ipnet = "2.9.0" ipnet = "2.9.0"
itertools = "0.10" itertools = "0.10"
@@ -109,8 +105,7 @@ lasso = "0.7"
leaky-bucket = "1.0.1" leaky-bucket = "1.0.1"
libc = "0.2" libc = "0.2"
md5 = "0.7.0" md5 = "0.7.0"
measured = { version = "0.0.21", features=["lasso"] } measured = { version = "0.0.13", features=["default", "lasso"] }
measured-process = { version = "0.0.21" }
memoffset = "0.8" memoffset = "0.8"
native-tls = "0.2" native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -122,8 +117,8 @@ opentelemetry = "0.20.0"
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.12.0" opentelemetry-semantic-conventions = "0.12.0"
parking_lot = "0.12" parking_lot = "0.12"
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "51.0.0" parquet_derive = "49.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2" pin-project-lite = "0.2"
procfs = "0.14" procfs = "0.14"
@@ -132,10 +127,10 @@ prost = "0.11"
rand = "0.8" rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2" regex = "1.10.2"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] } reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
reqwest-middleware = "0.3.0" reqwest-middleware = "0.2.0"
reqwest-retry = "0.5" reqwest-retry = "0.2.2"
routerify = "3" routerify = "3"
rpds = "0.13" rpds = "0.13"
rustc-hash = "1.1.0" rustc-hash = "1.1.0"
@@ -145,7 +140,7 @@ rustls-split = "0.3"
scopeguard = "1.1" scopeguard = "1.1"
sysinfo = "0.29.2" sysinfo = "0.29.2"
sd-notify = "0.4.1" sd-notify = "0.4.1"
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
serde_json = "1" serde_json = "1"
serde_path_to_error = "0.1" serde_path_to_error = "0.1"
@@ -159,12 +154,11 @@ socket2 = "0.5"
strum = "0.24" strum = "0.24"
strum_macros = "0.24" strum_macros = "0.24"
"subtle" = "2.5.0" "subtle" = "2.5.0"
# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet svg_fmt = "0.4.1"
svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
sync_wrapper = "0.1.2" sync_wrapper = "0.1.2"
tar = "0.4" tar = "0.4"
task-local-extensions = "0.1.4" task-local-extensions = "0.1.4"
test-context = "0.3" test-context = "0.1"
thiserror = "1.0" thiserror = "1.0"
tikv-jemallocator = "0.5" tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5" tikv-jemalloc-ctl = "0.5"
@@ -179,11 +173,10 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] }
toml = "0.7" toml = "0.7"
toml_edit = "0.19" toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]} tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tower-service = "0.3.2"
tracing = "0.1" tracing = "0.1"
tracing-error = "0.2.0" tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0" tracing-opentelemetry = "0.20.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false } twox-hash = { version = "1.6.3", default-features = false }
url = "2.2" url = "2.2"
urlencoding = "2.1" urlencoding = "2.1"
@@ -244,8 +237,8 @@ tonic-build = "0.9"
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
# bug fixes for UUID # bug fixes for UUID
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" } parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
################# Binary contents sections ################# Binary contents sections

View File

@@ -58,14 +58,8 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
&& mv protoc/include/google /usr/local/include/google \ && mv protoc/include/google /usr/local/include/google \
&& rm -rf protoc.zip protoc && rm -rf protoc.zip protoc
# s5cmd
ENV S5CMD_VERSION=2.2.2
RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
&& chmod +x s5cmd \
&& mv s5cmd /usr/local/bin/s5cmd
# LLVM # LLVM
ENV LLVM_VERSION=18 ENV LLVM_VERSION=17
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& apt update \ && apt update \
@@ -87,7 +81,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
&& rm awscliv2.zip && rm awscliv2.zip
# Mold: A Modern Linker # Mold: A Modern Linker
ENV MOLD_VERSION v2.31.0 ENV MOLD_VERSION v2.4.0
RUN set -e \ RUN set -e \
&& git clone https://github.com/rui314/mold.git \ && git clone https://github.com/rui314/mold.git \
&& mkdir mold/build \ && mkdir mold/build \
@@ -141,7 +135,7 @@ WORKDIR /home/nonroot
# Rust # Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.78.0 ENV RUSTC_VERSION=1.77.0
ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \

View File

@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
# Create remote extension download directory
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
# Install: # Install:
# libreadline8 for psql # libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check) # libicu67, locales for collations (including ICU and plpgsql_check)

View File

@@ -25,16 +25,14 @@ ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux # Seccomp BPF is only available for Linux
PG_CONFIGURE_OPTS += --with-libseccomp PG_CONFIGURE_OPTS += --with-libseccomp
else ifeq ($(UNAME_S),Darwin) else ifeq ($(UNAME_S),Darwin)
ifndef DISABLE_HOMEBREW # macOS with brew-installed openssl requires explicit paths
# macOS with brew-installed openssl requires explicit paths # It can be configured with OPENSSL_PREFIX variable
# It can be configured with OPENSSL_PREFIX variable OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
endif
endif endif
# Use -C option so that when PostgreSQL "make install" installs the # Use -C option so that when PostgreSQL "make install" installs the
@@ -81,14 +79,11 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \ echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
exit 1; } exit 1; }
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
VERSION=$*; \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
CFLAGS='$(PG_CFLAGS)' \ CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ $(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)
# nicer alias to run 'configure' # nicer alias to run 'configure'
# Note: I've been unable to use templates for this part of our configuration. # Note: I've been unable to use templates for this part of our configuration.

View File

@@ -27,12 +27,10 @@ reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true tokio-postgres.workspace = true
tokio-util.workspace = true tokio-util.workspace = true
tokio-stream.workspace = true
tracing.workspace = true tracing.workspace = true
tracing-opentelemetry.workspace = true tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true tracing-subscriber.workspace = true
tracing-utils.workspace = true tracing-utils.workspace = true
thiserror.workspace = true
url.workspace = true url.workspace = true
compute_api.workspace = true compute_api.workspace = true

View File

@@ -47,11 +47,10 @@ use chrono::Utc;
use clap::Arg; use clap::Arg;
use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals}; use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info, warn}; use tracing::{error, info};
use url::Url; use url::Url;
use compute_api::responses::ComputeStatus; use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{ use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
@@ -63,41 +62,12 @@ use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor; use compute_tools::monitor::launch_monitor;
use compute_tools::params::*; use compute_tools::params::*;
use compute_tools::spec::*; use compute_tools::spec::*;
use compute_tools::swap::resize_swap;
// this is an arbitrary build tag. Fine as a default / for testing purposes // this is an arbitrary build tag. Fine as a default / for testing purposes
// in-case of not-set environment var // in-case of not-set environment var
const BUILD_TAG_DEFAULT: &str = "latest"; const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> { fn main() -> Result<()> {
let (build_tag, clap_args) = init()?;
let (pg_handle, start_pg_result) = {
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();
let cli_args = process_cli(&clap_args)?;
let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
start_postgres(&clap_args, wait_spec_result)?
// Startup is finished, exit the startup tracing span
};
// PostgreSQL is now running, if startup was successful. Wait until it exits.
let wait_pg_result = wait_postgres(pg_handle)?;
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
maybe_delay_exit(delay_exit);
deinit_and_exit(wait_pg_result);
}
fn init() -> Result<(String, clap::ArgMatches)> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -112,15 +82,9 @@ fn init() -> Result<(String, clap::ArgMatches)> {
.to_string(); .to_string();
info!("build_tag: {build_tag}"); info!("build_tag: {build_tag}");
Ok((build_tag, cli().get_matches())) let matches = cli().get_matches();
} let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
let pgbin_default = "postgres";
let pgbin = matches
.get_one::<String>("pgbin")
.map(|s| s.as_str())
.unwrap_or(pgbin_default);
let ext_remote_storage = matches let ext_remote_storage = matches
.get_one::<String>("remote-ext-config") .get_one::<String>("remote-ext-config")
@@ -146,32 +110,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
.expect("Postgres connection string is required"); .expect("Postgres connection string is required");
let spec_json = matches.get_one::<String>("spec"); let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path"); let spec_path = matches.get_one::<String>("spec-path");
let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
Ok(ProcessCliResult {
connstr,
pgdata,
pgbin,
ext_remote_storage,
http_port,
spec_json,
spec_path,
resize_swap_on_bind,
})
}
struct ProcessCliResult<'clap> {
connstr: &'clap str,
pgdata: &'clap str,
pgbin: &'clap str,
ext_remote_storage: Option<&'clap str>,
http_port: u16,
spec_json: Option<&'clap String>,
spec_path: Option<&'clap String>,
resize_swap_on_bind: bool,
}
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
// Extract OpenTelemetry context for the startup actions from the // Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current // TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context. // tracing context.
@@ -208,7 +147,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
if let Ok(val) = std::env::var("TRACESTATE") { if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val); startup_tracing_carrier.insert("tracestate".to_string(), val);
} }
if !startup_tracing_carrier.is_empty() { let startup_context_guard = if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator; use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator; use opentelemetry::sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new() let guard = TraceContextPropagator::new()
@@ -218,17 +157,8 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
Some(guard) Some(guard)
} else { } else {
None None
} };
}
fn try_spec_from_cli(
matches: &clap::ArgMatches,
ProcessCliResult {
spec_json,
spec_path,
..
}: &ProcessCliResult,
) -> Result<CliSpecParams> {
let compute_id = matches.get_one::<String>("compute-id"); let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri"); let control_plane_uri = matches.get_one::<String>("control-plane-uri");
@@ -269,34 +199,6 @@ fn try_spec_from_cli(
} }
}; };
Ok(CliSpecParams {
spec,
live_config_allowed,
})
}
struct CliSpecParams {
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
live_config_allowed: bool,
}
fn wait_spec(
build_tag: String,
ProcessCliResult {
connstr,
pgdata,
pgbin,
ext_remote_storage,
resize_swap_on_bind,
http_port,
..
}: ProcessCliResult,
CliSpecParams {
spec,
live_config_allowed,
}: CliSpecParams,
) -> Result<WaitSpecResult> {
let mut new_state = ComputeState::new(); let mut new_state = ComputeState::new();
let spec_set; let spec_set;
@@ -324,17 +226,19 @@ fn wait_spec(
// If this is a pooled VM, prewarm before starting HTTP server and becoming // If this is a pooled VM, prewarm before starting HTTP server and becoming
// available for binding. Prewarming helps Postgres start quicker later, // available for binding. Prewarming helps Postgres start quicker later,
// because QEMU will already have its memory allocated from the host, and // because QEMU will already have it's memory allocated from the host, and
// the necessary binaries will already be cached. // the necessary binaries will already be cached.
if !spec_set { if !spec_set {
compute.prewarm_postgres()?; compute.prewarm_postgres()?;
} }
// Launch http service first, so that we can serve control-plane requests // Launch http service first, so we were able to serve control-plane
// while configuration is still in progress. // requests, while configuration is still in progress.
let _http_handle = let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let extension_server_port: u16 = http_port;
if !spec_set { if !spec_set {
// No spec provided, hang waiting for it. // No spec provided, hang waiting for it.
info!("no compute spec provided, waiting"); info!("no compute spec provided, waiting");
@@ -349,45 +253,21 @@ fn wait_spec(
break; break;
} }
} }
// Record for how long we slept waiting for the spec.
let now = Utc::now();
state.metrics.wait_for_spec_ms = now
.signed_duration_since(state.start_time)
.to_std()
.unwrap()
.as_millis() as u64;
// Reset start time, so that the total startup time that is calculated later will
// not include the time that we waited for the spec.
state.start_time = now;
} }
Ok(WaitSpecResult {
compute,
http_port,
resize_swap_on_bind,
})
}
struct WaitSpecResult {
compute: Arc<ComputeNode>,
// passed through from ProcessCliResult
http_port: u16,
resize_swap_on_bind: bool,
}
fn start_postgres(
// need to allow unused because `matches` is only used if target_os = "linux"
#[allow(unused_variables)] matches: &clap::ArgMatches,
WaitSpecResult {
compute,
http_port,
resize_swap_on_bind,
}: WaitSpecResult,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state. // We got all we need, update the state.
let mut state = compute.state.lock().unwrap(); let mut state = compute.state.lock().unwrap();
// Record for how long we slept waiting for the spec.
state.metrics.wait_for_spec_ms = Utc::now()
.signed_duration_since(state.start_time)
.to_std()
.unwrap()
.as_millis() as u64;
// Reset start time to the actual start of the configuration, so that
// total startup time was properly measured at the end.
state.start_time = Utc::now();
state.status = ComputeStatus::Init; state.status = ComputeStatus::Init;
compute.state_changed.notify_all(); compute.state_changed.notify_all();
@@ -395,72 +275,33 @@ fn start_postgres(
"running compute with features: {:?}", "running compute with features: {:?}",
state.pspec.as_ref().unwrap().spec.features state.pspec.as_ref().unwrap().spec.features
); );
// before we release the mutex, fetch the swap size (if any) for later.
let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
drop(state); drop(state);
// Launch remaining service threads // Launch remaining service threads
let _monitor_handle = launch_monitor(&compute); let _monitor_handle = launch_monitor(&compute);
let _configurator_handle = launch_configurator(&compute); let _configurator_handle = launch_configurator(&compute);
let mut prestartup_failed = false;
let mut delay_exit = false;
// Resize swap to the desired size if the compute spec says so
if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) {
// To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
// *before* starting postgres.
//
// In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
// carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
// OOM-killed during startup because swap wasn't available yet.
match resize_swap(size_bytes) {
Ok(()) => {
let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%size_bytes, %size_gib, "resized swap");
}
Err(err) => {
let err = err.context("failed to resize swap");
error!("{err:#}");
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
let mut state = compute.state.lock().unwrap();
state.error = Some(format!("{err:?}"));
state.status = ComputeStatus::Failed;
compute.state_changed.notify_all();
delay_exit = true;
}
}
}
let extension_server_port: u16 = http_port;
// Start Postgres // Start Postgres
let mut pg = None; let mut delay_exit = false;
if !prestartup_failed { let mut exit_code = None;
pg = match compute.start_compute(extension_server_port) { let pg = match compute.start_compute(extension_server_port) {
Ok(pg) => Some(pg), Ok(pg) => Some(pg),
Err(err) => { Err(err) => {
error!("could not start the compute node: {:#}", err); error!("could not start the compute node: {:#}", err);
let mut state = compute.state.lock().unwrap(); let mut state = compute.state.lock().unwrap();
state.error = Some(format!("{:?}", err)); state.error = Some(format!("{:?}", err));
state.status = ComputeStatus::Failed; state.status = ComputeStatus::Failed;
// Notify others that Postgres failed to start. In case of configuring the // Notify others that Postgres failed to start. In case of configuring the
// empty compute, it's likely that API handler is still waiting for compute // empty compute, it's likely that API handler is still waiting for compute
// state change. With this we will notify it that compute is in Failed state, // state change. With this we will notify it that compute is in Failed state,
// so control plane will know about it earlier and record proper error instead // so control plane will know about it earlier and record proper error instead
// of timeout. // of timeout.
compute.state_changed.notify_all(); compute.state_changed.notify_all();
drop(state); // unlock drop(state); // unlock
delay_exit = true; delay_exit = true;
None None
} }
}; };
} else {
warn!("skipping postgres startup because pre-startup step failed");
}
// Start the vm-monitor if directed to. The vm-monitor only runs on linux // Start the vm-monitor if directed to. The vm-monitor only runs on linux
// because it requires cgroups. // because it requires cgroups.
@@ -493,7 +334,7 @@ fn start_postgres(
// This token is used internally by the monitor to clean up all threads // This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new(); let token = CancellationToken::new();
let vm_monitor = rt.as_ref().map(|rt| { let vm_monitor = &rt.as_ref().map(|rt| {
rt.spawn(vm_monitor::start( rt.spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args { Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(), cgroup: cgroup.cloned(),
@@ -506,41 +347,12 @@ fn start_postgres(
} }
} }
Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
rt,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
struct StartPostgresResult {
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,
#[cfg(target_os = "linux")]
rt: Option<tokio::runtime::Runtime>,
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will // Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well. // propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg { if let Some((mut pg, logs_handle)) = pg {
// Startup is finished, exit the startup tracing span
drop(startup_context_guard);
let ecode = pg let ecode = pg
.wait() .wait()
.expect("failed to start waiting on Postgres process"); .expect("failed to start waiting on Postgres process");
@@ -555,25 +367,6 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
exit_code = ecode.code() exit_code = ecode.code()
} }
Ok(WaitPostgresResult { exit_code })
}
struct WaitPostgresResult {
exit_code: Option<i32>,
}
fn cleanup_after_postgres_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
rt,
}: StartPostgresResult,
) -> Result<bool> {
// Terminate the vm_monitor so it releases the file watcher on // Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres. // /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups. // Note: the vm-monitor only runs on linux because it requires cgroups.
@@ -615,19 +408,13 @@ fn cleanup_after_postgres_exit(
error!("error while checking for core dumps: {err:?}"); error!("error while checking for core dumps: {err:?}");
} }
Ok(delay_exit)
}
fn maybe_delay_exit(delay_exit: bool) {
// If launch failed, keep serving HTTP requests for a while, so the cloud // If launch failed, keep serving HTTP requests for a while, so the cloud
// control plane can get the actual error. // control plane can get the actual error.
if delay_exit { if delay_exit {
info!("giving control plane 30s to collect the error before shutdown"); info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30)); thread::sleep(Duration::from_secs(30));
} }
}
fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any // Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may // pending traces before we exit. Shutting down OTEL tracing provider may
// hang for quite some time, see, for example: // hang for quite some time, see, for example:
@@ -739,11 +526,6 @@ fn cli() -> clap::Command {
) )
.value_name("FILECACHE_CONNSTR"), .value_name("FILECACHE_CONNSTR"),
) )
.arg(
Arg::new("resize-swap-on-bind")
.long("resize-swap-on-bind")
.action(clap::ArgAction::SetTrue),
)
} }
/// When compute_ctl is killed, send also termination signal to sync-safekeepers /// When compute_ctl is killed, send also termination signal to sync-safekeepers

View File

@@ -1,116 +0,0 @@
use compute_api::{
responses::CatalogObjects,
spec::{Database, Role},
};
use futures::Stream;
use postgres::{Client, NoTls};
use std::{path::Path, process::Stdio, result::Result, sync::Arc};
use tokio::{
io::{AsyncBufReadExt, BufReader},
process::Command,
task,
};
use tokio_stream::{self as stream, StreamExt};
use tokio_util::codec::{BytesCodec, FramedRead};
use tracing::warn;
use crate::{
compute::ComputeNode,
pg_helpers::{get_existing_dbs, get_existing_roles},
};
pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
let connstr = compute.connstr.clone();
task::spawn_blocking(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?;
let roles: Vec<Role>;
{
let mut xact = client.transaction()?;
roles = get_existing_roles(&mut xact)?;
}
let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
Ok(CatalogObjects { roles, databases })
})
.await?
}
#[derive(Debug, thiserror::Error)]
pub enum SchemaDumpError {
#[error("Database does not exist.")]
DatabaseDoesNotExist,
#[error("Failed to execute pg_dump.")]
IO(#[from] std::io::Error),
}
// It uses the pg_dump utility to dump the schema of the specified database.
// The output is streamed back to the caller and supposed to be streamed via HTTP.
//
// Before return the result with the output, it checks that pg_dump produced any output.
// If not, it tries to parse the stderr output to determine if the database does not exist
// and special error is returned.
//
// To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature.
pub async fn get_database_schema(
compute: &Arc<ComputeNode>,
dbname: &str,
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
let pgbin = &compute.pgbin;
let basepath = Path::new(pgbin).parent().unwrap();
let pgdump = basepath.join("pg_dump");
let mut connstr = compute.connstr.clone();
connstr.set_path(dbname);
let mut cmd = Command::new(pgdump)
.arg("--schema-only")
.arg(connstr.as_str())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.kill_on_drop(true)
.spawn()?;
let stdout = cmd.stdout.take().ok_or_else(|| {
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
})?;
let stderr = cmd.stderr.take().ok_or_else(|| {
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
})?;
let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
let stderr_reader = BufReader::new(stderr);
let first_chunk = match stdout_reader.next().await {
Some(Ok(bytes)) if !bytes.is_empty() => bytes,
Some(Err(e)) => {
return Err(SchemaDumpError::IO(e));
}
_ => {
let mut lines = stderr_reader.lines();
if let Some(line) = lines.next_line().await? {
if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) {
return Err(SchemaDumpError::DatabaseDoesNotExist);
}
warn!("pg_dump stderr: {}", line)
}
tokio::spawn(async move {
while let Ok(Some(line)) = lines.next_line().await {
warn!("pg_dump stderr: {}", line)
}
});
return Err(SchemaDumpError::IO(std::io::Error::new(
std::io::ErrorKind::Other,
"failed to start pg_dump",
)));
}
};
let initial_stream = stream::once(Ok(first_chunk.freeze()));
// Consume stderr and log warnings
tokio::spawn(async move {
let mut lines = stderr_reader.lines();
while let Ok(Some(line)) = lines.next_line().await {
warn!("pg_dump stderr: {}", line)
}
});
Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
}

View File

@@ -818,15 +818,9 @@ impl ComputeNode {
Client::connect(zenith_admin_connstr.as_str(), NoTls) Client::connect(zenith_admin_connstr.as_str(), NoTls)
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
// Disable forwarding so that users don't get a cloud_admin role // Disable forwarding so that users don't get a cloud_admin role
client.simple_query("SET neon.forward_ddl = false")?;
let mut func = || { client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("SET neon.forward_ddl = false")?; client.simple_query("GRANT zenith_admin TO cloud_admin")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
Ok::<_, anyhow::Error>(())
};
func().context("apply_config setup cloud_admin")?;
drop(client); drop(client);
// reconnect with connstring with expected name // reconnect with connstring with expected name
@@ -838,29 +832,24 @@ impl ComputeNode {
}; };
// Disable DDL forwarding because control plane already knows about these roles/databases. // Disable DDL forwarding because control plane already knows about these roles/databases.
client client.simple_query("SET neon.forward_ddl = false")?;
.simple_query("SET neon.forward_ddl = false")
.context("apply_config SET neon.forward_ddl = false")?;
// Proceed with post-startup configuration. Note, that order of operations is important. // Proceed with post-startup configuration. Note, that order of operations is important.
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; create_neon_superuser(spec, &mut client)?;
cleanup_instance(&mut client).context("apply_config cleanup_instance")?; cleanup_instance(&mut client)?;
handle_roles(spec, &mut client).context("apply_config handle_roles")?; handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client).context("apply_config handle_databases")?; handle_databases(spec, &mut client)?;
handle_role_deletions(spec, connstr.as_str(), &mut client) handle_role_deletions(spec, connstr.as_str(), &mut client)?;
.context("apply_config handle_role_deletions")?;
handle_grants( handle_grants(
spec, spec,
&mut client, &mut client,
connstr.as_str(), connstr.as_str(),
self.has_feature(ComputeFeature::AnonExtension), self.has_feature(ComputeFeature::AnonExtension),
) )?;
.context("apply_config handle_grants")?; handle_extensions(spec, &mut client)?;
handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; handle_extension_neon(&mut client)?;
handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; create_availability_check_data(&mut client)?;
create_availability_check_data(&mut client)
.context("apply_config create_availability_check_data")?;
// 'Close' connection // 'Close' connection
drop(client); drop(client);
@@ -868,7 +857,7 @@ impl ComputeNode {
// Run migrations separately to not hold up cold starts // Run migrations separately to not hold up cold starts
thread::spawn(move || { thread::spawn(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?; let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client).context("apply_config handle_migrations") handle_migrations(&mut client)
}); });
Ok(()) Ok(())
} }
@@ -1273,12 +1262,10 @@ LIMIT 100",
.await .await
.map_err(DownloadError::Other); .map_err(DownloadError::Other);
if download_size.is_ok() { self.ext_download_progress
self.ext_download_progress .write()
.write() .expect("bad lock")
.expect("bad lock") .insert(ext_archive_name.to_string(), (download_start, true));
.insert(ext_archive_name.to_string(), (download_start, true));
}
download_size download_size
} }

View File

@@ -6,8 +6,8 @@ use std::path::Path;
use anyhow::Result; use anyhow::Result;
use crate::pg_helpers::escape_conf_value; use crate::pg_helpers::escape_conf_value;
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; use crate::pg_helpers::PgOptionsSerialize;
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; use compute_api::spec::{ComputeMode, ComputeSpec};
/// Check that `line` is inside a text file and put it there if it is not. /// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist. /// Create file if it doesn't exist.
@@ -92,27 +92,6 @@ pub fn write_postgres_conf(
} }
} }
if cfg!(target_os = "linux") {
// Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
// disabled), then the control plane has enabled swap and we should set
// dynamic_shared_memory_type = 'mmap'.
//
// This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
// ignore any errors - they may be expected to occur under certain situations (e.g. when
// not running in Linux).
.unwrap_or_else(|_| String::new());
if overcommit_memory_contents.trim() == "2" {
let opt = GenericOption {
name: "dynamic_shared_memory_type".to_owned(),
value: Some("mmap".to_owned()),
vartype: "enum".to_owned(),
};
write!(file, "{}", opt.to_pg_setting())?;
}
}
// If there are any extra options in the 'settings' field, append those // If there are any extra options in the 'settings' field, append those
if spec.cluster.settings.is_some() { if spec.cluster.settings.is_some() {
writeln!(file, "# Managed by compute_ctl: begin")?; writeln!(file, "# Managed by compute_ctl: begin")?;

View File

@@ -5,21 +5,17 @@ use std::net::SocketAddr;
use std::sync::Arc; use std::sync::Arc;
use std::thread; use std::thread;
use crate::catalog::SchemaDumpError;
use crate::catalog::{get_database_schema, get_dbs_and_roles};
use crate::compute::forward_termination_signal; use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_api::requests::ConfigurationRequest; use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
use anyhow::Result; use anyhow::Result;
use hyper::header::CONTENT_TYPE;
use hyper::service::{make_service_fn, service_fn}; use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode}; use hyper::{Body, Method, Request, Response, Server, StatusCode};
use tokio::task; use tokio::task;
use tracing::{error, info, warn}; use tracing::{error, info, warn};
use tracing_utils::http::OtelName; use tracing_utils::http::OtelName;
use utils::http::request::must_get_query_param;
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
ComputeStatusResponse { ComputeStatusResponse {
@@ -137,34 +133,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
} }
} }
(&Method::GET, "/dbs_and_roles") => {
info!("serving /dbs_and_roles GET request",);
match get_dbs_and_roles(compute).await {
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
Err(_) => {
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::GET, "/database_schema") => {
let database = match must_get_query_param(&req, "database") {
Err(e) => return e.into_response(),
Ok(database) => database,
};
info!("serving /database_schema GET request with database: {database}",);
match get_database_schema(compute, &database).await {
Ok(res) => render_plain(Body::wrap_stream(res)),
Err(SchemaDumpError::DatabaseDoesNotExist) => {
render_json_error("database does not exist", StatusCode::NOT_FOUND)
}
Err(e) => {
error!("can't get schema dump: {}", e);
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// download extension files from remote extension storage on demand // download extension files from remote extension storage on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => { (&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route); info!("serving {:?} POST request", route);
@@ -335,25 +303,10 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
}; };
Response::builder() Response::builder()
.status(status) .status(status)
.header(CONTENT_TYPE, "application/json")
.body(Body::from(serde_json::to_string(&error).unwrap())) .body(Body::from(serde_json::to_string(&error).unwrap()))
.unwrap() .unwrap()
} }
fn render_json(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "application/json")
.body(body)
.unwrap()
}
fn render_plain(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "text/plain")
.body(body)
.unwrap()
}
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> { async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
{ {
let mut state = compute.state.lock().unwrap(); let mut state = compute.state.lock().unwrap();

View File

@@ -68,51 +68,6 @@ paths:
schema: schema:
$ref: "#/components/schemas/Info" $ref: "#/components/schemas/Info"
/dbs_and_roles:
get:
tags:
- Info
summary: Get databases and roles in the catalog.
description: ""
operationId: getDbsAndRoles
responses:
200:
description: Compute schema objects
content:
application/json:
schema:
$ref: "#/components/schemas/DbsAndRoles"
/database_schema:
get:
tags:
- Info
summary: Get schema dump
parameters:
- name: database
in: query
description: Database name to dump.
required: true
schema:
type: string
example: "postgres"
description: Get schema dump in SQL format.
operationId: getDatabaseSchema
responses:
200:
description: Schema dump
content:
text/plain:
schema:
type: string
description: Schema dump in SQL format.
404:
description: Non existing database.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
/check_writability: /check_writability:
post: post:
tags: tags:
@@ -274,73 +229,6 @@ components:
num_cpus: num_cpus:
type: integer type: integer
DbsAndRoles:
type: object
description: Databases and Roles
required:
- roles
- databases
properties:
roles:
type: array
items:
$ref: "#/components/schemas/Role"
databases:
type: array
items:
$ref: "#/components/schemas/Database"
Database:
type: object
description: Database
required:
- name
- owner
- restrict_conn
- invalid
properties:
name:
type: string
owner:
type: string
options:
type: array
items:
$ref: "#/components/schemas/GenericOption"
restrict_conn:
type: boolean
invalid:
type: boolean
Role:
type: object
description: Role
required:
- name
properties:
name:
type: string
encrypted_password:
type: string
options:
type: array
items:
$ref: "#/components/schemas/GenericOption"
GenericOption:
type: object
description: Schema Generic option
required:
- name
- vartype
properties:
name:
type: string
value:
type: string
vartype:
type: string
ComputeState: ComputeState:
type: object type: object
required: required:

View File

@@ -8,12 +8,10 @@ pub mod configurator;
pub mod http; pub mod http;
#[macro_use] #[macro_use]
pub mod logger; pub mod logger;
pub mod catalog;
pub mod compute; pub mod compute;
pub mod extension_server; pub mod extension_server;
pub mod monitor; pub mod monitor;
pub mod params; pub mod params;
pub mod pg_helpers; pub mod pg_helpers;
pub mod spec; pub mod spec;
pub mod swap;
pub mod sync_sk; pub mod sync_sk;

View File

@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
format!("'{}'", res) format!("'{}'", res)
} }
pub trait GenericOptionExt { trait GenericOptionExt {
fn to_pg_option(&self) -> String; fn to_pg_option(&self) -> String;
fn to_pg_setting(&self) -> String; fn to_pg_setting(&self) -> String;
} }

View File

@@ -2,7 +2,7 @@ use std::fs::File;
use std::path::Path; use std::path::Path;
use std::str::FromStr; use std::str::FromStr;
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{anyhow, bail, Result};
use postgres::config::Config; use postgres::config::Config;
use postgres::{Client, NoTls}; use postgres::{Client, NoTls};
use reqwest::StatusCode; use reqwest::StatusCode;
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
RoleAction::Create => { RoleAction::Create => {
// This branch only runs when roles are created through the console, so it is // This branch only runs when roles are created through the console, so it is
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser. // from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
let mut query: String = format!( let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
name.pg_quote() name.pg_quote()
); );
info!("running role create query: '{}'", &query); info!("running role create query: '{}'", &query);
@@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
"rename_db" => { "rename_db" => {
let new_name = op.new_name.as_ref().unwrap(); let new_name = op.new_name.as_ref().unwrap();
if existing_dbs.contains_key(&op.name) { if existing_dbs.get(&op.name).is_some() {
let query: String = format!( let query: String = format!(
"ALTER DATABASE {} RENAME TO {}", "ALTER DATABASE {} RENAME TO {}",
op.name.pg_quote(), op.name.pg_quote(),
@@ -698,8 +698,7 @@ pub fn handle_grants(
// it is important to run this after all grants // it is important to run this after all grants
if enable_anon_extension { if enable_anon_extension {
handle_extension_anon(spec, &db.owner, &mut db_client, false) handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
.context("handle_grants handle_extension_anon")?;
} }
} }
@@ -744,24 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
// which may happen in two cases: // which may happen in two cases:
// - extension was just installed // - extension was just installed
// - extension was already installed and is up to date // - extension was already installed and is up to date
let query = "ALTER EXTENSION neon UPDATE"; // DISABLED due to compute node unpinning epic
info!("update neon extension version with query: {}", query); // let query = "ALTER EXTENSION neon UPDATE";
if let Err(e) = client.simple_query(query) { // info!("update neon extension version with query: {}", query);
error!( // client.simple_query(query)?;
"failed to upgrade neon extension during `handle_extension_neon`: {}",
e
);
}
Ok(()) Ok(())
} }
#[instrument(skip_all)] #[instrument(skip_all)]
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade"); info!("handle neon extension upgrade (not really)");
let query = "ALTER EXTENSION neon UPDATE"; // DISABLED due to compute node unpinning epic
info!("update neon extension version with query: {}", query); // let query = "ALTER EXTENSION neon UPDATE";
client.simple_query(query)?; // info!("update neon extension version with query: {}", query);
// client.simple_query(query)?;
Ok(()) Ok(())
} }
@@ -810,40 +806,43 @@ $$;"#,
"", "",
"", "",
"", "",
"",
// Add new migrations below. // Add new migrations below.
r#"
DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END
$$;"#,
]; ];
let mut func = || { let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; client.simple_query(query)?;
client.simple_query(query)?;
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?; client.simple_query(query)?;
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?; client.simple_query(query)?;
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?; client.simple_query(query)?;
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?; client.simple_query(query)?;
Ok::<_, anyhow::Error>(())
};
func().context("handle_migrations prepare")?;
let query = "SELECT id FROM neon_migration.migration_id"; query = "SELECT id FROM neon_migration.migration_id";
let row = client let row = client.query_one(query, &[])?;
.query_one(query, &[])
.context("handle_migrations get migration_id")?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize; let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration; let starting_migration_id = current_migration;
let query = "BEGIN"; query = "BEGIN";
client client.simple_query(query)?;
.simple_query(query)
.context("handle_migrations begin")?;
while current_migration < migrations.len() { while current_migration < migrations.len() {
let migration = &migrations[current_migration]; let migration = &migrations[current_migration];
@@ -851,9 +850,7 @@ $$;"#,
info!("Skip migration id={}", current_migration); info!("Skip migration id={}", current_migration);
} else { } else {
info!("Running migration:\n{}\n", migration); info!("Running migration:\n{}\n", migration);
client.simple_query(migration).with_context(|| { client.simple_query(migration)?;
format!("handle_migrations current_migration={}", current_migration)
})?;
} }
current_migration += 1; current_migration += 1;
} }
@@ -861,14 +858,10 @@ $$;"#,
"UPDATE neon_migration.migration_id SET id={}", "UPDATE neon_migration.migration_id SET id={}",
migrations.len() migrations.len()
); );
client client.simple_query(&setval)?;
.simple_query(&setval)
.context("handle_migrations update id")?;
let query = "COMMIT"; query = "COMMIT";
client client.simple_query(query)?;
.simple_query(query)
.context("handle_migrations commit")?;
info!( info!(
"Ran {} migrations", "Ran {} migrations",

View File

@@ -1,36 +0,0 @@
use anyhow::{anyhow, Context};
use tracing::warn;
pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap";
pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
// run `/neonvm/bin/resize-swap --once {size_bytes}`
//
// Passing '--once' causes resize-swap to delete itself after successful completion, which
// means that if compute_ctl restarts later, we won't end up calling 'swapoff' while
// postgres is running.
//
// NOTE: resize-swap is not very clever. If present, --once MUST be the first arg.
let child_result = std::process::Command::new("/usr/bin/sudo")
.arg(RESIZE_SWAP_BIN)
.arg("--once")
.arg(size_bytes.to_string())
.spawn();
if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
return Ok(());
}
child_result
.context("spawn() failed")
.and_then(|mut child| child.wait().context("wait() failed"))
.and_then(|status| match status.success() {
true => Ok(()),
false => Err(anyhow!("process exited with {status}")),
})
// wrap any prior error with the overall context that we couldn't run the command
.with_context(|| {
format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`")
})
}

View File

@@ -17,7 +17,6 @@ nix.workspace = true
once_cell.workspace = true once_cell.workspace = true
postgres.workspace = true postgres.workspace = true
hex.workspace = true hex.workspace = true
humantime-serde.workspace = true
hyper.workspace = true hyper.workspace = true
regex.workspace = true regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] } reqwest = { workspace = true, features = ["blocking", "json"] }
@@ -28,7 +27,6 @@ serde_with.workspace = true
tar.workspace = true tar.workspace = true
thiserror.workspace = true thiserror.workspace = true
toml.workspace = true toml.workspace = true
toml_edit.workspace = true
tokio.workspace = true tokio.workspace = true
tokio-postgres.workspace = true tokio-postgres.workspace = true
tokio-util.workspace = true tokio-util.workspace = true

View File

@@ -1,5 +1,5 @@
[package] [package]
name = "storage_controller" name = "attachment_service"
version = "0.1.0" version = "0.1.0"
edition.workspace = true edition.workspace = true
license.workspace = true license.workspace = true
@@ -25,13 +25,12 @@ git-version.workspace = true
hex.workspace = true hex.workspace = true
hyper.workspace = true hyper.workspace = true
humantime.workspace = true humantime.workspace = true
itertools.workspace = true
lasso.workspace = true lasso.workspace = true
once_cell.workspace = true once_cell.workspace = true
pageserver_api.workspace = true pageserver_api.workspace = true
pageserver_client.workspace = true pageserver_client.workspace = true
postgres_connection.workspace = true postgres_connection.workspace = true
reqwest = { workspace = true, features = ["stream"] } reqwest.workspace = true
routerify.workspace = true routerify.workspace = true
serde.workspace = true serde.workspace = true
serde_json.workspace = true serde_json.workspace = true
@@ -40,15 +39,13 @@ tokio.workspace = true
tokio-util.workspace = true tokio-util.workspace = true
tracing.workspace = true tracing.workspace = true
measured.workspace = true measured.workspace = true
strum.workspace = true
strum_macros.workspace = true
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" } diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" } r2d2 = { version = "0.8.10" }
utils = { path = "../libs/utils/" } utils = { path = "../../libs/utils/" }
metrics = { path = "../libs/metrics/" } metrics = { path = "../../libs/metrics/" }
control_plane = { path = "../control_plane" } control_plane = { path = ".." }
workspace_hack = { version = "0.1", path = "../workspace_hack" } workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -0,0 +1,462 @@
use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
use control_plane::local_env::LocalEnv;
use hyper::{Method, StatusCode};
use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use utils::{
backoff::{self},
id::{NodeId, TenantId},
};
use crate::service::Config;
// How long to pause before retrying after the control plane answers 423 Locked
// (see the StatusCode::LOCKED arm in do_notify_iteration).
const BUSY_DELAY: Duration = Duration::from_secs(1);
// How long to pause after an explicit 429 Too Many Requests before retrying
// (see the StatusCode::TOO_MANY_REQUESTS arm in do_notify_iteration).
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);

// Concurrency limit exported for users of this module; not referenced within
// this file — TODO(review): confirm intended usage at the call sites.
pub(crate) const API_CONCURRENCY: usize = 32;
/// Per-tenant state for a sharded tenant: which pageserver node currently hosts
/// each shard, plus the shard layout parameters used to detect splits.
struct ShardedComputeHookTenant {
    // Stripe size all shards were registered with; a mismatch in `update` resets the state.
    stripe_size: ShardStripeSize,
    // Shard count all shards were registered with; a mismatch in `update` resets the state.
    shard_count: ShardCount,
    // (shard number, pageserver node) pairs, kept sorted by shard number (see `update`).
    shards: Vec<(ShardNumber, NodeId)>,
}
/// Aggregated placement state for one tenant, as reported shard-by-shard via
/// `ComputeHook::notify`.
enum ComputeHookTenant {
    // Tenant with a single shard: only the hosting pageserver needs tracking.
    Unsharded(NodeId),
    // Tenant with more than one shard: full per-shard bookkeeping.
    Sharded(ShardedComputeHookTenant),
}
impl ComputeHookTenant {
    /// Construct with at least one shard's information.
    fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self {
        match tenant_shard_id.shard_count.count() {
            // Shard counts of zero or one are both treated as unsharded.
            0 | 1 => Self::Unsharded(node_id),
            _ => Self::Sharded(ShardedComputeHookTenant {
                shards: vec![(tenant_shard_id.shard_number, node_id)],
                stripe_size,
                shard_count: tenant_shard_id.shard_count,
            }),
        }
    }

    /// Set one shard's location. If stripe size or shard count have changed, Self is reset
    /// and drops existing content.
    fn update(
        &mut self,
        tenant_shard_id: TenantShardId,
        stripe_size: ShardStripeSize,
        node_id: NodeId,
    ) {
        match self {
            Self::Unsharded(slot) if tenant_shard_id.shard_count.count() == 1 => {
                // Still unsharded: just record the (possibly new) node.
                *slot = node_id;
            }
            Self::Sharded(sharded)
                if sharded.stripe_size == stripe_size
                    && sharded.shard_count == tenant_shard_id.shard_count =>
            {
                // Same layout: update the existing entry for this shard, or insert it
                // and restore the sort-by-shard-number invariant.
                match sharded
                    .shards
                    .iter_mut()
                    .find(|entry| entry.0 == tenant_shard_id.shard_number)
                {
                    Some(entry) => entry.1 = node_id,
                    None => {
                        sharded
                            .shards
                            .push((tenant_shard_id.shard_number, node_id));
                        sharded.shards.sort_by_key(|entry| entry.0)
                    }
                }
            }
            _ => {
                // Shard count changed: reset struct.
                *self = Self::new(tenant_shard_id, stripe_size, node_id);
            }
        }
    }
}
/// One shard's placement within a [`ComputeHookNotifyRequest`]: which pageserver
/// node serves which shard number.
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequestShard {
    node_id: NodeId,
    shard_number: ShardNumber,
}
/// Request body that we send to the control plane to notify it of where a tenant is attached
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequest {
    tenant_id: TenantId,
    // None for unsharded tenants (see `ComputeHookTenant::maybe_reconfigure`).
    stripe_size: Option<ShardStripeSize>,
    // One entry per shard; a single entry with shard number 0 for unsharded tenants.
    shards: Vec<ComputeHookNotifyRequestShard>,
}
/// Error type for attempts to call into the control plane compute notification hook
#[derive(thiserror::Error, Debug)]
pub(crate) enum NotifyError {
    // Request was not sent successfully, e.g. transport error
    #[error("Sending request: {0}")]
    Request(#[from] reqwest::Error),
    // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon.
    // Produced from a 423 Locked response (see do_notify_iteration).
    #[error("Control plane tenant busy")]
    Busy,
    // Explicit 429 response asking us to retry less frequently
    #[error("Control plane overloaded")]
    SlowDown,
    // A 502/503/504 response indicates the control plane can't handle the request right now
    #[error("Control plane unavailable (status {0})")]
    Unavailable(StatusCode),
    // API returned unexpected non-success status. We will retry, but log a warning.
    #[error("Control plane returned unexpected status {0}")]
    Unexpected(StatusCode),
    // We shutdown while sending
    #[error("Shutting down")]
    ShuttingDown,
    // A response indicates we will never succeed, such as 400 or 404
    #[error("Non-retryable error {0}")]
    Fatal(StatusCode),
}
impl ComputeHookTenant {
    /// Build the notification payload for this tenant, if we know enough to send one.
    ///
    /// Returns None while a sharded tenant still lacks a pageserver for one or more
    /// of its shards; callers should wait for further `update` calls.
    fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
        match self {
            Self::Unsharded(node_id) => {
                // An unsharded tenant is always complete: emit a single shard-zero entry.
                Some(ComputeHookNotifyRequest {
                    tenant_id,
                    shards: vec![ComputeHookNotifyRequestShard {
                        shard_number: ShardNumber(0),
                        node_id: *node_id,
                    }],
                    stripe_size: None,
                })
            }
            Self::Sharded(sharded) => {
                if sharded.shards.len() != sharded.shard_count.count() as usize {
                    // Sharded tenant doesn't yet have information for all its shards
                    tracing::info!(
                        "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
                        sharded.shards.len(),
                        sharded.shard_count.count()
                    );
                    return None;
                }

                let shards = sharded
                    .shards
                    .iter()
                    .map(|&(shard_number, node_id)| ComputeHookNotifyRequestShard {
                        shard_number,
                        node_id,
                    })
                    .collect();
                Some(ComputeHookNotifyRequest {
                    tenant_id,
                    shards,
                    stripe_size: Some(sharded.stripe_size),
                })
            }
        }
    }
}
/// The compute hook is a destination for notifications about changes to tenant:pageserver
/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures
/// the compute connection string.
pub(super) struct ComputeHook {
    config: Config,
    // Aggregated per-tenant shard placement, keyed by tenant. Guarded by an async
    // mutex because it is held across awaits in `notify`.
    state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
    // Pre-rendered "Bearer <jwt>" header value, built once in `new` from
    // config.control_plane_jwt_token.
    authorization_header: Option<String>,
}
impl ComputeHook {
    /// Construct a hook that notifies `config.compute_hook_url` when set, or falls
    /// back to reconfiguring local neon_local endpoints (test environments) otherwise.
    pub(super) fn new(config: Config) -> Self {
        // Render the Authorization header once instead of per-request.
        let authorization_header = config
            .control_plane_jwt_token
            .clone()
            .map(|jwt| format!("Bearer {}", jwt));

        Self {
            state: Default::default(),
            config,
            authorization_header,
        }
    }

    /// For test environments: use neon_local's LocalEnv to update compute
    async fn do_notify_local(
        &self,
        reconfigure_request: ComputeHookNotifyRequest,
    ) -> anyhow::Result<()> {
        let env = match LocalEnv::load_config() {
            Ok(e) => e,
            Err(e) => {
                // Test-only path: degrade gracefully if there is no local env on disk.
                tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})");
                return Ok(());
            }
        };
        let cplane =
            ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
        let ComputeHookNotifyRequest {
            tenant_id,
            shards,
            stripe_size,
        } = reconfigure_request;

        // Resolve each shard's pageserver node to a (host, port) pair for the compute.
        let compute_pageservers = shards
            .into_iter()
            .map(|shard| {
                let ps_conf = env
                    .get_pageserver_conf(shard.node_id)
                    .expect("Unknown pageserver");
                let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
                    .expect("Unable to parse listen_pg_addr");
                (pg_host, pg_port.unwrap_or(5432))
            })
            .collect::<Vec<_>>();

        // Reconfigure every running endpoint belonging to this tenant.
        for (endpoint_name, endpoint) in &cplane.endpoints {
            if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
                tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
                endpoint
                    .reconfigure(compute_pageservers.clone(), stripe_size)
                    .await?;
            }
        }

        Ok(())
    }

    /// One delivery attempt: PUT the request to `url` and map the HTTP status onto
    /// a [`NotifyError`] variant so the retry loop in `do_notify` can classify it.
    /// On 429/423 we sleep (cancellably) before returning, to pace the retry.
    async fn do_notify_iteration(
        &self,
        client: &reqwest::Client,
        url: &str,
        reconfigure_request: &ComputeHookNotifyRequest,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
        let req = client.request(Method::PUT, url);
        let req = if let Some(value) = &self.authorization_header {
            req.header(reqwest::header::AUTHORIZATION, value)
        } else {
            req
        };

        tracing::info!(
            "Sending notify request to {} ({:?})",
            url,
            reconfigure_request
        );
        // Transport errors convert into NotifyError::Request via #[from].
        let response = req.json(&reconfigure_request).send().await?;

        // Treat all 2xx responses as success
        if response.status().is_success() {
            if response.status() != StatusCode::OK {
                // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so
                // log a warning.
                tracing::warn!(
                    "Unexpected 2xx response code {} from control plane",
                    response.status()
                );
            }

            return Ok(());
        }

        // Error response codes
        match response.status() {
            StatusCode::TOO_MANY_REQUESTS => {
                // TODO: 429 handling should be global: set some state visible to other requests
                // so that they will delay before starting, rather than all notifications trying
                // once before backing off.
                tokio::time::timeout(SLOWDOWN_DELAY, cancel.cancelled())
                    .await
                    .ok();
                Err(NotifyError::SlowDown)
            }
            StatusCode::LOCKED => {
                // Delay our retry if busy: the usual fast exponential backoff in backoff::retry
                // is not appropriate
                tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
                    .await
                    .ok();
                Err(NotifyError::Busy)
            }
            StatusCode::SERVICE_UNAVAILABLE
            | StatusCode::GATEWAY_TIMEOUT
            | StatusCode::BAD_GATEWAY => Err(NotifyError::Unavailable(response.status())),
            StatusCode::BAD_REQUEST | StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => {
                Err(NotifyError::Fatal(response.status()))
            }
            _ => Err(NotifyError::Unexpected(response.status())),
        }
    }

    /// Deliver the notification with a bounded number of retries; Fatal and
    /// Unexpected statuses are treated as permanent and not retried.
    async fn do_notify(
        &self,
        url: &str,
        reconfigure_request: ComputeHookNotifyRequest,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
        let client = reqwest::Client::new();
        backoff::retry(
            || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
            |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
            3,
            10,
            "Send compute notification",
            cancel,
        )
        .await
        // retry() yields None only when cancelled: surface that as ShuttingDown.
        .ok_or_else(|| NotifyError::ShuttingDown)
        .and_then(|x| x)
    }

    /// Call this to notify the compute (postgres) tier of new pageservers to use
    /// for a tenant. notify() is called by each shard individually, and this function
    /// will decide whether an update to the tenant is sent. An update is sent on the
    /// condition that:
    /// - We know a pageserver for every shard.
    /// - All the shards have the same shard_count (i.e. we are not mid-split)
    ///
    /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler
    /// that is cancelled.
    ///
    /// This function is fallible, including in the case that the control plane is transiently
    /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability
    /// periods, but we don't retry forever. The **caller** is responsible for handling failures and
    /// ensuring that they eventually call again to ensure that the compute is eventually notified of
    /// the proper pageserver nodes for a tenant.
    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
    pub(super) async fn notify(
        &self,
        tenant_shard_id: TenantShardId,
        node_id: NodeId,
        stripe_size: ShardStripeSize,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
        // NOTE(review): `locked` covers the whole tenant map and stays held across
        // the outbound notification below, serializing concurrent notify() calls.
        let mut locked = self.state.lock().await;

        use std::collections::hash_map::Entry;
        // Fold this shard's placement into the tenant's aggregated state.
        let tenant = match locked.entry(tenant_shard_id.tenant_id) {
            Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
                tenant_shard_id,
                stripe_size,
                node_id,
            )),
            Entry::Occupied(e) => {
                let tenant = e.into_mut();
                tenant.update(tenant_shard_id, stripe_size, node_id);
                tenant
            }
        };

        let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
        let Some(reconfigure_request) = reconfigure_request else {
            // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
            // until it does.
            tracing::info!("Tenant isn't yet ready to emit a notification");
            return Ok(());
        };

        if let Some(notify_url) = &self.config.compute_hook_url {
            self.do_notify(notify_url, reconfigure_request, cancel)
                .await
        } else {
            self.do_notify_local(reconfigure_request)
                .await
                .map_err(|e| {
                    // This path is for testing only, so munge the error into our prod-style error type.
                    tracing::error!("Local notification hook failed: {e}");
                    NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
                })
        }
    }
}
#[cfg(test)]
pub(crate) mod tests {
    use pageserver_api::shard::{ShardCount, ShardNumber};
    use utils::id::TenantId;

    use super::*;

    /// Exercise `ComputeHookTenant`'s readiness logic as shards are recorded:
    /// unsharded tenants notify immediately; a tenant mid-split withholds
    /// notifications until every shard of the new count has been seen.
    #[test]
    fn tenant_updates() -> anyhow::Result<()> {
        let tenant_id = TenantId::generate();

        // Shorthand for building a shard id of this tenant with (number, count).
        let shard_id = |number: u8, count: u8| TenantShardId {
            tenant_id,
            shard_count: ShardCount::new(count),
            shard_number: ShardNumber(number),
        };

        let mut tenant_state =
            ComputeHookTenant::new(shard_id(0, 0), ShardStripeSize(12345), NodeId(1));

        // An unsharded tenant is always ready to emit a notification
        let request = tenant_state
            .maybe_reconfigure(tenant_id)
            .expect("unsharded tenant should be ready to notify");
        assert_eq!(request.shards.len(), 1);
        assert!(request.stripe_size.is_none());

        // Writing the first shard of a multi-sharded situation (i.e. in a split)
        // resets the tenant state and puts it in an non-notifying state (need to
        // see all shards)
        tenant_state.update(shard_id(1, 2), ShardStripeSize(32768), NodeId(1));
        assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());

        // Writing the second shard makes it ready to notify
        tenant_state.update(shard_id(0, 2), ShardStripeSize(32768), NodeId(1));
        let request = tenant_state
            .maybe_reconfigure(tenant_id)
            .expect("fully-populated sharded tenant should be ready to notify");
        assert_eq!(request.shards.len(), 2);
        assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));

        Ok(())
    }
}

View File

@@ -184,19 +184,6 @@ impl HeartbeaterTask {
} }
} }
} }
tracing::info!(
"Heartbeat round complete for {} nodes, {} offline",
new_state.len(),
new_state
.values()
.filter(|s| match s {
PageserverState::Available { .. } => {
false
}
PageserverState::Offline => true,
})
.count()
);
let mut deltas = Vec::new(); let mut deltas = Vec::new();
let now = Instant::now(); let now = Instant::now();

View File

@@ -4,12 +4,10 @@ use crate::metrics::{
}; };
use crate::reconciler::ReconcileError; use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use anyhow::Context;
use futures::Future; use futures::Future;
use hyper::header::CONTENT_TYPE; use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response}; use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri}; use hyper::{StatusCode, Uri};
use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::models::{ use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest, TenantTimeTravelRequest, TimelineCreateRequest,
@@ -36,8 +34,7 @@ use utils::{
}; };
use pageserver_api::controller_api::{ use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
TenantShardMigrateRequest,
}; };
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
@@ -46,19 +43,15 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
use routerify::Middleware; use routerify::Middleware;
/// State available to HTTP request handlers /// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState { pub struct HttpState {
service: Arc<crate::service::Service>, service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<SwappableJwtAuth>>,
neon_metrics: NeonMetrics,
allowlist_routes: Vec<Uri>, allowlist_routes: Vec<Uri>,
} }
impl HttpState { impl HttpState {
pub fn new( pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> Self {
let allowlist_routes = ["/status", "/ready", "/metrics"] let allowlist_routes = ["/status", "/ready", "/metrics"]
.iter() .iter()
.map(|v| v.parse().unwrap()) .map(|v| v.parse().unwrap())
@@ -66,7 +59,6 @@ impl HttpState {
Self { Self {
service, service,
auth, auth,
neon_metrics: NeonMetrics::new(build_info),
allowlist_routes, allowlist_routes,
} }
} }
@@ -259,12 +251,6 @@ async fn handle_tenant_time_travel_remote_storage(
json_response(StatusCode::OK, ()) json_response(StatusCode::OK, ())
} }
fn map_reqwest_hyper_status(status: reqwest::StatusCode) -> Result<hyper::StatusCode, ApiError> {
hyper::StatusCode::from_u16(status.as_u16())
.context("invalid status code")
.map_err(ApiError::InternalServerError)
}
async fn handle_tenant_secondary_download( async fn handle_tenant_secondary_download(
service: Arc<Service>, service: Arc<Service>,
req: Request<Body>, req: Request<Body>,
@@ -273,7 +259,7 @@ async fn handle_tenant_secondary_download(
let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
json_response(map_reqwest_hyper_status(status)?, progress) json_response(status, progress)
} }
async fn handle_tenant_delete( async fn handle_tenant_delete(
@@ -284,10 +270,7 @@ async fn handle_tenant_delete(
check_permissions(&req, Scope::PageServerApi)?; check_permissions(&req, Scope::PageServerApi)?;
deletion_wrapper(service, move |service| async move { deletion_wrapper(service, move |service| async move {
service service.tenant_delete(tenant_id).await
.tenant_delete(tenant_id)
.await
.and_then(map_reqwest_hyper_status)
}) })
.await .await
} }
@@ -318,10 +301,7 @@ async fn handle_tenant_timeline_delete(
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
deletion_wrapper(service, move |service| async move { deletion_wrapper(service, move |service| async move {
service service.tenant_timeline_delete(tenant_id, timeline_id).await
.tenant_timeline_delete(tenant_id, timeline_id)
.await
.and_then(map_reqwest_hyper_status)
}) })
.await .await
} }
@@ -384,9 +364,11 @@ async fn handle_tenant_timeline_passthrough(
} }
// We have a reqest::Response, would like a http::Response // We have a reqest::Response, would like a http::Response
let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?); let mut builder = hyper::Response::builder()
.status(resp.status())
.version(resp.version());
for (k, v) in resp.headers() { for (k, v) in resp.headers() {
builder = builder.header(k.as_str(), v.as_bytes()); builder = builder.header(k, v);
} }
let response = builder let response = builder
@@ -416,15 +398,6 @@ async fn handle_tenant_describe(
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
} }
async fn handle_tenant_list(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
json_response(StatusCode::OK, service.tenant_list())
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?; check_permissions(&req, Scope::Admin)?;
@@ -438,10 +411,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
check_permissions(&req, Scope::Admin)?; check_permissions(&req, Scope::Admin)?;
let state = get_state(&req); let state = get_state(&req);
let nodes = state.service.node_list().await?; json_response(StatusCode::OK, state.service.node_list().await?)
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
json_response(StatusCode::OK, api_nodes)
} }
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -508,22 +478,6 @@ async fn handle_tenant_shard_migrate(
) )
} }
async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.tenant_update_policy(tenant_id, update_req)
.await?,
)
}
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?; check_permissions(&req, Scope::PageServerApi)?;
@@ -533,18 +487,6 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
} }
async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state.service.tenant_import(tenant_id).await?,
)
}
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?; check_permissions(&req, Scope::Admin)?;
@@ -567,14 +509,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, state.service.consistency_check().await?) json_response(StatusCode::OK, state.service.consistency_check().await?)
} }
async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
}
/// Status endpoint is just used for checking that our HTTP listener is up /// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, ()) json_response(StatusCode::OK, ())
@@ -631,17 +565,9 @@ where
.await .await
} }
/// Check if the required scope is held in the request's token, or if the request has
/// a token with 'admin' scope then always permit it.
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> { fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
check_permission_with(request, |claims| { check_permission_with(request, |claims| {
match crate::auth::check_permission(claims, required_scope) { crate::auth::check_permission(claims, required_scope)
Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
Ok(()) => Ok(()),
Err(_) => Err(e),
},
Ok(()) => Ok(()),
}
}) })
} }
@@ -701,11 +627,10 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
}) })
} }
pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> { pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
let state = get_state(&req); let payload = crate::metrics::METRICS_REGISTRY.encode();
let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
let response = Response::builder() let response = Response::builder()
.status(200) .status(200)
.header(CONTENT_TYPE, TEXT_FORMAT) .header(CONTENT_TYPE, TEXT_FORMAT)
@@ -734,7 +659,6 @@ where
pub fn make_router( pub fn make_router(
service: Arc<Service>, service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> RouterBuilder<hyper::Body, ApiError> { ) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router() let mut router = endpoint::make_router()
.middleware(prologue_metrics_middleware()) .middleware(prologue_metrics_middleware())
@@ -751,7 +675,7 @@ pub fn make_router(
} }
router router
.data(Arc::new(HttpState::new(service, auth, build_info))) .data(Arc::new(HttpState::new(service, auth)))
.get("/metrics", |r| { .get("/metrics", |r| {
named_request_span(r, measured_metrics_handler, RequestName("metrics")) named_request_span(r, measured_metrics_handler, RequestName("metrics"))
}) })
@@ -782,13 +706,6 @@ pub fn make_router(
.post("/debug/v1/node/:node_id/drop", |r| { .post("/debug/v1/node/:node_id/drop", |r| {
named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
}) })
.post("/debug/v1/tenant/:tenant_id/import", |r| {
named_request_span(
r,
handle_tenant_import,
RequestName("debug_v1_tenant_import"),
)
})
.get("/debug/v1/tenant", |r| { .get("/debug/v1/tenant", |r| {
named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant")) named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
}) })
@@ -809,9 +726,6 @@ pub fn make_router(
RequestName("debug_v1_consistency_check"), RequestName("debug_v1_consistency_check"),
) )
}) })
.post("/debug/v1/reconcile_all", |r| {
request_span(r, handle_reconcile_all)
})
.put("/debug/v1/failpoints", |r| { .put("/debug/v1/failpoints", |r| {
request_span(r, |r| failpoints_handler(r, CancellationToken::new())) request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
}) })
@@ -851,16 +765,6 @@ pub fn make_router(
RequestName("control_v1_tenant_describe"), RequestName("control_v1_tenant_describe"),
) )
}) })
.get("/control/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
})
.put("/control/v1/tenant/:tenant_id/policy", |r| {
named_request_span(
r,
handle_tenant_update_policy,
RequestName("control_v1_tenant_policy"),
)
})
// Tenant operations // Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
@@ -912,7 +816,7 @@ pub fn make_router(
RequestName("v1_tenant_timeline"), RequestName("v1_tenant_timeline"),
) )
}) })
// Tenant detail GET passthrough to shard zero: // Tenant detail GET passthrough to shard zero
.get("/v1/tenant/:tenant_id", |r| { .get("/v1/tenant/:tenant_id", |r| {
tenant_service_handler( tenant_service_handler(
r, r,
@@ -920,14 +824,13 @@ pub fn make_router(
RequestName("v1_tenant_passthrough"), RequestName("v1_tenant_passthrough"),
) )
}) })
// The `*` in the URL is a wildcard: any tenant/timeline GET APIs on the pageserver // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
// are implicitly exposed here. This must be last in the list to avoid // timeline GET APIs will be implicitly included.
// taking precedence over other GET methods we might implement by hand. .get("/v1/tenant/:tenant_id/timeline*", |r| {
.get("/v1/tenant/:tenant_id/*", |r| {
tenant_service_handler( tenant_service_handler(
r, r,
handle_tenant_timeline_passthrough, handle_tenant_timeline_passthrough,
RequestName("v1_tenant_passthrough"), RequestName("v1_tenant_timeline_passthrough"),
) )
}) })
} }

View File

@@ -0,0 +1,54 @@
use std::{collections::HashMap, sync::Arc};
/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
/// is needed at a tenant-wide granularity.
///
/// Entries are created lazily on first acquisition and reaped by periodic calls to
/// [`Self::housekeeping`]; there is no per-guard cleanup.
pub(crate) struct IdLockMap<T>
where
    T: Eq + PartialEq + std::hash::Hash,
{
    /// A synchronous lock for getting/setting the async locks that our callers will wait on.
    entities: std::sync::Mutex<std::collections::HashMap<T, Arc<tokio::sync::RwLock<()>>>>,
}
impl<T> IdLockMap<T>
where
T: Eq + PartialEq + std::hash::Hash,
{
pub(crate) fn shared(
&self,
key: T,
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<()>> {
let mut locked = self.entities.lock().unwrap();
let entry = locked.entry(key).or_default();
entry.clone().read_owned()
}
pub(crate) fn exclusive(
&self,
key: T,
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
let mut locked = self.entities.lock().unwrap();
let entry = locked.entry(key).or_default();
entry.clone().write_owned()
}
/// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do
/// periodic housekeeping to avoid the map growing indefinitely
pub(crate) fn housekeeping(&self) {
let mut locked = self.entities.lock().unwrap();
locked.retain(|_k, lock| lock.try_write().is_err())
}
}
impl<T> Default for IdLockMap<T>
where
T: Eq + PartialEq + std::hash::Hash,
{
fn default() -> Self {
Self {
entities: std::sync::Mutex::new(HashMap::new()),
}
}
}

View File

@@ -14,7 +14,7 @@ mod reconciler;
mod scheduler; mod scheduler;
mod schema; mod schema;
pub mod service; pub mod service;
mod tenant_shard; mod tenant_state;
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
struct Sequence(u64); struct Sequence(u64);

View File

@@ -1,22 +1,18 @@
use anyhow::{anyhow, Context}; use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use camino::Utf8PathBuf; use camino::Utf8PathBuf;
use clap::Parser; use clap::Parser;
use diesel::Connection; use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp; use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::sync::Arc; use std::sync::Arc;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::service::{
Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
};
use tokio::signal::unix::SignalKind; use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat}; use utils::logging::{self, LogFormat};
use utils::sentry_init::init_sentry;
use utils::{project_build_tag, project_git_version, tcp_listener}; use utils::{project_build_tag, project_git_version, tcp_listener};
project_git_version!(GIT_VERSION); project_git_version!(GIT_VERSION);
@@ -54,7 +50,7 @@ struct Cli {
#[arg(short, long)] #[arg(short, long)]
path: Option<Utf8PathBuf>, path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
#[arg(long)] #[arg(long)]
database_url: Option<String>, database_url: Option<String>,
@@ -65,18 +61,6 @@ struct Cli {
/// Grace period before marking unresponsive pageserver offline /// Grace period before marking unresponsive pageserver offline
#[arg(long)] #[arg(long)]
max_unavailable_interval: Option<humantime::Duration>, max_unavailable_interval: Option<humantime::Duration>,
/// Size threshold for automatically splitting shards (disabled by default)
#[arg(long)]
split_threshold: Option<u64>,
/// Maximum number of reconcilers that may run in parallel
#[arg(long)]
reconciler_concurrency: Option<usize>,
/// How long to wait for the initial database connection to be available.
#[arg(long, default_value = "5s")]
db_connect_timeout: humantime::Duration,
} }
enum StrictMode { enum StrictMode {
@@ -174,8 +158,6 @@ fn main() -> anyhow::Result<()> {
std::process::exit(1); std::process::exit(1);
})); }));
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
tokio::runtime::Builder::new_current_thread() tokio::runtime::Builder::new_current_thread()
// We use spawn_blocking for database operations, so require approximately // We use spawn_blocking for database operations, so require approximately
// as many blocking threads as we will open database connections. // as many blocking threads as we will open database connections.
@@ -207,11 +189,6 @@ async fn async_main() -> anyhow::Result<()> {
args.listen args.listen
); );
let build_info = BuildInfo {
revision: GIT_VERSION,
build_tag: BUILD_TAG,
};
let strict_mode = if args.dev { let strict_mode = if args.dev {
StrictMode::Dev StrictMode::Dev
} else { } else {
@@ -256,15 +233,9 @@ async fn async_main() -> anyhow::Result<()> {
.max_unavailable_interval .max_unavailable_interval
.map(humantime::Duration::into) .map(humantime::Duration::into)
.unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
reconciler_concurrency: args
.reconciler_concurrency
.unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
split_threshold: args.split_threshold,
}; };
// After loading secrets & config, but before starting anything else, apply database migrations // After loading secrets & config, but before starting anything else, apply database migrations
Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;
migration_run(&secrets.database_url) migration_run(&secrets.database_url)
.await .await
.context("Running database migrations")?; .context("Running database migrations")?;
@@ -279,7 +250,7 @@ async fn async_main() -> anyhow::Result<()> {
let auth = secrets let auth = secrets
.public_key .public_key
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
let router = make_router(service.clone(), auth, build_info) let router = make_router(service.clone(), auth)
.build() .build()
.map_err(|err| anyhow!(err))?; .map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap(); let router_service = utils::http::RouterService::new(router).unwrap();

View File

@@ -8,8 +8,10 @@
//! The rest of the code defines label group types and deals with converting outer types to labels. //! The rest of the code defines label group types and deals with converting outer types to labels.
//! //!
use bytes::Bytes; use bytes::Bytes;
use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; use measured::{
use metrics::NeonMetrics; label::{LabelValue, StaticLabelSet},
FixedCardinalityLabel, MetricGroup,
};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use std::sync::Mutex; use std::sync::Mutex;
@@ -24,28 +26,21 @@ pub fn preinitialize_metrics() {
pub(crate) struct StorageControllerMetrics { pub(crate) struct StorageControllerMetrics {
pub(crate) metrics_group: StorageControllerMetricGroup, pub(crate) metrics_group: StorageControllerMetricGroup,
encoder: Mutex<measured::text::BufferedTextEncoder>, encoder: Mutex<measured::text::TextEncoder>,
} }
#[derive(measured::MetricGroup)] #[derive(measured::MetricGroup)]
#[metric(new())]
pub(crate) struct StorageControllerMetricGroup { pub(crate) struct StorageControllerMetricGroup {
/// Count of how many times we spawn a reconcile task /// Count of how many times we spawn a reconcile task
pub(crate) storage_controller_reconcile_spawn: measured::Counter, pub(crate) storage_controller_reconcile_spawn: measured::Counter,
/// Reconciler tasks completed, broken down by success/failure/cancelled /// Reconciler tasks completed, broken down by success/failure/cancelled
pub(crate) storage_controller_reconcile_complete: pub(crate) storage_controller_reconcile_complete:
measured::CounterVec<ReconcileCompleteLabelGroupSet>, measured::CounterVec<ReconcileCompleteLabelGroupSet>,
/// Count of how many times we make an optimization change to a tenant's scheduling
pub(crate) storage_controller_schedule_optimization: measured::Counter,
/// HTTP request status counters for handled requests /// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status: pub(crate) storage_controller_http_request_status:
measured::CounterVec<HttpRequestStatusLabelGroupSet>, measured::CounterVec<HttpRequestStatusLabelGroupSet>,
/// HTTP request handler latency across all status codes /// HTTP request handler latency across all status codes
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_http_request_latency: pub(crate) storage_controller_http_request_latency:
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>, measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
@@ -57,7 +52,6 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of HTTP requests to the pageserver, broken down by pageserver /// Latency of HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful /// node id, request name and method. This include both successful and unsuccessful
/// requests. /// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_pageserver_request_latency: pub(crate) storage_controller_pageserver_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>, measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -69,7 +63,6 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful /// node id, request name and method. This include both successful and unsuccessful
/// requests. /// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_passthrough_request_latency: pub(crate) storage_controller_passthrough_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>, measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -78,34 +71,75 @@ pub(crate) struct StorageControllerMetricGroup {
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>, measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
/// Latency of database queries, broken down by operation. /// Latency of database queries, broken down by operation.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_database_query_latency: pub(crate) storage_controller_database_query_latency:
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>, measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
} }
impl StorageControllerMetrics { impl StorageControllerMetrics {
pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes { pub(crate) fn encode(&self) -> Bytes {
let mut encoder = self.encoder.lock().unwrap(); let mut encoder = self.encoder.lock().unwrap();
neon_metrics self.metrics_group.collect_into(&mut *encoder);
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
self.metrics_group
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
encoder.finish() encoder.finish()
} }
} }
impl Default for StorageControllerMetrics { impl Default for StorageControllerMetrics {
fn default() -> Self { fn default() -> Self {
let mut metrics_group = StorageControllerMetricGroup::new();
metrics_group
.storage_controller_reconcile_complete
.init_all_dense();
Self { Self {
metrics_group, metrics_group: StorageControllerMetricGroup::new(),
encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), encoder: Mutex::new(measured::text::TextEncoder::new()),
}
}
}
impl StorageControllerMetricGroup {
pub(crate) fn new() -> Self {
Self {
storage_controller_reconcile_spawn: measured::Counter::new(),
storage_controller_reconcile_complete: measured::CounterVec::new(
ReconcileCompleteLabelGroupSet {
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_status: measured::CounterVec::new(
HttpRequestStatusLabelGroupSet {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_pageserver_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_passthrough_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_database_query_error: measured::CounterVec::new(
DatabaseQueryErrorLabelGroupSet {
operation: StaticLabelSet::new(),
error_type: StaticLabelSet::new(),
},
),
storage_controller_database_query_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
} }
} }
} }
@@ -119,7 +153,7 @@ pub(crate) struct ReconcileCompleteLabelGroup {
#[derive(measured::LabelGroup)] #[derive(measured::LabelGroup)]
#[label(set = HttpRequestStatusLabelGroupSet)] #[label(set = HttpRequestStatusLabelGroupSet)]
pub(crate) struct HttpRequestStatusLabelGroup<'a> { pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)] #[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str, pub(crate) path: &'a str,
pub(crate) method: Method, pub(crate) method: Method,
pub(crate) status: StatusCode, pub(crate) status: StatusCode,
@@ -128,21 +162,40 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[derive(measured::LabelGroup)] #[derive(measured::LabelGroup)]
#[label(set = HttpRequestLatencyLabelGroupSet)] #[label(set = HttpRequestLatencyLabelGroupSet)]
pub(crate) struct HttpRequestLatencyLabelGroup<'a> { pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)] #[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str, pub(crate) path: &'a str,
pub(crate) method: Method, pub(crate) method: Method,
} }
impl Default for HttpRequestLatencyLabelGroupSet {
fn default() -> Self {
Self {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup, Clone)] #[derive(measured::LabelGroup, Clone)]
#[label(set = PageserverRequestLabelGroupSet)] #[label(set = PageserverRequestLabelGroupSet)]
pub(crate) struct PageserverRequestLabelGroup<'a> { pub(crate) struct PageserverRequestLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)] #[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) pageserver_id: &'a str, pub(crate) pageserver_id: &'a str,
#[label(dynamic_with = lasso::ThreadedRodeo, default)] #[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str, pub(crate) path: &'a str,
pub(crate) method: Method, pub(crate) method: Method,
} }
impl Default for PageserverRequestLabelGroupSet {
fn default() -> Self {
Self {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup)] #[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryErrorLabelGroupSet)] #[label(set = DatabaseQueryErrorLabelGroupSet)]
pub(crate) struct DatabaseQueryErrorLabelGroup { pub(crate) struct DatabaseQueryErrorLabelGroup {
@@ -156,7 +209,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup {
pub(crate) operation: DatabaseOperation, pub(crate) operation: DatabaseOperation,
} }
#[derive(FixedCardinalityLabel, Clone, Copy)] #[derive(FixedCardinalityLabel)]
pub(crate) enum ReconcileOutcome { pub(crate) enum ReconcileOutcome {
#[label(rename = "ok")] #[label(rename = "ok")]
Success, Success,
@@ -164,7 +217,7 @@ pub(crate) enum ReconcileOutcome {
Cancel, Cancel,
} }
#[derive(FixedCardinalityLabel, Copy, Clone)] #[derive(FixedCardinalityLabel, Clone)]
pub(crate) enum Method { pub(crate) enum Method {
Get, Get,
Put, Put,
@@ -189,12 +242,11 @@ impl From<hyper::Method> for Method {
} }
} }
#[derive(Clone, Copy)]
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
impl LabelValue for StatusCode { impl LabelValue for StatusCode {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output { fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0.as_u16() as i64) v.write_int(self.0.as_u16() as u64)
} }
} }
@@ -212,7 +264,7 @@ impl FixedCardinalityLabel for StatusCode {
} }
} }
#[derive(FixedCardinalityLabel, Clone, Copy)] #[derive(FixedCardinalityLabel)]
pub(crate) enum DatabaseErrorLabel { pub(crate) enum DatabaseErrorLabel {
Query, Query,
Connection, Connection,

View File

@@ -1,14 +1,13 @@
use std::{str::FromStr, time::Duration}; use std::{str::FromStr, time::Duration};
use hyper::StatusCode;
use pageserver_api::{ use pageserver_api::{
controller_api::{ controller_api::{
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
TenantLocateResponseShard,
}, },
shard::TenantShardId, shard::TenantShardId,
}; };
use pageserver_client::mgmt_api; use pageserver_client::mgmt_api;
use reqwest::StatusCode;
use serde::Serialize; use serde::Serialize;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use utils::{backoff, id::NodeId}; use utils::{backoff, id::NodeId};
@@ -257,19 +256,6 @@ impl Node {
) )
.await .await
} }
/// Generate the simplified API-friendly description of a node's state
pub(crate) fn describe(&self) -> NodeDescribeResponse {
NodeDescribeResponse {
id: self.id,
availability: self.availability.into(),
scheduling: self.scheduling,
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port,
}
}
} }
impl std::fmt::Display for Node { impl std::fmt::Display for Node {

View File

@@ -1,14 +1,13 @@
use pageserver_api::{ use pageserver_api::{
models::{ models::{
LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
}, },
shard::TenantShardId, shard::TenantShardId,
}; };
use pageserver_client::mgmt_api::{Client, Result}; use pageserver_client::mgmt_api::{Client, Result};
use reqwest::StatusCode; use reqwest::StatusCode;
use utils::id::{NodeId, TenantId, TimelineId}; use utils::id::{NodeId, TimelineId};
/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
/// controller to collect metrics in a non-intrusive manner. /// controller to collect metrics in a non-intrusive manner.
@@ -89,18 +88,6 @@ impl PageserverClient {
) )
} }
pub(crate) async fn tenant_scan_remote_storage(
&self,
tenant_id: TenantId,
) -> Result<TenantScanRemoteStorageResponse> {
measured_request!(
"tenant_scan_remote_storage",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.tenant_scan_remote_storage(tenant_id).await
)
}
pub(crate) async fn tenant_secondary_download( pub(crate) async fn tenant_secondary_download(
&self, &self,
tenant_id: TenantShardId, tenant_id: TenantShardId,
@@ -114,27 +101,6 @@ impl PageserverClient {
) )
} }
pub(crate) async fn tenant_secondary_status(
&self,
tenant_shard_id: TenantShardId,
) -> Result<SecondaryProgress> {
measured_request!(
"tenant_secondary_status",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.tenant_secondary_status(tenant_shard_id).await
)
}
pub(crate) async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> {
measured_request!(
"tenant_heatmap_upload",
crate::metrics::Method::Post,
&self.node_id_label,
self.inner.tenant_heatmap_upload(tenant_id).await
)
}
pub(crate) async fn location_config( pub(crate) async fn location_config(
&self, &self,
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
@@ -234,16 +200,4 @@ impl PageserverClient {
self.inner.get_utilization().await self.inner.get_utilization().await
) )
} }
pub(crate) async fn top_tenant_shards(
&self,
request: TopTenantShardsRequest,
) -> Result<TopTenantShardsResponse> {
measured_request!(
"top_tenants",
crate::metrics::Method::Post,
&self.node_id_label,
self.inner.top_tenant_shards(request).await
)
}
} }

View File

@@ -2,7 +2,6 @@ pub(crate) mod split_state;
use std::collections::HashMap; use std::collections::HashMap;
use std::str::FromStr; use std::str::FromStr;
use std::time::Duration; use std::time::Duration;
use std::time::Instant;
use self::split_state::SplitState; use self::split_state::SplitState;
use camino::Utf8Path; use camino::Utf8Path;
@@ -10,7 +9,6 @@ use camino::Utf8PathBuf;
use diesel::pg::PgConnection; use diesel::pg::PgConnection;
use diesel::prelude::*; use diesel::prelude::*;
use diesel::Connection; use diesel::Connection;
use pageserver_api::controller_api::ShardSchedulingPolicy;
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
use pageserver_api::models::TenantConfig; use pageserver_api::models::TenantConfig;
use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardConfigError;
@@ -80,7 +78,7 @@ pub(crate) enum DatabaseError {
Logical(String), Logical(String),
} }
#[derive(measured::FixedCardinalityLabel, Copy, Clone)] #[derive(measured::FixedCardinalityLabel, Clone)]
pub(crate) enum DatabaseOperation { pub(crate) enum DatabaseOperation {
InsertNode, InsertNode,
UpdateNode, UpdateNode,
@@ -109,12 +107,6 @@ pub(crate) enum AbortShardSplitStatus {
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>; pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
/// Some methods can operate on either a whole tenant or a single shard
pub(crate) enum TenantFilter {
Tenant(TenantId),
Shard(TenantShardId),
}
impl Persistence { impl Persistence {
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
@@ -145,31 +137,6 @@ impl Persistence {
} }
} }
/// A helper for use during startup, where we would like to tolerate concurrent restarts of the
/// database and the storage controller, therefore the database might not be available right away
pub async fn await_connection(
database_url: &str,
timeout: Duration,
) -> Result<(), diesel::ConnectionError> {
let started_at = Instant::now();
loop {
match PgConnection::establish(database_url) {
Ok(_) => {
tracing::info!("Connected to database.");
return Ok(());
}
Err(e) => {
if started_at.elapsed() > timeout {
return Err(e);
} else {
tracing::info!("Database not yet available, waiting... ({e})");
tokio::time::sleep(Duration::from_millis(100)).await;
}
}
}
}
}
/// Wraps `with_conn` in order to collect latency and error metrics /// Wraps `with_conn` in order to collect latency and error metrics
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R> async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
where where
@@ -179,7 +146,9 @@ impl Persistence {
let latency = &METRICS_REGISTRY let latency = &METRICS_REGISTRY
.metrics_group .metrics_group
.storage_controller_database_query_latency; .storage_controller_database_query_latency;
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
operation: op.clone(),
});
let res = self.with_conn(func).await; let res = self.with_conn(func).await;
@@ -202,45 +171,10 @@ impl Persistence {
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static, F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static, R: Send + 'static,
{ {
// A generous allowance for how many times we may retry serializable transactions
// before giving up. This is not expected to be hit: it is a defensive measure in case we
// somehow engineer a situation where duelling transactions might otherwise live-lock.
const MAX_RETRIES: usize = 128;
let mut conn = self.connection_pool.get()?; let mut conn = self.connection_pool.get()?;
tokio::task::spawn_blocking(move || -> DatabaseResult<R> { tokio::task::spawn_blocking(move || -> DatabaseResult<R> { func(&mut conn) })
let mut retry_count = 0; .await
loop { .expect("Task panic")
match conn.build_transaction().serializable().run(|c| func(c)) {
Ok(r) => break Ok(r),
Err(
err @ DatabaseError::Query(diesel::result::Error::DatabaseError(
diesel::result::DatabaseErrorKind::SerializationFailure,
_,
)),
) => {
retry_count += 1;
if retry_count > MAX_RETRIES {
tracing::error!(
"Exceeded max retries on SerializationFailure errors: {err:?}"
);
break Err(err);
} else {
// Retry on serialization errors: these are expected, because even though our
// transactions don't fight for the same rows, they will occasionally collide
// on index pages (e.g. increment_generation for unrelated shards can collide)
tracing::debug!(
"Retrying transaction on serialization failure {err:?}"
);
continue;
}
}
Err(e) => break Err(e),
}
}
})
.await
.expect("Task panic")
} }
/// When a node is first registered, persist it before using it for anything /// When a node is first registered, persist it before using it for anything
@@ -341,11 +275,6 @@ impl Persistence {
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
shard.placement_policy = "{\"Attached\":0}".to_string(); shard.placement_policy = "{\"Attached\":0}".to_string();
} }
if shard.scheduling_policy.is_empty() {
shard.scheduling_policy =
serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
}
} }
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect(); let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
@@ -393,11 +322,14 @@ impl Persistence {
self.with_measured_conn( self.with_measured_conn(
DatabaseOperation::InsertTenantShards, DatabaseOperation::InsertTenantShards,
move |conn| -> DatabaseResult<()> { move |conn| -> DatabaseResult<()> {
for tenant in &shards { conn.transaction(|conn| -> QueryResult<()> {
diesel::insert_into(tenant_shards) for tenant in &shards {
.values(tenant) diesel::insert_into(tenant_shards)
.execute(conn)?; .values(tenant)
} .execute(conn)?;
}
Ok(())
})?;
Ok(()) Ok(())
}, },
) )
@@ -533,48 +465,59 @@ impl Persistence {
/// that we only do the first time a tenant is set to an attached policy via /location_config. /// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard( pub(crate) async fn update_tenant_shard(
&self, &self,
tenant: TenantFilter, tenant_shard_id: TenantShardId,
input_placement_policy: Option<PlacementPolicy>, input_placement_policy: PlacementPolicy,
input_config: Option<TenantConfig>, input_config: TenantConfig,
input_generation: Option<Generation>, input_generation: Option<Generation>,
input_scheduling_policy: Option<ShardSchedulingPolicy>,
) -> DatabaseResult<()> { ) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*; use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
let query = match tenant { let query = diesel::update(tenant_shards)
TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.into_boxed(),
TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.into_boxed(),
};
#[derive(AsChangeset)] if let Some(input_generation) = input_generation {
#[diesel(table_name = crate::schema::tenant_shards)] // Update includes generation column
struct ShardUpdate { query
generation: Option<i32>, .set((
placement_policy: Option<String>, generation.eq(Some(input_generation.into().unwrap() as i32)),
config: Option<String>, placement_policy
scheduling_policy: Option<String>, .eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} }
let update = ShardUpdate { Ok(())
generation: input_generation.map(|g| g.into().unwrap() as i32), })
placement_policy: input_placement_policy .await?;
.as_ref()
.map(|p| serde_json::to_string(&p).unwrap()),
config: input_config
.as_ref()
.map(|c| serde_json::to_string(&c).unwrap()),
scheduling_policy: input_scheduling_policy
.map(|p| serde_json::to_string(&p).unwrap()),
};
query.set(update).execute(conn)?; Ok(())
}
pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;
Ok(()) Ok(())
}) })
@@ -616,51 +559,55 @@ impl Persistence {
) -> DatabaseResult<()> { ) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*; use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
// Mark parent shards as splitting conn.transaction(|conn| -> DatabaseResult<()> {
// Mark parent shards as splitting
let updated = diesel::update(tenant_shards) let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string())) .filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32)) .filter(shard_count.eq(old_shard_count.literal() as i32))
.set((splitting.eq(1),)) .set((splitting.eq(1),))
.execute(conn)?; .execute(conn)?;
if u8::try_from(updated) if u8::try_from(updated)
.map_err(|_| DatabaseError::Logical( .map_err(|_| DatabaseError::Logical(
format!("Overflow existing shard count {} while splitting", updated)) format!("Overflow existing shard count {} while splitting", updated))
)? != old_shard_count.count() { )? != old_shard_count.count() {
// Perhaps a deletion or another split raced with this attempt to split, mutating // Perhaps a deletion or another split raced with this attempt to split, mutating
// the parent shards that we intend to split. In this case the split request should fail. // the parent shards that we intend to split. In this case the split request should fail.
return Err(DatabaseError::Logical( return Err(DatabaseError::Logical(
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count()) format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
)); ));
}
// FIXME: spurious clone to sidestep closure move rules
let parent_to_children = parent_to_children.clone();
// Insert child shards
for (parent_shard_id, children) in parent_to_children {
let mut parent = crate::schema::tenant_shards::table
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
.load::<TenantShardPersistence>(conn)?;
let parent = if parent.len() != 1 {
return Err(DatabaseError::Logical(format!(
"Parent shard {parent_shard_id} not found"
)));
} else {
parent.pop().unwrap()
};
for mut shard in children {
// Carry the parent's generation into the child
shard.generation = parent.generation;
debug_assert!(shard.splitting == SplitState::Splitting);
diesel::insert_into(tenant_shards)
.values(shard)
.execute(conn)?;
} }
}
// FIXME: spurious clone to sidestep closure move rules
let parent_to_children = parent_to_children.clone();
// Insert child shards
for (parent_shard_id, children) in parent_to_children {
let mut parent = crate::schema::tenant_shards::table
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
.load::<TenantShardPersistence>(conn)?;
let parent = if parent.len() != 1 {
return Err(DatabaseError::Logical(format!(
"Parent shard {parent_shard_id} not found"
)));
} else {
parent.pop().unwrap()
};
for mut shard in children {
// Carry the parent's generation into the child
shard.generation = parent.generation;
debug_assert!(shard.splitting == SplitState::Splitting);
diesel::insert_into(tenant_shards)
.values(shard)
.execute(conn)?;
}
}
Ok(())
})?;
Ok(()) Ok(())
}) })
@@ -678,18 +625,22 @@ impl Persistence {
self.with_measured_conn( self.with_measured_conn(
DatabaseOperation::CompleteShardSplit, DatabaseOperation::CompleteShardSplit,
move |conn| -> DatabaseResult<()> { move |conn| -> DatabaseResult<()> {
// Drop parent shards conn.transaction(|conn| -> QueryResult<()> {
diesel::delete(tenant_shards) // Drop parent shards
.filter(tenant_id.eq(split_tenant_id.to_string())) diesel::delete(tenant_shards)
.filter(shard_count.eq(old_shard_count.literal() as i32)) .filter(tenant_id.eq(split_tenant_id.to_string()))
.execute(conn)?; .filter(shard_count.eq(old_shard_count.literal() as i32))
.execute(conn)?;
// Clear sharding flag // Clear sharding flag
let updated = diesel::update(tenant_shards) let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string())) .filter(tenant_id.eq(split_tenant_id.to_string()))
.set((splitting.eq(0),)) .set((splitting.eq(0),))
.execute(conn)?; .execute(conn)?;
debug_assert!(updated > 0); debug_assert!(updated > 0);
Ok(())
})?;
Ok(()) Ok(())
}, },
@@ -708,41 +659,46 @@ impl Persistence {
self.with_measured_conn( self.with_measured_conn(
DatabaseOperation::AbortShardSplit, DatabaseOperation::AbortShardSplit,
move |conn| -> DatabaseResult<AbortShardSplitStatus> { move |conn| -> DatabaseResult<AbortShardSplitStatus> {
// Clear the splitting state on parent shards let aborted =
let updated = diesel::update(tenant_shards) conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
.filter(tenant_id.eq(split_tenant_id.to_string())) // Clear the splitting state on parent shards
.filter(shard_count.ne(new_shard_count.literal() as i32)) let updated = diesel::update(tenant_shards)
.set((splitting.eq(0),)) .filter(tenant_id.eq(split_tenant_id.to_string()))
.execute(conn)?; .filter(shard_count.ne(new_shard_count.literal() as i32))
.set((splitting.eq(0),))
.execute(conn)?;
// Parent shards are already gone: we cannot abort. // Parent shards are already gone: we cannot abort.
if updated == 0 { if updated == 0 {
return Ok(AbortShardSplitStatus::Complete); return Ok(AbortShardSplitStatus::Complete);
} }
// Sanity check: if parent shards were present, their cardinality should // Sanity check: if parent shards were present, their cardinality should
// be less than the number of child shards. // be less than the number of child shards.
if updated >= new_shard_count.count() as usize { if updated >= new_shard_count.count() as usize {
return Err(DatabaseError::Logical(format!( return Err(DatabaseError::Logical(format!(
"Unexpected parent shard count {updated} while aborting split to \ "Unexpected parent shard count {updated} while aborting split to \
count {new_shard_count:?} on tenant {split_tenant_id}" count {new_shard_count:?} on tenant {split_tenant_id}"
))); )));
} }
// Erase child shards // Erase child shards
diesel::delete(tenant_shards) diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string())) .filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(new_shard_count.literal() as i32)) .filter(shard_count.eq(new_shard_count.literal() as i32))
.execute(conn)?; .execute(conn)?;
Ok(AbortShardSplitStatus::Aborted) Ok(AbortShardSplitStatus::Aborted)
})?;
Ok(aborted)
}, },
) )
.await .await
} }
} }
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably /// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] #[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[diesel(table_name = crate::schema::tenant_shards)] #[diesel(table_name = crate::schema::tenant_shards)]
pub(crate) struct TenantShardPersistence { pub(crate) struct TenantShardPersistence {
@@ -772,8 +728,6 @@ pub(crate) struct TenantShardPersistence {
pub(crate) splitting: SplitState, pub(crate) splitting: SplitState,
#[serde(default)] #[serde(default)]
pub(crate) config: String, pub(crate) config: String,
#[serde(default)]
pub(crate) scheduling_policy: String,
} }
impl TenantShardPersistence { impl TenantShardPersistence {

View File

@@ -1,12 +1,12 @@
use crate::pageserver_client::PageserverClient; use crate::pageserver_client::PageserverClient;
use crate::persistence::Persistence; use crate::persistence::Persistence;
use crate::service; use crate::service;
use hyper::StatusCode;
use pageserver_api::models::{ use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
}; };
use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_client::mgmt_api; use pageserver_client::mgmt_api;
use reqwest::StatusCode;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
@@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard;
use crate::compute_hook::{ComputeHook, NotifyError}; use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node; use crate::node::Node;
use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation}; use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
const DEFAULT_HEATMAP_PERIOD: &str = "60s"; const DEFAULT_HEATMAP_PERIOD: &str = "60s";
/// Object with the lifetime of the background reconcile task that is created /// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states. /// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler { pub(super) struct Reconciler {
/// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
/// of a tenant's state from when we spawned a reconcile task. /// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId, pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity, pub(crate) shard: ShardIdentity,
@@ -48,15 +48,11 @@ pub(super) struct Reconciler {
/// To avoid stalling if the cloud control plane is unavailable, we may proceed /// To avoid stalling if the cloud control plane is unavailable, we may proceed
/// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
/// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
pub(crate) compute_notify_failure: bool, pub(crate) compute_notify_failure: bool,
/// Reconciler is responsible for keeping alive semaphore units that limit concurrency on how many
/// we will spawn.
pub(crate) _resource_units: ReconcileUnits,
/// A means to abort background reconciliation: it is essential to /// A means to abort background reconciliation: it is essential to
/// call this when something changes in the original TenantShard that /// call this when something changes in the original TenantState that
/// will make this reconciliation impossible or unnecessary, for /// will make this reconciliation impossible or unnecessary, for
/// example when a pageserver node goes offline, or the PlacementPolicy for /// example when a pageserver node goes offline, or the PlacementPolicy for
/// the tenant is changed. /// the tenant is changed.
@@ -70,20 +66,7 @@ pub(super) struct Reconciler {
pub(crate) persistence: Arc<Persistence>, pub(crate) persistence: Arc<Persistence>,
} }
/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O /// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
pub(crate) struct ReconcileUnits {
_sem_units: tokio::sync::OwnedSemaphorePermit,
}
impl ReconcileUnits {
pub(crate) fn new(sem_units: tokio::sync::OwnedSemaphorePermit) -> Self {
Self {
_sem_units: sem_units,
}
}
}
/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
/// reference counting for Scheduler. The IntentState is what the scheduler works with, /// reference counting for Scheduler. The IntentState is what the scheduler works with,
/// and the TargetState is just the instruction for a particular Reconciler run. /// and the TargetState is just the instruction for a particular Reconciler run.
#[derive(Debug)] #[derive(Debug)]
@@ -504,7 +487,6 @@ impl Reconciler {
while let Err(e) = self.compute_notify().await { while let Err(e) = self.compute_notify().await {
match e { match e {
NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
_ => { _ => {
tracing::warn!( tracing::warn!(
"Live migration blocked by compute notification error, retrying: {e}" "Live migration blocked by compute notification error, retrying: {e}"
@@ -767,10 +749,7 @@ impl Reconciler {
// It is up to the caller whether they want to drop out on this error, but they don't have to: // It is up to the caller whether they want to drop out on this error, but they don't have to:
// in general we should avoid letting unavailability of the cloud control plane stop us from // in general we should avoid letting unavailability of the cloud control plane stop us from
// making progress. // making progress.
if !matches!(e, NotifyError::ShuttingDown) { tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
}
// Set this flag so that in our ReconcileResult we will set the flag on the shard that it // Set this flag so that in our ReconcileResult we will set the flag on the shard that it
// needs to retry at some point. // needs to retry at some point.
self.compute_notify_failure = true; self.compute_notify_failure = true;

View File

@@ -1,4 +1,4 @@
use crate::{node::Node, tenant_shard::TenantShard}; use crate::{node::Node, tenant_state::TenantState};
use pageserver_api::controller_api::UtilizationScore; use pageserver_api::controller_api::UtilizationScore;
use serde::Serialize; use serde::Serialize;
use std::collections::HashMap; use std::collections::HashMap;
@@ -27,7 +27,7 @@ pub enum MaySchedule {
#[derive(Serialize)] #[derive(Serialize)]
struct SchedulerNode { struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`]. /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
shard_count: usize, shard_count: usize,
/// Whether this node is currently elegible to have new shards scheduled (this is derived /// Whether this node is currently elegible to have new shards scheduled (this is derived
@@ -58,86 +58,6 @@ pub(crate) struct Scheduler {
nodes: HashMap<NodeId, SchedulerNode>, nodes: HashMap<NodeId, SchedulerNode>,
} }
/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
///
/// For example, we may set an affinity score based on the number of shards from the same
/// tenant already on a node, to implicitly prefer to balance out shards.
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
pub(crate) struct AffinityScore(pub(crate) usize);
impl AffinityScore {
/// If we have no anti-affinity at all toward a node, this is its score. It means
/// the scheduler has a free choice amongst nodes with this score, and may pick a node
/// based on other information such as total utilization.
pub(crate) const FREE: Self = Self(0);
pub(crate) fn inc(&mut self) {
self.0 += 1;
}
}
impl std::ops::Add for AffinityScore {
type Output = Self;
fn add(self, rhs: Self) -> Self::Output {
Self(self.0 + rhs.0)
}
}
/// Hint for whether this is a sincere attempt to schedule, or a speculative
/// check for where we _would_ schedule (done during optimization)
#[derive(Debug)]
pub(crate) enum ScheduleMode {
Normal,
Speculative,
}
impl Default for ScheduleMode {
fn default() -> Self {
Self::Normal
}
}
// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
// it for many shards in the same tenant.
#[derive(Debug, Default)]
pub(crate) struct ScheduleContext {
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
/// Specifically how many _attached_ locations are on each node
pub(crate) attached_nodes: HashMap<NodeId, usize>,
pub(crate) mode: ScheduleMode,
}
impl ScheduleContext {
/// Input is a list of nodes we would like to avoid using again within this context. The more
/// times a node is passed into this call, the less inclined we are to use it.
pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
for node_id in nodes {
let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
entry.inc()
}
}
pub(crate) fn push_attached(&mut self, node_id: NodeId) {
let entry = self.attached_nodes.entry(node_id).or_default();
*entry += 1;
}
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
self.nodes
.get(&node_id)
.copied()
.unwrap_or(AffinityScore::FREE)
}
pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
self.attached_nodes.get(&node_id).copied().unwrap_or(0)
}
}
impl Scheduler { impl Scheduler {
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self { pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
let mut scheduler_nodes = HashMap::new(); let mut scheduler_nodes = HashMap::new();
@@ -163,7 +83,7 @@ impl Scheduler {
pub(crate) fn consistency_check<'a>( pub(crate) fn consistency_check<'a>(
&self, &self,
nodes: impl Iterator<Item = &'a Node>, nodes: impl Iterator<Item = &'a Node>,
shards: impl Iterator<Item = &'a TenantShard>, shards: impl Iterator<Item = &'a TenantState>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new(); let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
for node in nodes { for node in nodes {
@@ -304,87 +224,53 @@ impl Scheduler {
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
} }
/// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
/// are already in use by this shard -- we use this to avoid picking the same node
/// as both attached and secondary location. This is a hard constraint: if we cannot
/// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
///
/// context: we prefer to avoid using nodes identified in the context, according
/// to their anti-affinity score. We use this to prefeer to avoid placing shards in
/// the same tenant on the same node. This is a soft constraint: the context will never
/// cause us to fail to schedule a shard.
pub(crate) fn schedule_shard(
&self,
hard_exclude: &[NodeId],
context: &ScheduleContext,
) -> Result<NodeId, ScheduleError> {
if self.nodes.is_empty() { if self.nodes.is_empty() {
return Err(ScheduleError::NoPageservers); return Err(ScheduleError::NoPageservers);
} }
let mut scores: Vec<(NodeId, AffinityScore, usize)> = self let mut tenant_counts: Vec<(NodeId, usize)> = self
.nodes .nodes
.iter() .iter()
.filter_map(|(k, v)| { .filter_map(|(k, v)| {
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No { if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
None None
} else { } else {
Some(( Some((*k, v.shard_count))
*k,
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
v.shard_count,
))
} }
}) })
.collect(); .collect();
// Sort by, in order of precedence: // Sort by tenant count. Nodes with the same tenant count are sorted by ID.
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available tenant_counts.sort_by_key(|i| (i.1, i.0));
// 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
// 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
scores.sort_by_key(|i| (i.1, i.2, i.0));
if scores.is_empty() { if tenant_counts.is_empty() {
// After applying constraints, no pageservers were left. // After applying constraints, no pageservers were left. We log some detail about
if !matches!(context.mode, ScheduleMode::Speculative) { // the state of nodes to help understand why this happened. This is not logged as an error because
// If this was not a speculative attempt, log details to understand why we couldn't // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
// schedule: this may help an engineer understand if some nodes are marked offline tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
// in a way that's preventing progress. for (node_id, node) in &self.nodes {
tracing::info!( tracing::info!(
"Scheduling failure, while excluding {hard_exclude:?}, node states:" "Node {node_id}: may_schedule={} shards={}",
node.may_schedule != MaySchedule::No,
node.shard_count
); );
for (node_id, node) in &self.nodes {
tracing::info!(
"Node {node_id}: may_schedule={} shards={}",
node.may_schedule != MaySchedule::No,
node.shard_count
);
}
} }
return Err(ScheduleError::ImpossibleConstraint); return Err(ScheduleError::ImpossibleConstraint);
} }
// Lowest score wins let node_id = tenant_counts.first().unwrap().0;
let node_id = scores.first().unwrap().0; tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
if !matches!(context.mode, ScheduleMode::Speculative) { tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
); );
}
// Note that we do not update shard count here to reflect the scheduling: that // Note that we do not update shard count here to reflect the scheduling: that
// is IntentState's job when the scheduled location is used. // is IntentState's job when the scheduled location is used.
Ok(node_id) Ok(node_id)
} }
/// Unit test access to internal state
#[cfg(test)]
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
self.nodes.get(&node_id).unwrap().shard_count
}
} }
#[cfg(test)] #[cfg(test)]
@@ -421,7 +307,7 @@ pub(crate) mod test_utils {
mod tests { mod tests {
use super::*; use super::*;
use crate::tenant_shard::IntentState; use crate::tenant_state::IntentState;
#[test] #[test]
fn scheduler_basic() -> anyhow::Result<()> { fn scheduler_basic() -> anyhow::Result<()> {
let nodes = test_utils::make_test_nodes(2); let nodes = test_utils::make_test_nodes(2);
@@ -430,17 +316,15 @@ mod tests {
let mut t1_intent = IntentState::new(); let mut t1_intent = IntentState::new();
let mut t2_intent = IntentState::new(); let mut t2_intent = IntentState::new();
let context = ScheduleContext::default(); let scheduled = scheduler.schedule_shard(&[])?;
let scheduled = scheduler.schedule_shard(&[], &context)?;
t1_intent.set_attached(&mut scheduler, Some(scheduled)); t1_intent.set_attached(&mut scheduler, Some(scheduled));
let scheduled = scheduler.schedule_shard(&[], &context)?; let scheduled = scheduler.schedule_shard(&[])?;
t2_intent.set_attached(&mut scheduler, Some(scheduled)); t2_intent.set_attached(&mut scheduler, Some(scheduled));
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?; let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
t1_intent.push_secondary(&mut scheduler, scheduled); t1_intent.push_secondary(&mut scheduler, scheduled);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);

View File

@@ -22,7 +22,6 @@ diesel::table! {
placement_policy -> Varchar, placement_policy -> Varchar,
splitting -> Int2, splitting -> Int2,
config -> Text, config -> Text,
scheduling_policy -> Varchar,
} }
} }

View File

@@ -86,10 +86,7 @@ where
.stdout(process_log_file) .stdout(process_log_file)
.stderr(same_file_for_stderr) .stderr(same_file_for_stderr)
.args(args); .args(args);
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
fill_rust_env_vars(background_command),
));
filled_cmd.envs(envs); filled_cmd.envs(envs);
let pid_file_to_check = match &initial_pid_file { let pid_file_to_check = match &initial_pid_file {
@@ -271,15 +268,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
cmd cmd
} }
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
for (var, val) in std::env::vars() {
if var.starts_with("NEON_PAGESERVER_") {
cmd = cmd.env(var, val);
}
}
cmd
}
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
/// 1. Claims a pidfile with a fcntl lock on it and /// 1. Claims a pidfile with a fcntl lock on it and
/// 2. Sets up the pidfile's file descriptor so that it (and the lock) /// 2. Sets up the pidfile's file descriptor so that it (and the lock)

View File

@@ -9,23 +9,22 @@ use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
use compute_api::spec::ComputeMode; use compute_api::spec::ComputeMode;
use control_plane::endpoint::ComputeControlPlane; use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{ use control_plane::local_env::{InitForceMode, LocalEnv};
InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf, use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
SafekeeperConf,
};
use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode; use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::StorageController; use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env}; use control_plane::{broker, local_env};
use pageserver_api::config::{ use pageserver_api::controller_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
}; };
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::models::{ use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
}; };
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use pageserver_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
};
use postgres_backend::AuthType; use postgres_backend::AuthType;
use postgres_connection::parse_host_port; use postgres_connection::parse_host_port;
use safekeeper_api::{ use safekeeper_api::{
@@ -55,6 +54,44 @@ const DEFAULT_PG_VERSION: &str = "15";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
fn default_conf(num_pageservers: u16) -> String {
let mut template = format!(
r#"
# Default built-in configuration, defined in main.rs
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
[broker]
listen_addr = '{DEFAULT_BROKER_ADDR}'
[[safekeepers]]
id = {DEFAULT_SAFEKEEPER_ID}
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
"#,
);
for i in 0..num_pageservers {
let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
template += &format!(
r#"
[[pageservers]]
id = {pageserver_id}
listen_pg_addr = '127.0.0.1:{pg_port}'
listen_http_addr = '127.0.0.1:{http_port}'
pg_auth_type = '{trust_auth}'
http_auth_type = '{trust_auth}'
"#,
trust_auth = AuthType::Trust,
)
}
template
}
/// ///
/// Timelines tree element used as a value in the HashMap. /// Timelines tree element used as a value in the HashMap.
/// ///
@@ -98,7 +135,7 @@ fn main() -> Result<()> {
let subcommand_result = match sub_name { let subcommand_result = match sub_name {
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(&env)), "start" => rt.block_on(handle_start_all(sub_args, &env)),
"stop" => rt.block_on(handle_stop_all(sub_args, &env)), "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)), "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
@@ -117,7 +154,7 @@ fn main() -> Result<()> {
}; };
match subcommand_result { match subcommand_result {
Ok(Some(updated_env)) => updated_env.persist_config()?, Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?,
Ok(None) => (), Ok(None) => (),
Err(e) => { Err(e) => {
eprintln!("command failed: {e:?}"); eprintln!("command failed: {e:?}");
@@ -306,65 +343,48 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
} }
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> { fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let num_pageservers = init_match.get_one::<u16>("num-pageservers"); let num_pageservers = init_match
.get_one::<u16>("num-pageservers")
let force = init_match.get_one("force").expect("we set a default value"); .expect("num-pageservers arg has a default");
// Create config file
// Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`. let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
let init_conf: NeonLocalInitConf = if let Some(config_path) =
init_match.get_one::<PathBuf>("config")
{
// User (likely the Python test suite) provided a description of the environment.
if num_pageservers.is_some() {
bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead");
}
// load and parse the file // load and parse the file
let contents = std::fs::read_to_string(config_path).with_context(|| { std::fs::read_to_string(config_path).with_context(|| {
format!( format!(
"Could not read configuration file '{}'", "Could not read configuration file '{}'",
config_path.display() config_path.display()
) )
})?; })?
toml_edit::de::from_str(&contents)?
} else { } else {
// User (likely interactive) did not provide a description of the environment, give them the default // Built-in default config
NeonLocalInitConf { default_conf(*num_pageservers)
control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())),
broker: NeonBroker {
listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(),
},
safekeepers: vec![SafekeeperConf {
id: DEFAULT_SAFEKEEPER_ID,
pg_port: DEFAULT_SAFEKEEPER_PG_PORT,
http_port: DEFAULT_SAFEKEEPER_HTTP_PORT,
..Default::default()
}],
pageservers: (0..num_pageservers.copied().unwrap_or(1))
.map(|i| {
let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
NeonLocalInitPageserverConf {
id: pageserver_id,
listen_pg_addr: format!("127.0.0.1:{pg_port}"),
listen_http_addr: format!("127.0.0.1:{http_port}"),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
other: Default::default(),
}
})
.collect(),
pg_distrib_dir: None,
neon_distrib_dir: None,
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
storage_controller: None,
control_plane_compute_hook_api: None,
}
}; };
LocalEnv::init(init_conf, force) let pg_version = init_match
.context("materialize initial neon_local environment on disk")?; .get_one::<u32>("pg-version")
Ok(LocalEnv::load_config().expect("freshly written config should be loadable")) .copied()
.context("Failed to parse postgres version from the argument string")?;
let mut env =
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
let force = init_match.get_one("force").expect("we set a default value");
env.init(pg_version, force)
.context("Failed to initialize neon repository")?;
// Create remote storage location for default LocalFs remote storage
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
// Initialize pageserver, create initial tenant and timeline.
for ps_conf in &env.pageservers {
PageServerNode::from_env(&env, ps_conf)
.initialize(&pageserver_config_overrides(init_match))
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
exit(1);
});
}
Ok(env)
} }
/// The default pageserver is the one where CLI tenant/timeline operations are sent by default. /// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
@@ -379,6 +399,15 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
PageServerNode::from_env(env, ps_conf) PageServerNode::from_env(env, ps_conf)
} }
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
init_match
.get_many::<String>("pageserver-config-override")
.into_iter()
.flatten()
.map(String::as_str)
.collect()
}
async fn handle_tenant( async fn handle_tenant(
tenant_match: &ArgMatches, tenant_match: &ArgMatches,
env: &mut local_env::LocalEnv, env: &mut local_env::LocalEnv,
@@ -390,54 +419,6 @@ async fn handle_tenant(
println!("{} {:?}", t.id, t.state); println!("{} {:?}", t.id, t.state);
} }
} }
Some(("import", import_match)) => {
let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
let storage_controller = StorageController::from_env(env);
let create_response = storage_controller.tenant_import(tenant_id).await?;
let shard_zero = create_response
.shards
.first()
.expect("Import response omitted shards");
let attached_pageserver_id = shard_zero.node_id;
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
println!(
"Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
);
let timelines = pageserver
.http_client
.list_timelines(shard_zero.shard_id)
.await?;
// Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
let main_timeline = timelines
.iter()
.find(|t| t.ancestor_timeline_id.is_none())
.expect("No timelines found")
.timeline_id;
let mut branch_i = 0;
for timeline in timelines.iter() {
let branch_name = if timeline.timeline_id == main_timeline {
"main".to_string()
} else {
branch_i += 1;
format!("branch_{branch_i}")
};
println!(
"Importing timeline {tenant_id}/{} as branch {branch_name}",
timeline.timeline_id
);
env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
}
}
Some(("create", create_match)) => { Some(("create", create_match)) => {
let tenant_conf: HashMap<_, _> = create_match let tenant_conf: HashMap<_, _> = create_match
.get_many::<String>("config") .get_many::<String>("config")
@@ -810,8 +791,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.copied() .copied()
.unwrap_or(false); .unwrap_or(false);
let allow_multiple = sub_args.get_flag("allow-multiple");
let mode = match (lsn, hot_standby) { let mode = match (lsn, hot_standby) {
(Some(lsn), false) => ComputeMode::Static(lsn), (Some(lsn), false) => ComputeMode::Static(lsn),
(None, true) => ComputeMode::Replica, (None, true) => ComputeMode::Replica,
@@ -829,9 +808,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
_ => {} _ => {}
} }
if !allow_multiple { cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
}
cplane.new_endpoint( cplane.new_endpoint(
&endpoint_id, &endpoint_id,
@@ -860,8 +837,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config"); let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
let allow_multiple = sub_args.get_flag("allow-multiple");
// If --safekeepers argument is given, use only the listed safekeeper nodes. // If --safekeepers argument is given, use only the listed safekeeper nodes.
let safekeepers = let safekeepers =
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") { if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -887,13 +862,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.cloned() .cloned()
.unwrap_or_default(); .unwrap_or_default();
if !allow_multiple { cplane.check_conflicting_endpoints(
cplane.check_conflicting_endpoints( endpoint.mode,
endpoint.mode, endpoint.tenant_id,
endpoint.tenant_id, endpoint.timeline_id,
endpoint.timeline_id, )?;
)?;
}
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id { let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
let conf = env.get_pageserver_conf(pageserver_id).unwrap(); let conf = env.get_pageserver_conf(pageserver_id).unwrap();
@@ -1049,7 +1022,10 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() { match sub_match.subcommand() {
Some(("start", subcommand_args)) => { Some(("start", subcommand_args)) => {
if let Err(e) = get_pageserver(env, subcommand_args)?.start().await { if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}"); eprintln!("pageserver start failed: {e}");
exit(1); exit(1);
} }
@@ -1075,12 +1051,30 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
exit(1); exit(1);
} }
if let Err(e) = pageserver.start().await { if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}"); eprintln!("pageserver start failed: {e}");
exit(1); exit(1);
} }
} }
Some(("set-state", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
let scheduling = subcommand_args.get_one("scheduling");
let availability = subcommand_args.get_one("availability");
let storage_controller = StorageController::from_env(env);
storage_controller
.node_configure(NodeConfigureRequest {
node_id: pageserver.conf.id,
scheduling: scheduling.cloned(),
availability: availability.cloned(),
})
.await?;
}
Some(("status", subcommand_args)) => { Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status().await { match get_pageserver(env, subcommand_args)?.check_status().await {
Ok(_) => println!("Page server is up and running"), Ok(_) => println!("Page server is up and running"),
@@ -1202,7 +1196,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(()) Ok(())
} }
async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> { async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
// Endpoints are not started automatically // Endpoints are not started automatically
broker::start_broker_process(env).await?; broker::start_broker_process(env).await?;
@@ -1219,7 +1213,10 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
for ps_conf in &env.pageservers { for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf); let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.start().await { if let Err(e) = pageserver
.start(&pageserver_config_overrides(sub_match))
.await
{
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true).await; try_stop_all(env, true).await;
exit(1); exit(1);
@@ -1251,7 +1248,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
match ComputeControlPlane::load(env.clone()) { match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => { Ok(cplane) => {
for (_k, node) in cplane.endpoints { for (_k, node) in cplane.endpoints {
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
eprintln!("postgres stop failed: {e:#}"); eprintln!("postgres stop failed: {e:#}");
} }
} }
@@ -1360,6 +1357,13 @@ fn cli() -> Command {
.required(false) .required(false)
.value_name("stop-mode"); .value_name("stop-mode");
let pageserver_config_args = Arg::new("pageserver-config-override")
.long("pageserver-config-override")
.num_args(1)
.action(ArgAction::Append)
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let remote_ext_config_args = Arg::new("remote-ext-config") let remote_ext_config_args = Arg::new("remote-ext-config")
.long("remote-ext-config") .long("remote-ext-config")
.num_args(1) .num_args(1)
@@ -1393,7 +1397,9 @@ fn cli() -> Command {
let num_pageservers_arg = Arg::new("num-pageservers") let num_pageservers_arg = Arg::new("num-pageservers")
.value_parser(value_parser!(u16)) .value_parser(value_parser!(u16))
.long("num-pageservers") .long("num-pageservers")
.help("How many pageservers to create (default 1)"); .help("How many pageservers to create (default 1)")
.required(false)
.default_value("1");
let update_catalog = Arg::new("update-catalog") let update_catalog = Arg::new("update-catalog")
.value_parser(value_parser!(bool)) .value_parser(value_parser!(bool))
@@ -1407,25 +1413,20 @@ fn cli() -> Command {
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`") .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
.required(false); .required(false);
let allow_multiple = Arg::new("allow-multiple")
.help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.")
.long("allow-multiple")
.action(ArgAction::SetTrue)
.required(false);
Command::new("Neon CLI") Command::new("Neon CLI")
.arg_required_else_help(true) .arg_required_else_help(true)
.version(GIT_VERSION) .version(GIT_VERSION)
.subcommand( .subcommand(
Command::new("init") Command::new("init")
.about("Initialize a new Neon repository, preparing configs for services to start with") .about("Initialize a new Neon repository, preparing configs for services to start with")
.arg(pageserver_config_args.clone())
.arg(num_pageservers_arg.clone()) .arg(num_pageservers_arg.clone())
.arg( .arg(
Arg::new("config") Arg::new("config")
.long("config") .long("config")
.required(false) .required(false)
.value_parser(value_parser!(PathBuf)) .value_parser(value_parser!(PathBuf))
.value_name("config") .value_name("config"),
) )
.arg(pg_version_arg.clone()) .arg(pg_version_arg.clone())
.arg(force_arg) .arg(force_arg)
@@ -1433,7 +1434,6 @@ fn cli() -> Command {
.subcommand( .subcommand(
Command::new("timeline") Command::new("timeline")
.about("Manage timelines") .about("Manage timelines")
.arg_required_else_help(true)
.subcommand(Command::new("list") .subcommand(Command::new("list")
.about("List all timelines, available to this pageserver") .about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone())) .arg(tenant_id_arg.clone()))
@@ -1496,8 +1496,6 @@ fn cli() -> Command {
.subcommand(Command::new("config") .subcommand(Command::new("config")
.arg(tenant_id_arg.clone()) .arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
.subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
.about("Import a tenant that is present in remote storage, and create branches for its timelines"))
) )
.subcommand( .subcommand(
Command::new("pageserver") Command::new("pageserver")
@@ -1507,6 +1505,7 @@ fn cli() -> Command {
.subcommand(Command::new("status")) .subcommand(Command::new("status"))
.subcommand(Command::new("start") .subcommand(Command::new("start")
.about("Start local pageserver") .about("Start local pageserver")
.arg(pageserver_config_args.clone())
) )
.subcommand(Command::new("stop") .subcommand(Command::new("stop")
.about("Stop local pageserver") .about("Stop local pageserver")
@@ -1514,14 +1513,21 @@ fn cli() -> Command {
) )
.subcommand(Command::new("restart") .subcommand(Command::new("restart")
.about("Restart local pageserver") .about("Restart local pageserver")
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("set-state")
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
.about("Set scheduling or availability state of pageserver node")
.arg(pageserver_config_args.clone())
) )
) )
.subcommand( .subcommand(
Command::new("storage_controller") Command::new("storage_controller")
.arg_required_else_help(true) .arg_required_else_help(true)
.about("Manage storage_controller") .about("Manage storage_controller")
.subcommand(Command::new("start").about("Start storage controller")) .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
.subcommand(Command::new("stop").about("Stop storage controller") .subcommand(Command::new("stop").about("Stop local pageserver")
.arg(stop_mode_arg.clone())) .arg(stop_mode_arg.clone()))
) )
.subcommand( .subcommand(
@@ -1567,7 +1573,6 @@ fn cli() -> Command {
.arg(pg_version_arg.clone()) .arg(pg_version_arg.clone())
.arg(hot_standby_arg.clone()) .arg(hot_standby_arg.clone())
.arg(update_catalog) .arg(update_catalog)
.arg(allow_multiple.clone())
) )
.subcommand(Command::new("start") .subcommand(Command::new("start")
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1576,7 +1581,6 @@ fn cli() -> Command {
.arg(safekeepers_arg) .arg(safekeepers_arg)
.arg(remote_ext_config_args) .arg(remote_ext_config_args)
.arg(create_test_user) .arg(create_test_user)
.arg(allow_multiple.clone())
) )
.subcommand(Command::new("reconfigure") .subcommand(Command::new("reconfigure")
.about("Reconfigure the endpoint") .about("Reconfigure the endpoint")
@@ -1628,6 +1632,7 @@ fn cli() -> Command {
.subcommand( .subcommand(
Command::new("start") Command::new("start")
.about("Start page server and safekeepers") .about("Start page server and safekeepers")
.arg(pageserver_config_args)
) )
.subcommand( .subcommand(
Command::new("stop") Command::new("stop")

View File

@@ -554,7 +554,6 @@ impl Endpoint {
format_version: 1.0, format_version: 1.0,
operation_uuid: None, operation_uuid: None,
features: self.features.clone(), features: self.features.clone(),
swap_size_bytes: None,
cluster: Cluster { cluster: Cluster {
cluster_id: None, // project ID: not used cluster_id: None, // project ID: not used
name: None, // project name: not used name: None, // project name: not used

View File

@@ -3,7 +3,7 @@
//! Now it also provides init method which acts like a stub for proper installation //! Now it also provides init method which acts like a stub for proper installation
//! script which will use local paths. //! script which will use local paths.
use anyhow::{bail, Context}; use anyhow::{bail, ensure, Context};
use clap::ValueEnum; use clap::ValueEnum;
use postgres_backend::AuthType; use postgres_backend::AuthType;
@@ -17,14 +17,11 @@ use std::net::Ipv4Addr;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::process::{Command, Stdio}; use std::process::{Command, Stdio};
use std::time::Duration;
use utils::{ use utils::{
auth::{encode_from_key_file, Claims}, auth::{encode_from_key_file, Claims},
id::{NodeId, TenantId, TenantTimelineId, TimelineId}, id::{NodeId, TenantId, TenantTimelineId, TimelineId},
}; };
use crate::pageserver::PageServerNode;
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
use crate::safekeeper::SafekeeperNode; use crate::safekeeper::SafekeeperNode;
pub const DEFAULT_PG_VERSION: u32 = 15; pub const DEFAULT_PG_VERSION: u32 = 15;
@@ -36,7 +33,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
// to 'neon_local init --config=<path>' option. See control_plane/simple.conf for // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
// an example. // an example.
// //
#[derive(PartialEq, Eq, Clone, Debug)] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv { pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and // Base directory for all the nodes (the pageserver, safekeepers and
// compute endpoints). // compute endpoints).
@@ -44,99 +41,55 @@ pub struct LocalEnv {
// This is not stored in the config file. Rather, this is the path where the // This is not stored in the config file. Rather, this is the path where the
// config file itself is. It is read from the NEON_REPO_DIR env variable or // config file itself is. It is read from the NEON_REPO_DIR env variable or
// '.neon' if not given. // '.neon' if not given.
#[serde(skip)]
pub base_data_dir: PathBuf, pub base_data_dir: PathBuf,
// Path to postgres distribution. It's expected that "bin", "include", // Path to postgres distribution. It's expected that "bin", "include",
// "lib", "share" from postgres distribution are there. If at some point // "lib", "share" from postgres distribution are there. If at some point
// in time we will be able to run against vanilla postgres we may split that // in time we will be able to run against vanilla postgres we may split that
// to four separate paths and match OS-specific installation layout. // to four separate paths and match OS-specific installation layout.
#[serde(default)]
pub pg_distrib_dir: PathBuf, pub pg_distrib_dir: PathBuf,
// Path to pageserver binary. // Path to pageserver binary.
#[serde(default)]
pub neon_distrib_dir: PathBuf, pub neon_distrib_dir: PathBuf,
// Default tenant ID to use with the 'neon_local' command line utility, when // Default tenant ID to use with the 'neon_local' command line utility, when
// --tenant_id is not explicitly specified. // --tenant_id is not explicitly specified.
#[serde(default)]
pub default_tenant_id: Option<TenantId>, pub default_tenant_id: Option<TenantId>,
// used to issue tokens during e.g pg start // used to issue tokens during e.g pg start
#[serde(default)]
pub private_key_path: PathBuf, pub private_key_path: PathBuf,
pub broker: NeonBroker, pub broker: NeonBroker,
// Configuration for the storage controller (1 per neon_local environment)
pub storage_controller: NeonStorageControllerConf,
/// This Vec must always contain at least one pageserver /// This Vec must always contain at least one pageserver
/// Populdated by [`Self::load_config`] from the individual `pageserver.toml`s.
/// NB: not used anymore except for informing users that they need to change their `.neon/config`.
pub pageservers: Vec<PageServerConf>, pub pageservers: Vec<PageServerConf>,
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>, pub safekeepers: Vec<SafekeeperConf>,
// Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will
// be propagated into each pageserver's configuration. // be propagated into each pageserver's configuration.
#[serde(default)]
pub control_plane_api: Option<Url>, pub control_plane_api: Option<Url>,
// Control plane upcall API for storage controller. If set, this will be propagated into the // Control plane upcall API for storage controller. If set, this will be propagated into the
// storage controller's configuration. // storage controller's configuration.
#[serde(default)]
pub control_plane_compute_hook_api: Option<Url>, pub control_plane_compute_hook_api: Option<Url>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
#[serde(default)]
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here, // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
}
/// On-disk state stored in `.neon/config`.
#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
#[serde(default, deny_unknown_fields)]
pub struct OnDiskConfig {
pub pg_distrib_dir: PathBuf,
pub neon_distrib_dir: PathBuf,
pub default_tenant_id: Option<TenantId>,
pub private_key_path: PathBuf,
pub broker: NeonBroker,
pub storage_controller: NeonStorageControllerConf,
#[serde(
skip_serializing,
deserialize_with = "fail_if_pageservers_field_specified"
)]
pub pageservers: Vec<PageServerConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>, branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
} }
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
where
D: serde::Deserializer<'de>,
{
Err(serde::de::Error::custom(
"The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \
Please remove the `pageservers` from your .neon/config.",
))
}
/// The description of the neon_local env to be initialized by `neon_local init --config`.
#[derive(Clone, Debug, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct NeonLocalInitConf {
// TODO: do we need this? Seems unused
pub pg_distrib_dir: Option<PathBuf>,
// TODO: do we need this? Seems unused
pub neon_distrib_dir: Option<PathBuf>,
pub default_tenant_id: TenantId,
pub broker: NeonBroker,
pub storage_controller: Option<NeonStorageControllerConf>,
pub pageservers: Vec<NeonLocalInitPageserverConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Option<Url>>,
pub control_plane_compute_hook_api: Option<Option<Url>>,
}
/// Broker config for cluster internal communication. /// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)] #[serde(default)]
@@ -145,33 +98,6 @@ pub struct NeonBroker {
pub listen_addr: SocketAddr, pub listen_addr: SocketAddr,
} }
/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct NeonStorageControllerConf {
/// Heartbeat timeout before marking a node offline
#[serde(with = "humantime_serde")]
pub max_unavailable: Duration,
/// Threshold for auto-splitting a tenant into shards
pub split_threshold: Option<u64>,
}
impl NeonStorageControllerConf {
// Use a shorter pageserver unavailability interval than the default to speed up tests.
const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
std::time::Duration::from_secs(10);
}
impl Default for NeonStorageControllerConf {
fn default() -> Self {
Self {
max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
split_threshold: None,
}
}
}
// Dummy Default impl to satisfy Deserialize derive. // Dummy Default impl to satisfy Deserialize derive.
impl Default for NeonBroker { impl Default for NeonBroker {
fn default() -> Self { fn default() -> Self {
@@ -187,18 +113,22 @@ impl NeonBroker {
} }
} }
// neon_local needs to know this subset of pageserver configuration.
// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`.
// It can get stale if `pageserver.toml` is changed.
// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml`
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default, deny_unknown_fields)] #[serde(default, deny_unknown_fields)]
pub struct PageServerConf { pub struct PageServerConf {
// node id
pub id: NodeId, pub id: NodeId,
// Pageserver connection settings
pub listen_pg_addr: String, pub listen_pg_addr: String,
pub listen_http_addr: String, pub listen_http_addr: String,
// auth type used for the PG and HTTP ports
pub pg_auth_type: AuthType, pub pg_auth_type: AuthType,
pub http_auth_type: AuthType, pub http_auth_type: AuthType,
pub(crate) virtual_file_io_engine: Option<String>,
pub(crate) get_vectored_impl: Option<String>,
} }
impl Default for PageServerConf { impl Default for PageServerConf {
@@ -209,40 +139,8 @@ impl Default for PageServerConf {
listen_http_addr: String::new(), listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust, pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust,
} virtual_file_io_engine: None,
} get_vectored_impl: None,
}
/// The toml that can be passed to `neon_local init --config`.
/// This is a subset of the `pageserver.toml` configuration.
// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
pub struct NeonLocalInitPageserverConf {
pub id: NodeId,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
#[serde(flatten)]
pub other: HashMap<String, toml::Value>,
}
impl From<&NeonLocalInitPageserverConf> for PageServerConf {
fn from(conf: &NeonLocalInitPageserverConf) -> Self {
let NeonLocalInitPageserverConf {
id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
other: _,
} = conf;
Self {
id: *id,
listen_pg_addr: listen_pg_addr.clone(),
listen_http_addr: listen_http_addr.clone(),
pg_auth_type: *pg_auth_type,
http_auth_type: *http_auth_type,
} }
} }
} }
@@ -258,7 +156,6 @@ pub struct SafekeeperConf {
pub remote_storage: Option<String>, pub remote_storage: Option<String>,
pub backup_threads: Option<u32>, pub backup_threads: Option<u32>,
pub auth_enabled: bool, pub auth_enabled: bool,
pub listen_addr: Option<String>,
} }
impl Default for SafekeeperConf { impl Default for SafekeeperConf {
@@ -272,7 +169,6 @@ impl Default for SafekeeperConf {
remote_storage: None, remote_storage: None,
backup_threads: None, backup_threads: None,
auth_enabled: false, auth_enabled: false,
listen_addr: None,
} }
} }
} }
@@ -430,7 +326,41 @@ impl LocalEnv {
.collect() .collect()
} }
/// Construct `Self` from on-disk state. /// Create a LocalEnv from a config file.
///
/// Unlike 'load_config', this function fills in any defaults that are missing
/// from the config file.
pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
let mut env: LocalEnv = toml::from_str(toml)?;
// Find postgres binaries.
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
// Note that later in the code we assume, that distrib dirs follow the same pattern
// for all postgres versions.
if env.pg_distrib_dir == Path::new("") {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
env.pg_distrib_dir = postgres_bin.into();
} else {
let cwd = env::current_dir()?;
env.pg_distrib_dir = cwd.join("pg_install")
}
}
// Find neon binaries.
if env.neon_distrib_dir == Path::new("") {
env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
}
if env.pageservers.is_empty() {
anyhow::bail!("Configuration must contain at least one pageserver");
}
env.base_data_dir = base_path();
Ok(env)
}
/// Locate and load config
pub fn load_config() -> anyhow::Result<Self> { pub fn load_config() -> anyhow::Result<Self> {
let repopath = base_path(); let repopath = base_path();
@@ -444,129 +374,38 @@ impl LocalEnv {
// TODO: check that it looks like a neon repository // TODO: check that it looks like a neon repository
// load and parse file // load and parse file
let config_file_contents = fs::read_to_string(repopath.join("config"))?; let config = fs::read_to_string(repopath.join("config"))?;
let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?; let mut env: LocalEnv = toml::from_str(config.as_str())?;
let mut env = {
let OnDiskConfig {
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id,
private_key_path,
broker,
storage_controller,
pageservers,
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
branch_name_mappings,
} = on_disk_config;
LocalEnv {
base_data_dir: repopath.clone(),
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id,
private_key_path,
broker,
storage_controller,
pageservers,
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
branch_name_mappings,
}
};
// The source of truth for pageserver configuration is the pageserver.toml. env.base_data_dir = repopath;
assert!(
env.pageservers.is_empty(),
"we ensure this during deserialization"
);
env.pageservers = {
let iter = std::fs::read_dir(&repopath).context("open dir")?;
let mut pageservers = Vec::new();
for res in iter {
let dentry = res?;
const PREFIX: &str = "pageserver_";
let dentry_name = dentry
.file_name()
.into_string()
.ok()
.with_context(|| format!("non-utf8 dentry: {:?}", dentry.path()))
.unwrap();
if !dentry_name.starts_with(PREFIX) {
continue;
}
if !dentry.file_type().context("determine file type")?.is_dir() {
anyhow::bail!("expected a directory, got {:?}", dentry.path());
}
let id = dentry_name[PREFIX.len()..]
.parse::<NodeId>()
.with_context(|| format!("parse id from {:?}", dentry.path()))?;
// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
#[derive(serde::Serialize, serde::Deserialize)]
// (allow unknown fields, unlike PageServerConf)
struct PageserverConfigTomlSubset {
id: NodeId,
listen_pg_addr: String,
listen_http_addr: String,
pg_auth_type: AuthType,
http_auth_type: AuthType,
}
let config_toml_path = dentry.path().join("pageserver.toml");
let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
&std::fs::read_to_string(&config_toml_path)
.with_context(|| format!("read {:?}", config_toml_path))?,
)
.context("parse pageserver.toml")?;
let PageserverConfigTomlSubset {
id: config_toml_id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
} = config_toml;
let conf = PageServerConf {
id: {
anyhow::ensure!(
config_toml_id == id,
"id mismatch: config_toml.id={config_toml_id} id={id}",
);
id
},
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
};
pageservers.push(conf);
}
pageservers
};
Ok(env) Ok(env)
} }
pub fn persist_config(&self) -> anyhow::Result<()> { pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
Self::persist_config_impl( // Currently, the user first passes a config file with 'neon_local init --config=<path>'
&self.base_data_dir, // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
&OnDiskConfig { // to .neon/config. TODO: We lose any formatting and comments along the way, which is
pg_distrib_dir: self.pg_distrib_dir.clone(), // a bit sad.
neon_distrib_dir: self.neon_distrib_dir.clone(), let mut conf_content = r#"# This file describes a local deployment of the page server
default_tenant_id: self.default_tenant_id, # and safekeeeper node. It is read by the 'neon_local' command-line
private_key_path: self.private_key_path.clone(), # utility.
broker: self.broker.clone(), "#
storage_controller: self.storage_controller.clone(), .to_string();
pageservers: vec![], // it's skip_serializing anyway
safekeepers: self.safekeepers.clone(), // Convert the LocalEnv to a toml file.
control_plane_api: self.control_plane_api.clone(), //
control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), // This could be as simple as this:
branch_name_mappings: self.branch_name_mappings.clone(), //
}, // conf_content += &toml::to_string_pretty(env)?;
) //
} // But it results in a "values must be emitted before tables". I'm not sure
// why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
// Maybe rust reorders the fields to squeeze avoid padding or something?
// In any case, converting to toml::Value first, and serializing that, works.
// See https://github.com/alexcrichton/toml-rs/issues/142
conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> {
let conf_content = &toml::to_string_pretty(config)?;
let target_config_path = base_path.join("config"); let target_config_path = base_path.join("config");
fs::write(&target_config_path, conf_content).with_context(|| { fs::write(&target_config_path, conf_content).with_context(|| {
format!( format!(
@@ -591,13 +430,17 @@ impl LocalEnv {
} }
} }
/// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`]. //
pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> { // Initialize a new Neon repository
let base_path = base_path(); //
assert_ne!(base_path, Path::new("")); pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
let base_path = &base_path; // check if config already exists
let base_path = &self.base_data_dir;
ensure!(
base_path != Path::new(""),
"repository base path is missing"
);
// create base_path dir
if base_path.exists() { if base_path.exists() {
match force { match force {
InitForceMode::MustNotExist => { InitForceMode::MustNotExist => {
@@ -629,96 +472,70 @@ impl LocalEnv {
} }
} }
} }
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
bail!(
"Can't find postgres binary at {}",
self.pg_bin_dir(pg_version)?.display()
);
}
for binary in ["pageserver", "safekeeper"] {
if !self.neon_distrib_dir.join(binary).exists() {
bail!(
"Can't find binary '{binary}' in neon distrib dir '{}'",
self.neon_distrib_dir.display()
);
}
}
if !base_path.exists() { if !base_path.exists() {
fs::create_dir(base_path)?; fs::create_dir(base_path)?;
} }
let NeonLocalInitConf {
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id,
broker,
storage_controller,
pageservers,
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
} = conf;
// Find postgres binaries.
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
// Note that later in the code we assume, that distrib dirs follow the same pattern
// for all postgres versions.
let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
postgres_bin.into()
} else {
let cwd = env::current_dir().unwrap();
cwd.join("pg_install")
}
});
// Find neon binaries.
let neon_distrib_dir = neon_distrib_dir
.unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned());
// Generate keypair for JWT. // Generate keypair for JWT.
// //
// The keypair is only needed if authentication is enabled in any of the // The keypair is only needed if authentication is enabled in any of the
// components. For convenience, we generate the keypair even if authentication // components. For convenience, we generate the keypair even if authentication
// is not enabled, so that you can easily enable it after the initialization // is not enabled, so that you can easily enable it after the initialization
// step. // step. However, if the key generation fails, we treat it as non-fatal if
generate_auth_keys( // authentication was not enabled.
base_path.join("auth_private_key.pem").as_path(), if self.private_key_path == PathBuf::new() {
base_path.join("auth_public_key.pem").as_path(), match generate_auth_keys(
) base_path.join("auth_private_key.pem").as_path(),
.context("generate auth keys")?; base_path.join("auth_public_key.pem").as_path(),
let private_key_path = PathBuf::from("auth_private_key.pem"); ) {
Ok(()) => {
// create the runtime type because the remaining initialization code below needs self.private_key_path = PathBuf::from("auth_private_key.pem");
// a LocalEnv instance op operation }
// TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state Err(e) => {
let env = LocalEnv { if !self.auth_keys_needed() {
base_data_dir: base_path.clone(), eprintln!("Could not generate keypair for JWT authentication: {e}");
pg_distrib_dir, eprintln!("Continuing anyway because authentication was not enabled");
neon_distrib_dir, self.private_key_path = PathBuf::from("auth_private_key.pem");
default_tenant_id: Some(default_tenant_id), } else {
private_key_path, return Err(e);
broker, }
storage_controller: storage_controller.unwrap_or_default(), }
pageservers: pageservers.iter().map(Into::into).collect(), }
safekeepers,
control_plane_api: control_plane_api.unwrap_or_default(),
control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
branch_name_mappings: Default::default(),
};
// create endpoints dir
fs::create_dir_all(env.endpoints_path())?;
// create safekeeper dirs
for safekeeper in &env.safekeepers {
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
} }
// initialize pageserver state fs::create_dir_all(self.endpoints_path())?;
for (i, ps) in pageservers.into_iter().enumerate() {
let runtime_ps = &env.pageservers[i]; for safekeeper in &self.safekeepers {
assert_eq!(&PageServerConf::from(&ps), runtime_ps); fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
fs::create_dir(env.pageserver_data_dir(ps.id))?;
PageServerNode::from_env(&env, runtime_ps)
.initialize(ps)
.context("pageserver init failed")?;
} }
// setup remote remote location for default LocalFs remote storage self.persist_config(base_path)
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; }
env.persist_config() fn auth_keys_needed(&self) -> bool {
self.pageservers.iter().any(|ps| {
ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
}) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
} }
} }
pub fn base_path() -> PathBuf { fn base_path() -> PathBuf {
match std::env::var_os("NEON_REPO_DIR") { match std::env::var_os("NEON_REPO_DIR") {
Some(val) => PathBuf::from(val), Some(val) => PathBuf::from(val),
None => PathBuf::from(".neon"), None => PathBuf::from(".neon"),
@@ -761,3 +578,31 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
} }
Ok(()) Ok(())
} }
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn simple_conf_parsing() {
let simple_conf_toml = include_str!("../simple.conf");
let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
assert!(
simple_conf_parse_result.is_ok(),
"failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
);
let string_to_replace = "listen_addr = '127.0.0.1:50051'";
let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
assert!(
spoiled_url_toml.contains(spoiled_url_str),
"Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
);
let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
assert!(
spoiled_url_parse_result.is_err(),
"expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
);
}
}

View File

@@ -4,21 +4,21 @@
//! //!
//! .neon/ //! .neon/
//! //!
use std::borrow::Cow;
use std::collections::HashMap; use std::collections::HashMap;
use std::io; use std::io;
use std::io::Write; use std::io::Write;
use std::num::NonZeroU64; use std::num::NonZeroU64;
use std::path::PathBuf; use std::path::PathBuf;
use std::str::FromStr; use std::process::Command;
use std::time::Duration; use std::time::Duration;
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use camino::Utf8PathBuf; use camino::Utf8PathBuf;
use futures::SinkExt; use futures::SinkExt;
use pageserver_api::models::{ use pageserver_api::models::{
self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
TimelineInfo,
}; };
use pageserver_api::shard::TenantShardId; use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api; use pageserver_client::mgmt_api;
@@ -30,7 +30,7 @@ use utils::{
lsn::Lsn, lsn::Lsn,
}; };
use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf}; use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv}; use crate::{background_process, local_env::LocalEnv};
/// Directory within .neon which will be used by default for LocalFs remote storage. /// Directory within .neon which will be used by default for LocalFs remote storage.
@@ -74,23 +74,57 @@ impl PageServerNode {
} }
} }
fn pageserver_init_make_toml( /// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration.
&self, ///
conf: NeonLocalInitPageserverConf, /// These all end up on the command line of the `pageserver` binary.
) -> anyhow::Result<toml_edit::Document> { fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
// TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
let pg_distrib_dir_param = format!( let pg_distrib_dir_param = format!(
"pg_distrib_dir='{}'", "pg_distrib_dir='{}'",
self.env.pg_distrib_dir_raw().display() self.env.pg_distrib_dir_raw().display()
); );
let PageServerConf {
id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
virtual_file_io_engine,
get_vectored_impl,
} = &self.conf;
let id = format!("id={}", id);
let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
} else {
String::new()
};
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
format!("get_vectored_impl='{get_vectored_impl}'")
} else {
String::new()
};
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; let mut overrides = vec![
id,
pg_distrib_dir_param,
http_auth_type_param,
pg_auth_type_param,
listen_http_addr_param,
listen_pg_addr_param,
broker_endpoint_param,
virtual_file_io_engine,
get_vectored_impl,
];
if let Some(control_plane_api) = &self.env.control_plane_api { if let Some(control_plane_api) = &self.env.control_plane_api {
overrides.push(format!( overrides.push(format!(
@@ -100,7 +134,7 @@ impl PageServerNode {
// Storage controller uses the same auth as pageserver: if JWT is enabled // Storage controller uses the same auth as pageserver: if JWT is enabled
// for us, we will also need it to talk to them. // for us, we will also need it to talk to them.
if matches!(conf.http_auth_type, AuthType::NeonJWT) { if matches!(http_auth_type, AuthType::NeonJWT) {
let jwt_token = self let jwt_token = self
.env .env
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -109,40 +143,31 @@ impl PageServerNode {
} }
} }
if !conf.other.contains_key("remote_storage") { if !cli_overrides
.iter()
.any(|c| c.starts_with("remote_storage"))
{
overrides.push(format!( overrides.push(format!(
"remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}" "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
)); ));
} }
if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust { if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
// Keys are generated in the toplevel repo dir, pageservers' workdirs // Keys are generated in the toplevel repo dir, pageservers' workdirs
// are one level below that, so refer to keys with ../ // are one level below that, so refer to keys with ../
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
} }
// Apply the user-provided overrides // Apply the user-provided overrides
overrides.push( overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
toml_edit::ser::to_string_pretty(&conf)
.expect("we deserialized this from toml earlier"),
);
// Turn `overrides` into a toml document. overrides
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
let mut config_toml = toml_edit::Document::new();
for fragment_str in overrides {
let fragment = toml_edit::Document::from_str(&fragment_str)
.expect("all fragments in `overrides` are valid toml documents, this function controls that");
for (key, item) in fragment.iter() {
config_toml.insert(key, item.clone());
}
}
Ok(config_toml)
} }
/// Initializes a pageserver node by creating its config with the overrides provided. /// Initializes a pageserver node by creating its config with the overrides provided.
pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
self.pageserver_init(conf) // First, run `pageserver --init` and wait for it to write a config into FS and exit.
self.pageserver_init(config_overrides)
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id)) .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
} }
@@ -158,11 +183,11 @@ impl PageServerNode {
.expect("non-Unicode path") .expect("non-Unicode path")
} }
pub async fn start(&self) -> anyhow::Result<()> { pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
self.start_node().await self.start_node(config_overrides, false).await
} }
fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
let datadir = self.repo_path(); let datadir = self.repo_path();
let node_id = self.conf.id; let node_id = self.conf.id;
println!( println!(
@@ -173,20 +198,29 @@ impl PageServerNode {
); );
io::stdout().flush()?; io::stdout().flush()?;
let config = self if !datadir.exists() {
.pageserver_init_make_toml(conf) std::fs::create_dir(&datadir)?;
.context("make pageserver toml")?; }
let config_file_path = datadir.join("pageserver.toml");
let mut config_file = std::fs::OpenOptions::new() let datadir_path_str = datadir.to_str().with_context(|| {
.create_new(true) format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
.write(true) })?;
.open(&config_file_path) let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
.with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?; args.push(Cow::Borrowed("--init"));
config_file
.write_all(config.to_string().as_bytes()) let init_output = Command::new(self.env.pageserver_bin())
.context("write pageserver toml")?; .args(args.iter().map(Cow::as_ref))
drop(config_file); .envs(self.pageserver_env_variables()?)
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config .output()
.with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
anyhow::ensure!(
init_output.status.success(),
"Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
node_id,
String::from_utf8_lossy(&init_output.stdout),
String::from_utf8_lossy(&init_output.stderr),
);
// Write metadata file, used by pageserver on startup to register itself with // Write metadata file, used by pageserver on startup to register itself with
// the storage controller // the storage controller
@@ -200,13 +234,12 @@ impl PageServerNode {
// situation: the metadata is written by some other script. // situation: the metadata is written by some other script.
std::fs::write( std::fs::write(
metadata_path, metadata_path,
serde_json::to_vec(&pageserver_api::config::NodeMetadata { serde_json::to_vec(&serde_json::json!({
postgres_host: "localhost".to_string(), "host": "localhost",
postgres_port: self.pg_connection_config.port(), "port": self.pg_connection_config.port(),
http_host: "localhost".to_string(), "http_host": "localhost",
http_port, "http_port": http_port,
other: HashMap::new(), }))
})
.unwrap(), .unwrap(),
) )
.expect("Failed to write metadata file"); .expect("Failed to write metadata file");
@@ -214,7 +247,11 @@ impl PageServerNode {
Ok(()) Ok(())
} }
async fn start_node(&self) -> anyhow::Result<()> { async fn start_node(
&self,
config_overrides: &[&str],
update_config: bool,
) -> anyhow::Result<()> {
// TODO: using a thread here because start_process() is not async but we need to call check_status() // TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path(); let datadir = self.repo_path();
print!( print!(
@@ -231,12 +268,15 @@ impl PageServerNode {
self.conf.id, datadir, self.conf.id, datadir,
) )
})?; })?;
let args = vec!["-D", datadir_path_str]; let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
if update_config {
args.push(Cow::Borrowed("--update-config"));
}
background_process::start_process( background_process::start_process(
"pageserver", "pageserver",
&datadir, &datadir,
&self.env.pageserver_bin(), &self.env.pageserver_bin(),
args, args.iter().map(Cow::as_ref),
self.pageserver_env_variables()?, self.pageserver_env_variables()?,
background_process::InitialPidFile::Expect(self.pid_file()), background_process::InitialPidFile::Expect(self.pid_file()),
|| async { || async {
@@ -253,6 +293,22 @@ impl PageServerNode {
Ok(()) Ok(())
} }
fn pageserver_basic_args<'a>(
&self,
config_overrides: &'a [&'a str],
datadir_path_str: &'a str,
) -> Vec<Cow<'a, str>> {
let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
let overrides = self.neon_local_overrides(config_overrides);
for config_override in overrides {
args.push(Cow::Borrowed("-c"));
args.push(Cow::Owned(config_override));
}
args
}
fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> { fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
// FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
// needs a token, and how to generate that token, seems independent to whether // needs a token, and how to generate that token, seems independent to whether
@@ -333,10 +389,6 @@ impl PageServerNode {
.remove("image_creation_threshold") .remove("image_creation_threshold")
.map(|x| x.parse::<usize>()) .map(|x| x.parse::<usize>())
.transpose()?, .transpose()?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout") .remove("walreceiver_connect_timeout")
@@ -378,11 +430,6 @@ impl PageServerNode {
.map(serde_json::from_str) .map(serde_json::from_str)
.transpose() .transpose()
.context("parse `timeline_get_throttle` from json")?, .context("parse `timeline_get_throttle` from json")?,
switch_aux_file_policy: settings
.remove("switch_aux_file_policy")
.map(|x| x.parse::<AuxFilePolicy>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
}; };
if !settings.is_empty() { if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}") bail!("Unrecognized tenant settings: {settings:?}")
@@ -454,12 +501,6 @@ impl PageServerNode {
.map(|x| x.parse::<usize>()) .map(|x| x.parse::<usize>())
.transpose() .transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?, .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout") .remove("walreceiver_connect_timeout")
@@ -501,11 +542,6 @@ impl PageServerNode {
.map(serde_json::from_str) .map(serde_json::from_str)
.transpose() .transpose()
.context("parse `timeline_get_throttle` from json")?, .context("parse `timeline_get_throttle` from json")?,
switch_aux_file_policy: settings
.remove("switch_aux_file_policy")
.map(|x| x.parse::<AuxFilePolicy>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
} }
}; };

View File

@@ -70,31 +70,24 @@ pub struct SafekeeperNode {
pub pg_connection_config: PgConnectionConfig, pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv, pub env: LocalEnv,
pub http_client: reqwest::Client, pub http_client: reqwest::Client,
pub listen_addr: String,
pub http_base_url: String, pub http_base_url: String,
} }
impl SafekeeperNode { impl SafekeeperNode {
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
listen_addr.clone()
} else {
"127.0.0.1".to_string()
};
SafekeeperNode { SafekeeperNode {
id: conf.id, id: conf.id,
conf: conf.clone(), conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port), pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(), env: env.clone(),
http_client: reqwest::Client::new(), http_client: reqwest::Client::new(),
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port), http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
listen_addr,
} }
} }
/// Construct libpq connection string for connecting to this safekeeper. /// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig { fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port) PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
} }
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
@@ -118,8 +111,8 @@ impl SafekeeperNode {
); );
io::stdout().flush().unwrap(); io::stdout().flush().unwrap();
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port); let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port); let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
let id = self.id; let id = self.id;
let datadir = self.datadir_path(); let datadir = self.datadir_path();
@@ -146,7 +139,7 @@ impl SafekeeperNode {
availability_zone, availability_zone,
]; ];
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port { if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port); let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]); args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
} }
if !self.conf.sync { if !self.conf.sync {

View File

@@ -1,8 +1,6 @@
use crate::{ use crate::{background_process, local_env::LocalEnv};
background_process,
local_env::{LocalEnv, NeonStorageControllerConf},
};
use camino::{Utf8Path, Utf8PathBuf}; use camino::{Utf8Path, Utf8PathBuf};
use hyper::Method;
use pageserver_api::{ use pageserver_api::{
controller_api::{ controller_api::{
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
@@ -16,7 +14,6 @@ use pageserver_api::{
}; };
use pageserver_client::mgmt_api::ResponseErrorMessageExt; use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType; use postgres_backend::AuthType;
use reqwest::Method;
use serde::{de::DeserializeOwned, Deserialize, Serialize}; use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{fs, str::FromStr}; use std::{fs, str::FromStr};
use tokio::process::Command; use tokio::process::Command;
@@ -35,13 +32,15 @@ pub struct StorageController {
public_key: Option<String>, public_key: Option<String>,
postgres_port: u16, postgres_port: u16,
client: reqwest::Client, client: reqwest::Client,
config: NeonStorageControllerConf,
} }
const COMMAND: &str = "storage_controller"; const COMMAND: &str = "storage_controller";
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
// Use a shorter pageserver unavailability interval than the default to speed up tests.
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct AttachHookRequest { pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId, pub tenant_shard_id: TenantShardId,
@@ -136,7 +135,6 @@ impl StorageController {
client: reqwest::ClientBuilder::new() client: reqwest::ClientBuilder::new()
.build() .build()
.expect("Failed to construct http client"), .expect("Failed to construct http client"),
config: env.storage_controller.clone(),
} }
} }
@@ -274,6 +272,8 @@ impl StorageController {
// Run migrations on every startup, in case something changed. // Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?; let database_url = self.setup_database().await?;
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
let mut args = vec![ let mut args = vec![
"-l", "-l",
&self.listen, &self.listen,
@@ -283,7 +283,7 @@ impl StorageController {
"--database-url", "--database-url",
&database_url, &database_url,
"--max-unavailable-interval", "--max-unavailable-interval",
&humantime::Duration::from(self.config.max_unavailable).to_string(), &max_unavailable.to_string(),
] ]
.into_iter() .into_iter()
.map(|s| s.to_string()) .map(|s| s.to_string())
@@ -305,10 +305,6 @@ impl StorageController {
)); ));
} }
if let Some(split_threshold) = self.config.split_threshold.as_ref() {
args.push(format!("--split-threshold={split_threshold}"))
}
background_process::start_process( background_process::start_process(
COMMAND, COMMAND,
&self.env.base_data_dir, &self.env.base_data_dir,
@@ -383,7 +379,7 @@ impl StorageController {
/// Simple HTTP request wrapper for calling into storage controller /// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>( async fn dispatch<RQ, RS>(
&self, &self,
method: reqwest::Method, method: hyper::Method,
path: String, path: String,
body: Option<RQ>, body: Option<RQ>,
) -> anyhow::Result<RS> ) -> anyhow::Result<RS>
@@ -476,16 +472,6 @@ impl StorageController {
.await .await
} }
#[instrument(skip(self))]
pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
self.dispatch::<(), TenantCreateResponse>(
Method::POST,
format!("debug/v1/tenant/{tenant_id}/import"),
None,
)
.await
}
#[instrument(skip(self))] #[instrument(skip(self))]
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> { pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>( self.dispatch::<(), _>(

View File

@@ -1,23 +0,0 @@
[package]
name = "storcon_cli"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -1,680 +0,0 @@
use std::{collections::HashMap, str::FromStr, time::Duration};
use clap::{Parser, Subcommand};
use pageserver_api::{
controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
TenantDescribeResponse, TenantPolicyRequest,
},
models::{
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::{Method, StatusCode, Url};
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
};
// Subcommands understood by `storcon_cli`. Each variant maps to one CLI
// subcommand via clap's `Subcommand` derive; the `///` doc comments below are
// surfaced verbatim as `--help` text, so they are user-facing strings.
#[derive(Subcommand, Debug)]
enum Command {
    /// Register a pageserver with the storage controller.  This shouldn't usually be necessary,
    /// since pageservers auto-register when they start up
    NodeRegister {
        #[arg(long)]
        node_id: NodeId,

        #[arg(long)]
        listen_pg_addr: String,
        #[arg(long)]
        listen_pg_port: u16,

        #[arg(long)]
        listen_http_addr: String,
        #[arg(long)]
        listen_http_port: u16,
    },
    /// Modify a node's configuration in the storage controller
    NodeConfigure {
        #[arg(long)]
        node_id: NodeId,

        /// Availability is usually auto-detected based on heartbeats.  Set 'offline' here to
        /// manually mark a node offline
        #[arg(long)]
        availability: Option<NodeAvailabilityArg>,
        /// Scheduling policy controls whether tenant shards may be scheduled onto this node.
        #[arg(long)]
        scheduling: Option<NodeSchedulingPolicy>,
    },
    /// Modify a tenant's policies in the storage controller
    TenantPolicy {
        #[arg(long)]
        tenant_id: TenantId,
        /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
        /// or is in the normal attached state with N secondary locations (`attached:N`)
        #[arg(long)]
        placement: Option<PlacementPolicyArg>,
        /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant.  `active` is normal,
        /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
        /// all reconciliation activity including for scheduling changes already made.  `pause` and `stop` can make a tenant
        /// unavailable, and are only for use in emergencies.
        #[arg(long)]
        scheduling: Option<ShardSchedulingPolicyArg>,
    },
    /// List nodes known to the storage controller
    Nodes {},
    /// List tenants known to the storage controller
    Tenants {},
    /// Create a new tenant in the storage controller, and by extension on pageservers.
    TenantCreate {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Delete a tenant in the storage controller, and by extension on pageservers.
    TenantDelete {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Split an existing tenant into a higher number of shards than its current shard count.
    TenantShardSplit {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        shard_count: u8,
        /// Optional, in 8kiB pages.  e.g. set 2048 for 16MB stripes.
        #[arg(long)]
        stripe_size: Option<u32>,
    },
    /// Migrate the attached location for a tenant shard to a specific pageserver.
    TenantShardMigrate {
        #[arg(long)]
        tenant_shard_id: TenantShardId,
        #[arg(long)]
        node: NodeId,
    },
    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
    /// that is passed through to pageservers, and does not affect storage controller behavior.
    TenantConfig {
        #[arg(long)]
        tenant_id: TenantId,
        // Raw JSON string; parsed with serde_json in `main` before dispatch.
        #[arg(long)]
        config: String,
    },
    /// Attempt to balance the locations for a tenant across pageservers.  This is a client-side
    /// alternative to the storage controller's scheduling optimization behavior.
    TenantScatter {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Print details about a particular tenant, including all its shards' states.
    TenantDescribe {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
    /// mode so that it can warm up content on a pageserver.
    TenantWarmup {
        #[arg(long)]
        tenant_id: TenantId,
    },
}
// Top-level argument parser for `storcon_cli`. The `///` doc comments on the
// fields below become `--help` text via clap's derive, so they are user-facing.
#[derive(Parser)]
#[command(
    author,
    version,
    about,
    long_about = "CLI for Storage Controller Support/Debug"
)]
#[command(arg_required_else_help(true))]
struct Cli {
    #[arg(long)]
    /// URL to storage controller.  e.g. http://127.0.0.1:1234 when using `neon_local`
    api: Url,

    #[arg(long)]
    /// JWT token for authenticating with storage controller.  Depending on the API used, this
    /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
    /// a token with both scopes to use with this tool.
    jwt: Option<String>,

    #[command(subcommand)]
    command: Command,
}
/// Newtype wrapper so a [`PlacementPolicy`] can be parsed from a CLI argument.
#[derive(Debug, Clone)]
struct PlacementPolicyArg(PlacementPolicy);

impl FromStr for PlacementPolicyArg {
    type Err = anyhow::Error;

    /// Parses `detached`, `secondary`, or `attached:<n>` where `<n>` is the
    /// number of secondary locations to keep alongside the attached one.
    ///
    /// # Errors
    /// Returns an error for any other input. Unlike the previous
    /// implementation, an `attached:` form whose suffix is not a plain
    /// non-negative integer (e.g. `attached:1:2`) is rejected rather than
    /// silently truncated to `attached:1`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "detached" => Ok(Self(PlacementPolicy::Detached)),
            "secondary" => Ok(Self(PlacementPolicy::Secondary)),
            _ => {
                if let Some(suffix) = s.strip_prefix("attached:") {
                    // Require the *entire* suffix to parse so trailing garbage
                    // after the count is an error instead of being ignored.
                    suffix
                        .parse::<usize>()
                        .map(|n| Self(PlacementPolicy::Attached(n)))
                        .map_err(|_| {
                            anyhow::anyhow!(
                                "Invalid format '{s}', a valid example is 'attached:1'"
                            )
                        })
                } else {
                    Err(anyhow::anyhow!(
                        "Unknown placement policy '{s}', try detached,secondary,attached:<n>"
                    ))
                }
            }
        }
    }
}
/// Newtype wrapper that parses a [`ShardSchedulingPolicy`] from its lower-case
/// CLI spelling.
#[derive(Debug, Clone)]
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);

impl FromStr for ShardSchedulingPolicyArg {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let policy = match s {
            "active" => ShardSchedulingPolicy::Active,
            "essential" => ShardSchedulingPolicy::Essential,
            "pause" => ShardSchedulingPolicy::Pause,
            "stop" => ShardSchedulingPolicy::Stop,
            _ => {
                return Err(anyhow::anyhow!(
                    "Unknown scheduling policy '{s}', try active,essential,pause,stop"
                ))
            }
        };
        Ok(Self(policy))
    }
}
/// Newtype wrapper that parses a node availability state from the command line.
#[derive(Debug, Clone)]
struct NodeAvailabilityArg(NodeAvailabilityWrapper);

impl FromStr for NodeAvailabilityArg {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let state = match s {
            "active" => NodeAvailabilityWrapper::Active,
            "offline" => NodeAvailabilityWrapper::Offline,
            _ => return Err(anyhow::anyhow!("Unknown availability state '{s}'")),
        };
        Ok(Self(state))
    }
}
/// Thin HTTP client for talking to the storage controller's management API.
struct Client {
    // Controller endpoint exactly as given on the command line (`--api`).
    base_url: Url,
    // Optional bearer token attached to every request (`--jwt`).
    jwt_token: Option<String>,
    client: reqwest::Client,
}
impl Client {
    /// Builds a client for the controller at `base_url`, optionally sending
    /// `jwt_token` as a bearer token on every request.
    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
        Self {
            base_url,
            jwt_token,
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
        }
    }

    /// Simple HTTP request wrapper for calling into storage controller
    ///
    /// Serializes `body` (if any) as JSON, deserializes the JSON response into
    /// `RS`, and converts HTTP error bodies into `mgmt_api::Error`.
    async fn dispatch<RQ, RS>(
        &self,
        method: Method,
        path: String,
        body: Option<RQ>,
    ) -> mgmt_api::Result<RS>
    where
        RQ: Serialize + Sized,
        RS: DeserializeOwned + Sized,
    {
        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
        // for general purpose API access.
        //
        // Use `port_or_known_default` so that URLs without an explicit port
        // (e.g. `http://controller.example.com/`) fall back to the scheme's
        // default (80/443) instead of panicking on `port() == None`.
        let url = Url::from_str(&format!(
            "http://{}:{}/{path}",
            self.base_url.host_str().expect("--api URL has no host"),
            self.base_url
                .port_or_known_default()
                .expect("--api URL has no port and no default for its scheme")
        ))
        .unwrap();

        let mut builder = self.client.request(method, url);
        if let Some(body) = body {
            builder = builder.json(&body)
        }
        if let Some(jwt_token) = &self.jwt_token {
            builder = builder.header(
                reqwest::header::AUTHORIZATION,
                format!("Bearer {jwt_token}"),
            );
        }

        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
        let response = response.error_from_body().await?;

        response
            .json()
            .await
            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
    }
}
#[tokio::main]
// Entry point: parse CLI arguments, build two clients (a raw `Client` for
// controller-only endpoints and an `mgmt_api::Client` for pageserver-style
// endpoints proxied through the controller), then dispatch on the subcommand.
async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());

    // Drop the last character of the rendered URL.
    // NOTE(review): assumes `Url::to_string()` ends with a trailing '/' here,
    // which `mgmt_api::Client` does not want — confirm for URLs with a path.
    let mut trimmed = cli.api.to_string();
    trimmed.pop();
    let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());

    match cli.command {
        // Register a pageserver node with the controller.
        Command::NodeRegister {
            node_id,
            listen_pg_addr,
            listen_pg_port,
            listen_http_addr,
            listen_http_port,
        } => {
            storcon_client
                .dispatch::<_, ()>(
                    Method::POST,
                    "control/v1/node".to_string(),
                    Some(NodeRegisterRequest {
                        node_id,
                        listen_pg_addr,
                        listen_pg_port,
                        listen_http_addr,
                        listen_http_port,
                    }),
                )
                .await?;
        }
        // Create an unsharded tenant, attached with one secondary location.
        Command::TenantCreate { tenant_id } => {
            vps_client
                .tenant_create(&TenantCreateRequest {
                    new_tenant_id: TenantShardId::unsharded(tenant_id),
                    generation: None,
                    shard_parameters: ShardParameters::default(),
                    placement_policy: Some(PlacementPolicy::Attached(1)),
                    config: TenantConfig::default(),
                })
                .await?;
        }
        Command::TenantDelete { tenant_id } => {
            let status = vps_client
                .tenant_delete(TenantShardId::unsharded(tenant_id))
                .await?;
            tracing::info!("Delete status: {}", status);
        }
        // List all nodes in a human-readable table.
        Command::Nodes {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
            for node in resp {
                table.add_row([
                    format!("{}", node.id),
                    node.listen_http_addr,
                    format!("{:?}", node.scheduling),
                    format!("{:?}", node.availability),
                ]);
            }
            println!("{table}");
        }
        Command::NodeConfigure {
            node_id,
            availability,
            scheduling,
        } => {
            let req = NodeConfigureRequest {
                node_id,
                // Unwrap the CLI newtype wrappers into the API types.
                availability: availability.map(|a| a.0),
                scheduling,
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/node/{node_id}/config"),
                    Some(req),
                )
                .await?;
        }
        // List all tenants; shard-zero is used for per-shard display fields.
        Command::Tenants {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
                    "control/v1/tenant".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header([
                "TenantId",
                "ShardCount",
                "StripeSize",
                "Placement",
                "Scheduling",
            ]);
            for tenant in resp {
                let shard_zero = tenant.shards.into_iter().next().unwrap();
                table.add_row([
                    format!("{}", tenant.tenant_id),
                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
                    format!("{:?}", tenant.stripe_size),
                    format!("{:?}", tenant.policy),
                    format!("{:?}", shard_zero.scheduling_policy),
                ]);
            }
            println!("{table}");
        }
        Command::TenantPolicy {
            tenant_id,
            placement,
            scheduling,
        } => {
            let req = TenantPolicyRequest {
                scheduling: scheduling.map(|s| s.0),
                placement: placement.map(|p| p.0),
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/policy"),
                    Some(req),
                )
                .await?;
        }
        Command::TenantShardSplit {
            tenant_id,
            shard_count,
            stripe_size,
        } => {
            let req = TenantShardSplitRequest {
                new_shard_count: shard_count,
                new_stripe_size: stripe_size.map(ShardStripeSize),
            };

            let response = storcon_client
                .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/shard_split"),
                    Some(req),
                )
                .await?;
            println!(
                "Split tenant {} into {} shards: {}",
                tenant_id,
                shard_count,
                response
                    .new_shards
                    .iter()
                    .map(|s| format!("{:?}", s))
                    .collect::<Vec<_>>()
                    .join(",")
            );
        }
        Command::TenantShardMigrate {
            tenant_shard_id,
            node,
        } => {
            let req = TenantShardMigrateRequest {
                tenant_shard_id,
                node_id: node,
            };

            storcon_client
                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_shard_id}/migrate"),
                    Some(req),
                )
                .await?;
        }
        // Pass a raw JSON tenant config through to the pageservers.
        Command::TenantConfig { tenant_id, config } => {
            let tenant_conf = serde_json::from_str(&config)?;

            vps_client
                .tenant_config(&TenantConfigRequest {
                    tenant_id,
                    config: tenant_conf,
                })
                .await?;
        }
        // Client-side rebalancing: repeatedly move shards off overloaded nodes
        // onto the node currently holding the fewest, until no node exceeds
        // the per-node average.
        Command::TenantScatter { tenant_id } => {
            // Find the shards
            let locate_response = storcon_client
                .dispatch::<(), TenantLocateResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}/locate"),
                    None,
                )
                .await?;
            let shards = locate_response.shards;

            let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
            let shard_count = shards.len();
            for s in shards {
                let entry = node_to_shards.entry(s.node_id).or_default();
                entry.push(s.shard_id);
            }

            // Load list of available nodes
            let nodes_resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;

            for node in nodes_resp {
                if matches!(node.availability, NodeAvailabilityWrapper::Active) {
                    node_to_shards.entry(node.id).or_default();
                }
            }

            // Integer division: the target ceiling per node.
            // NOTE(review): assumes at least one node holds a shard or is
            // active, otherwise this divides by zero — confirm upstream.
            let max_shard_per_node = shard_count / node_to_shards.len();

            loop {
                let mut migrate_shard = None;
                // NOTE(review): despite the comment below, this pops a shard
                // from *any* overloaded node (the last one scanned wins), not
                // the emptiest — verify intent.
                for shards in node_to_shards.values_mut() {
                    if shards.len() > max_shard_per_node {
                        // Pick the emptiest
                        migrate_shard = Some(shards.pop().unwrap());
                    }
                }
                let Some(migrate_shard) = migrate_shard else {
                    break;
                };

                // Pick the emptiest node to migrate to
                let mut destinations = node_to_shards
                    .iter()
                    .map(|(k, v)| (k, v.len()))
                    .collect::<Vec<_>>();
                destinations.sort_by_key(|i| i.1);
                let (destination_node, destination_count) = *destinations.first().unwrap();
                if destination_count + 1 > max_shard_per_node {
                    // Even the emptiest destination doesn't have space: we're done
                    break;
                }
                let destination_node = *destination_node;

                node_to_shards
                    .get_mut(&destination_node)
                    .unwrap()
                    .push(migrate_shard);

                println!("Migrate {} -> {} ...", migrate_shard, destination_node);

                storcon_client
                    .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                        Method::PUT,
                        format!("control/v1/tenant/{migrate_shard}/migrate"),
                        Some(TenantShardMigrateRequest {
                            tenant_shard_id: migrate_shard,
                            node_id: destination_node,
                        }),
                    )
                    .await?;
                println!("Migrate {} -> {} OK", migrate_shard, destination_node);
            }

            // Spread the shards across the nodes
        }
        // Describe one tenant, one table row per shard.
        Command::TenantDescribe { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await?;
            let shards = describe_response.shards;
            let mut table = comfy_table::Table::new();
            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
            for shard in shards {
                let secondary = shard
                    .node_secondary
                    .iter()
                    .map(|n| format!("{}", n))
                    .collect::<Vec<_>>()
                    .join(",");

                let mut status_parts = Vec::new();
                if shard.is_reconciling {
                    status_parts.push("reconciling");
                }

                if shard.is_pending_compute_notification {
                    status_parts.push("pending_compute");
                }

                if shard.is_splitting {
                    status_parts.push("splitting");
                }
                let status = status_parts.join(",");

                table.add_row([
                    format!("{}", shard.tenant_shard_id),
                    shard
                        .node_attached
                        .map(|n| format!("{}", n))
                        .unwrap_or(String::new()),
                    secondary,
                    shard.last_error,
                    status,
                ]);
            }
            println!("{table}");
        }
        // Onboard a not-yet-managed tenant in secondary mode and poll the
        // secondary download endpoint until its layers are fully warmed.
        Command::TenantWarmup { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await;
            match describe_response {
                Ok(describe) => {
                    if matches!(describe.policy, PlacementPolicy::Secondary) {
                        // Fine: it's already known to controller in secondary mode: calling
                        // again to put it into secondary mode won't cause problems.
                    } else {
                        anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
                    }
                }
                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
                    // Fine: this tenant isn't known to the storage controller yet.
                }
                Err(e) => {
                    // Unexpected API error
                    return Err(e.into());
                }
            }

            vps_client
                .location_config(
                    TenantShardId::unsharded(tenant_id),
                    pageserver_api::models::LocationConfig {
                        mode: pageserver_api::models::LocationConfigMode::Secondary,
                        generation: None,
                        secondary_conf: Some(LocationConfigSecondary { warm: true }),
                        shard_number: 0,
                        shard_count: 0,
                        shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
                        tenant_conf: TenantConfig::default(),
                    },
                    None,
                    true,
                )
                .await?;

            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await?;
            let secondary_ps_id = describe_response
                .shards
                .first()
                .unwrap()
                .node_secondary
                .first()
                .unwrap();
            println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
            loop {
                let (status, progress) = vps_client
                    .tenant_secondary_download(
                        TenantShardId::unsharded(tenant_id),
                        Some(Duration::from_secs(10)),
                    )
                    .await?;
                println!(
                    "Progress: {}/{} layers, {}/{} bytes",
                    progress.layers_downloaded,
                    progress.layers_total,
                    progress.bytes_downloaded,
                    progress.bytes_total
                );
                match status {
                    StatusCode::OK => {
                        println!("Download complete");
                        break;
                    }
                    StatusCode::ACCEPTED => {
                        // Loop
                    }
                    _ => {
                        anyhow::bail!("Unexpected download status: {status}");
                    }
                }
            }
        }
    }

    Ok(())
}

View File

@@ -2,8 +2,8 @@
# see https://diesel.rs/guides/configuring-diesel-cli # see https://diesel.rs/guides/configuring-diesel-cli
[print_schema] [print_schema]
file = "storage_controller/src/schema.rs" file = "control_plane/attachment_service/src/schema.rs"
custom_type_derives = ["diesel::query_builder::QueryId"] custom_type_derives = ["diesel::query_builder::QueryId"]
[migrations_directory] [migrations_directory]
dir = "storage_controller/migrations" dir = "control_plane/attachment_service/migrations"

View File

@@ -7,11 +7,6 @@ Below you will find a brief overview of each subdir in the source tree in alphab
Neon storage broker, providing messaging between safekeepers and pageservers. Neon storage broker, providing messaging between safekeepers and pageservers.
[storage_broker.md](./storage_broker.md) [storage_broker.md](./storage_broker.md)
`storage_controller`:
Neon storage controller, manages a cluster of pageservers and exposes an API that enables
managing a many-sharded tenant as a single entity.
`/control_plane`: `/control_plane`:
Local control plane. Local control plane.

View File

@@ -1,150 +0,0 @@
# Storage Controller
## Concepts
The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
the underlying details of how data is spread across multiple nodes.
The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
## APIs
The storage controller's HTTP server implements four logically separate APIs:
- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that's where clients expect to find it on a pageserver.
- `/control/v1/...` path is the storage controller's API, which enables operations such as registering and managing pageservers, or executing shard splits.
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
to ensure data safety with generation numbers.
The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as the pageserver's API).
See the `http.rs` file in the source for where the HTTP APIs are implemented.
## Database
The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
rebuilt on startup.
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
The `diesel` crate is used for defining models & migrations.
Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller's database.
### Diesel tip: migrations
If you need to modify the database schema, here's how to create a migration:
- Install the diesel CLI with `cargo install diesel_cli`
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you've committed a migration no further steps are needed.
## storcon_cli
The `storcon_cli` tool enables interactive management of the storage controller. This is usually
only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline).
`storcon_cli --help` includes details on commands.
# Deploying
This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
part of a self-hosted system.
_General note: since the default `neon_local` environment includes a storage controller, this is a useful
reference when figuring out deployment._
## Database
It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
Set the URL to the database using the `--database-url` CLI option.
There is no need to run migrations manually: the storage controller automatically applies migrations
when it starts up.
## Configure pageservers to use the storage controller
1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
with the storage controller when it starts up. See the example below for the format of this file.
### Example `metadata.json`
```
{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
```
- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
postgres runs.
- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where
the storage controller runs.
## Handle compute notifications.
The storage controller independently moves tenant attachments between pageservers in response to
changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
```
struct ComputeHookNotifyRequestShard {
node_id: NodeId,
shard_number: ShardNumber,
}
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>,
shards: Vec<ComputeHookNotifyRequestShard>,
}
```
When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
shards identified by `NodeId` must be converted to the address+port of the node.
- if stripe_size is not None, set `neon.stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds.
### Example notification body
```
{
"tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
"stripe_size": 32768,
"shards": [
{"node_id": 344, "shard_number": 0},
{"node_id": 722, "shard_number": 1},
],
}
```

View File

@@ -3,7 +3,7 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize, Serializer}; use serde::{Deserialize, Serialize, Serializer};
use crate::spec::{ComputeSpec, Database, Role}; use crate::spec::ComputeSpec;
#[derive(Serialize, Debug, Deserialize)] #[derive(Serialize, Debug, Deserialize)]
pub struct GenericAPIError { pub struct GenericAPIError {
@@ -113,12 +113,6 @@ pub struct ComputeMetrics {
pub total_ext_download_size: u64, pub total_ext_download_size: u64,
} }
#[derive(Clone, Debug, Default, Serialize)]
pub struct CatalogObjects {
pub roles: Vec<Role>,
pub databases: Vec<Database>,
}
/// Response of the `/computes/{compute_id}/spec` control-plane API. /// Response of the `/computes/{compute_id}/spec` control-plane API.
/// This is not actually a compute API response, so consider moving /// This is not actually a compute API response, so consider moving
/// to a different place. /// to a different place.

View File

@@ -33,23 +33,6 @@ pub struct ComputeSpec {
#[serde(default)] #[serde(default)]
pub features: Vec<ComputeFeature>, pub features: Vec<ComputeFeature>,
/// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs
/// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first
/// received.
///
/// Both this field and `--resize-swap-on-bind` are required, so that the control plane's
/// spec generation doesn't need to be aware of the actual compute it's running on, while
/// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could
/// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus
/// giving every VM much more swap than it should have (32GiB).
///
/// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for
/// enabling the swap resizing behavior once rollout is complete.
///
/// See neondatabase/cloud#12047 for more.
#[serde(default)]
pub swap_size_bytes: Option<u64>,
/// Expected cluster state at the end of transition process. /// Expected cluster state at the end of transition process.
pub cluster: Cluster, pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>, pub delta_operations: Option<Vec<DeltaOp>>,

View File

@@ -10,13 +10,11 @@ libc.workspace = true
once_cell.workspace = true once_cell.workspace = true
chrono.workspace = true chrono.workspace = true
twox-hash.workspace = true twox-hash.workspace = true
measured.workspace = true
workspace_hack.workspace = true workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies] [target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true procfs.workspace = true
measured-process.workspace = true
[dev-dependencies] [dev-dependencies]
rand = "0.8" rand = "0.8"

View File

@@ -7,19 +7,14 @@
//! use significantly less memory than this, but can only approximate the cardinality. //! use significantly less memory than this, but can only approximate the cardinality.
use std::{ use std::{
hash::{BuildHasher, BuildHasherDefault, Hash}, collections::HashMap,
sync::atomic::AtomicU8, hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
sync::{atomic::AtomicU8, Arc, RwLock},
}; };
use measured::{ use prometheus::{
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, core::{self, Describer},
metric::{ proto, Opts,
group::{Encoding, MetricValue},
name::MetricNameEncoder,
Metric, MetricType, MetricVec,
},
text::TextEncoder,
LabelGroup,
}; };
use twox_hash::xxh3; use twox_hash::xxh3;
@@ -98,25 +93,203 @@ macro_rules! register_hll {
/// ``` /// ```
/// ///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha /// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>; #[derive(Clone)]
pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>; pub struct HyperLogLogVec<const N: usize> {
core: Arc<HyperLogLogVecCore<N>>,
pub struct HyperLogLogState<const N: usize> {
shards: [AtomicU8; N],
} }
impl<const N: usize> Default for HyperLogLogState<N> {
fn default() -> Self { struct HyperLogLogVecCore<const N: usize> {
#[allow(clippy::declare_interior_mutable_const)] pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
const ZERO: AtomicU8 = AtomicU8::new(0); pub desc: core::Desc,
Self { shards: [ZERO; N] } pub opts: Opts,
}
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
for child in self.core.children.read().unwrap().values() {
child.core.collect_into(&mut metrics);
}
m.set_metric(metrics);
vec![m]
} }
} }
impl<const N: usize> MetricType for HyperLogLogState<N> { impl<const N: usize> HyperLogLogVec<N> {
type Metadata = (); /// Create a new [`HyperLogLogVec`] based on the provided
/// [`Opts`] and partitioned by the given label names. At least one label name must be
/// provided.
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
let opts = opts.variable_labels(variable_names);
let desc = opts.describe()?;
let v = HyperLogLogVecCore {
children: RwLock::new(HashMap::default()),
desc,
opts,
};
Ok(Self { core: Arc::new(v) })
}
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
self.core.get_metric_with_label_values(vals)
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
self.get_metric_with_label_values(vals).unwrap()
}
} }
impl<const N: usize> HyperLogLogState<N> { impl<const N: usize> HyperLogLogVecCore<N> {
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let h = self.hash_label_values(vals)?;
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
return Ok(metric);
}
self.get_or_create_metric(h, vals)
}
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
if vals.len() != self.desc.variable_labels.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: self.desc.variable_labels.len(),
got: vals.len(),
});
}
let mut h = xxh3::Hash64::default();
for val in vals {
h.write(val.as_bytes());
}
Ok(h.finish())
}
fn get_or_create_metric(
&self,
hash: u64,
label_values: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let mut children = self.children.write().unwrap();
// Check exist first.
if let Some(metric) = children.get(&hash).cloned() {
return Ok(metric);
}
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
children.insert(hash, metric.clone());
Ok(metric)
}
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLog<const N: usize> {
core: Arc<HyperLogLogCore<N>>,
}
impl<const N: usize> HyperLogLog<N> {
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let opts = Opts::new(name, help);
Self::with_opts(opts)
}
/// Create a [`HyperLogLog`] with the `opts` options.
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
Self::with_opts_and_label_values(&opts, &[])
}
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
let desc = opts.describe()?;
let labels = make_label_pairs(&desc, label_values)?;
let v = HyperLogLogCore {
shards: [0; N].map(AtomicU8::new),
desc,
labels,
};
Ok(Self { core: Arc::new(v) })
}
pub fn measure(&self, item: &impl Hash) { pub fn measure(&self, item: &impl Hash) {
// changing the hasher will break compatibility with previous measurements. // changing the hasher will break compatibility with previous measurements.
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item)); self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
@@ -126,11 +299,42 @@ impl<const N: usize> HyperLogLogState<N> {
let p = N.ilog2() as u8; let p = N.ilog2() as u8;
let j = hash & (N as u64 - 1); let j = hash & (N as u64 - 1);
let rho = (hash >> p).leading_zeros() as u8 + 1 - p; let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
}
}
struct HyperLogLogCore<const N: usize> {
shards: [AtomicU8; N],
desc: core::Desc,
labels: Vec<proto::LabelPair>,
}
impl<const N: usize> core::Collector for HyperLogLog<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
} }
fn take_sample(&self) -> [u8; N] { fn collect(&self) -> Vec<proto::MetricFamily> {
self.shards.each_ref().map(|x| { let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
self.core.collect_into(&mut metrics);
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogCore<N> {
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
self.shards.iter().enumerate().for_each(|(i, x)| {
let mut shard_label = proto::LabelPair::default();
shard_label.set_name("hll_shard".to_owned());
shard_label.set_value(format!("{i}"));
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
// This seems like it would be a race condition, // This seems like it would be a race condition,
@@ -140,90 +344,85 @@ impl<const N: usize> HyperLogLogState<N> {
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
// this would mean that a dev port-forwarding the metrics url won't break the sampling. // this would mean that a dev port-forwarding the metrics url won't break the sampling.
x.swap(0, std::sync::atomic::Ordering::Relaxed) let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
let mut m = proto::Metric::default();
let mut c = proto::Gauge::default();
c.set_value(v as f64);
m.set_gauge(c);
let mut labels = Vec::with_capacity(self.labels.len() + 1);
labels.extend_from_slice(&self.labels);
labels.push(shard_label);
m.set_label(labels);
metrics.push(m);
}) })
} }
} }
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
for HyperLogLogState<N> fn make_label_pairs(
{ desc: &core::Desc,
fn write_type( label_values: &[&str],
name: impl MetricNameEncoder, ) -> prometheus::Result<Vec<proto::LabelPair>> {
enc: &mut TextEncoder<W>, if desc.variable_labels.len() != label_values.len() {
) -> Result<(), std::io::Error> { return Err(prometheus::Error::InconsistentCardinality {
enc.write_type(&name, measured::text::MetricType::Gauge) expect: desc.variable_labels.len(),
got: label_values.len(),
});
} }
fn collect_into(
&self,
_: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
struct I64(i64);
impl LabelValue for I64 {
fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0)
}
}
struct HllShardLabel { let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
hll_shard: i64, if total_len == 0 {
} return Ok(vec![]);
impl LabelGroup for HllShardLabel {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const LE: &LabelName = LabelName::from_str("hll_shard");
v.write_value(LE, &I64(self.hll_shard));
}
}
self.take_sample()
.into_iter()
.enumerate()
.try_for_each(|(hll_shard, val)| {
enc.write_metric_value(
name.by_ref(),
labels.by_ref().compose_with(HllShardLabel {
hll_shard: hll_shard as i64,
}),
MetricValue::Int(val as i64),
)
})
} }
if desc.variable_labels.is_empty() {
return Ok(desc.const_label_pairs.clone());
}
let mut label_pairs = Vec::with_capacity(total_len);
for (i, n) in desc.variable_labels.iter().enumerate() {
let mut label_pair = proto::LabelPair::default();
label_pair.set_name(n.clone());
label_pair.set_value(label_values[i].to_owned());
label_pairs.push(label_pair);
}
for label_pair in &desc.const_label_pairs {
label_pairs.push(label_pair.clone());
}
label_pairs.sort();
Ok(label_pairs)
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::collections::HashSet; use std::collections::HashSet;
use measured::{label::StaticLabelSet, FixedCardinalityLabel}; use prometheus::{proto, Opts};
use rand::{rngs::StdRng, Rng, SeedableRng}; use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::{Distribution, Zipf}; use rand_distr::{Distribution, Zipf};
use crate::HyperLogLogVec; use crate::HyperLogLogVec;
#[derive(FixedCardinalityLabel, Clone, Copy)] fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
#[label(singleton = "x")] let mut metrics = vec![];
enum Label { hll.core
A, .children
B, .read()
.unwrap()
.values()
.for_each(|c| c.core.collect_into(&mut metrics));
metrics
} }
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
// cannot go through the `hll.collect_family_into` interface yet...
// need to see if I can fix the conflicting impls problem in measured.
(
hll.get_metric(hll.with_labels(Label::A)).take_sample(),
hll.get_metric(hll.with_labels(Label::B)).take_sample(),
)
}
fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
let mut buckets = [0.0; 32]; let mut buckets = [0.0; 32];
for &sample in samples { for metric in metrics.chunks_exact(32) {
for (i, m) in sample.into_iter().enumerate() { if filter(&metric[0]) {
buckets[i] = f64::max(buckets[i], m as f64); for (i, m) in metric.iter().enumerate() {
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
}
} }
} }
@@ -238,7 +437,7 @@ mod tests {
} }
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) { fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new(); let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
let mut set_a = HashSet::new(); let mut set_a = HashSet::new();
@@ -246,20 +445,18 @@ mod tests {
for x in iter.by_ref().take(n) { for x in iter.by_ref().take(n) {
set_a.insert(x.to_bits()); set_a.insert(x.to_bits());
hll.get_metric(hll.with_labels(Label::A)) hll.with_label_values(&["a"]).measure(&x.to_bits());
.measure(&x.to_bits());
} }
for x in iter.by_ref().take(n) { for x in iter.by_ref().take(n) {
set_b.insert(x.to_bits()); set_b.insert(x.to_bits());
hll.get_metric(hll.with_labels(Label::B)) hll.with_label_values(&["b"]).measure(&x.to_bits());
.measure(&x.to_bits());
} }
let merge = &set_a | &set_b; let merge = &set_a | &set_b;
let (a, b) = collect(&hll); let metrics = collect(&hll);
let len = get_cardinality(&[a, b]); let len = get_cardinality(&metrics, |_| true);
let len_a = get_cardinality(&[a]); let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
let len_b = get_cardinality(&[b]); let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
} }

View File

@@ -4,17 +4,6 @@
//! a default registry. //! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)] #![deny(clippy::undocumented_unsafe_blocks)]
use measured::{
label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
metric::{
counter::CounterState,
gauge::GaugeState,
group::{Encoding, MetricValue},
name::{MetricName, MetricNameEncoder},
MetricEncoding, MetricFamilyEncoding,
},
FixedCardinalityLabel, LabelGroup, MetricGroup,
};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use prometheus::core::{ use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -22,7 +11,6 @@ use prometheus::core::{
pub use prometheus::opts; pub use prometheus::opts;
pub use prometheus::register; pub use prometheus::register;
pub use prometheus::Error; pub use prometheus::Error;
use prometheus::Registry;
pub use prometheus::{core, default_registry, proto}; pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_counter_vec, Counter, CounterVec}; pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -35,12 +23,13 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge, IntGauge};
pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
pub use prometheus::{Encoder, TextEncoder}; pub use prometheus::{Encoder, TextEncoder};
use prometheus::{Registry, Result};
pub mod launch_timestamp; pub mod launch_timestamp;
mod wrappers; mod wrappers;
pub use wrappers::{CountedReader, CountedWriter}; pub use wrappers::{CountedReader, CountedWriter};
mod hll; mod hll;
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; pub use hll::{HyperLogLog, HyperLogLogVec};
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
pub mod more_process_metrics; pub mod more_process_metrics;
@@ -70,7 +59,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
/// while holding the lock. /// while holding the lock.
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> { pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
INTERNAL_REGISTRY.register(c) INTERNAL_REGISTRY.register(c)
} }
@@ -107,127 +96,6 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
]; ];
/// Static build/version information, exported as a constant gauge whose
/// interesting data is carried in its labels rather than its value.
pub struct BuildInfo {
    /// Source-control revision (e.g. git commit hash).
    pub revision: &'static str,
    /// CI build tag.
    pub build_tag: &'static str,
}

// todo: allow label group without the set
impl LabelGroup for BuildInfo {
    // Expose `revision` and `build_tag` as metric label values.
    fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
        const REVISION: &LabelName = LabelName::from_str("revision");
        v.write_value(REVISION, &self.revision);
        const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
        v.write_value(BUILD_TAG, &self.build_tag);
    }
}

impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
where
    GaugeState: MetricEncoding<T>,
{
    /// Encode the build info as a gauge family with a fixed value of 1;
    /// consumers read the revision/build_tag labels, not the value.
    fn collect_family_into(
        &self,
        name: impl measured::metric::name::MetricNameEncoder,
        enc: &mut T,
    ) -> Result<(), T::Err> {
        enc.write_help(&name, "Build/version information")?;
        GaugeState::write_type(&name, enc)?;
        GaugeState {
            count: std::sync::atomic::AtomicI64::new(1),
        }
        .collect_into(&(), self, name, enc)
    }
}
/// Top-level metric group for a Neon service: process metrics (Linux only)
/// under the `process` namespace, plus [`LibMetrics`] under `libmetrics`.
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct NeonMetrics {
    #[cfg(target_os = "linux")]
    #[metric(namespace = "process")]
    #[metric(init = measured_process::ProcessCollector::for_self())]
    process: measured_process::ProcessCollector,

    #[metric(namespace = "libmetrics")]
    #[metric(init = LibMetrics::new(build_info))]
    libmetrics: LibMetrics,
}

/// Metrics owned by this library: build info labels, rusage-derived gauges,
/// and a counter of how many times the metrics have been collected.
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct LibMetrics {
    #[metric(init = build_info)]
    build_info: BuildInfo,
    #[metric(flatten)]
    rusage: Rusage,
    serve_count: CollectionCounter,
}
/// Helper: emit a single integer gauge sample under `metric_name` with the
/// given label group, via the provided encoder.
fn write_gauge<Enc: Encoding>(
    value: i64,
    label_group: impl LabelGroup,
    metric_name: impl MetricNameEncoder,
    encoder: &mut Enc,
) -> Result<(), Enc::Err> {
    let sample = MetricValue::Int(value);
    encoder.write_metric_value(metric_name, label_group, sample)
}
/// Marker type whose [`MetricGroup`] impl samples `getrusage(2)` at
/// collection time (no state is stored here).
#[derive(Default)]
struct Rusage;

/// Direction label (`io_operation` = read|write) for `disk_io_bytes_total`.
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[label(singleton = "io_operation")]
enum IoOp {
    Read,
    Write,
}
impl<T: Encoding> MetricGroup<T> for Rusage
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
let ru = get_rusage_stats();
enc.write_help(
DISK_IO,
"Bytes written and read from disk, grouped by the operation (read|write)",
)?;
GaugeState::write_type(DISK_IO, enc)?;
write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
GaugeState::write_type(MAXRSS, enc)?;
write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
Ok(())
}
}
/// Counter of how many times the metrics have been collected; incremented
/// as a side effect of encoding itself.
#[derive(Default)]
struct CollectionCounter(CounterState);

impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
where
    CounterState: MetricEncoding<T>,
{
    fn collect_family_into(
        &self,
        name: impl measured::metric::name::MetricNameEncoder,
        enc: &mut T,
    ) -> Result<(), T::Err> {
        // Increment before reporting so the current request is included.
        self.0.inc();
        enc.write_help(&name, "Number of metric requests made")?;
        self.0.collect_into(&(), NoLabels, name, enc)
    }
}
pub fn set_build_info_metric(revision: &str, build_tag: &str) { pub fn set_build_info_metric(revision: &str, build_tag: &str) {
let metric = register_int_gauge_vec!( let metric = register_int_gauge_vec!(
"libmetrics_build_info", "libmetrics_build_info",
@@ -237,7 +105,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
.expect("Failed to register build info metric"); .expect("Failed to register build info metric");
metric.with_label_values(&[revision, build_tag]).set(1); metric.with_label_values(&[revision, build_tag]).set(1);
} }
const BYTES_IN_BLOCK: i64 = 512;
// Records I/O stats in a "cross-platform" way. // Records I/O stats in a "cross-platform" way.
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -250,22 +117,14 @@ const BYTES_IN_BLOCK: i64 = 512;
fn update_rusage_metrics() { fn update_rusage_metrics() {
let rusage_stats = get_rusage_stats(); let rusage_stats = get_rusage_stats();
const BYTES_IN_BLOCK: i64 = 512;
DISK_IO_BYTES DISK_IO_BYTES
.with_label_values(&["read"]) .with_label_values(&["read"])
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
DISK_IO_BYTES DISK_IO_BYTES
.with_label_values(&["write"]) .with_label_values(&["write"])
.set(rusage_stats.ru_oublock * BYTES_IN_BLOCK); .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
MAXRSS_KB.set(rusage_stats.ru_maxrss);
// On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
#[cfg(target_os = "macos")]
{
MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024);
}
#[cfg(not(target_os = "macos"))]
{
MAXRSS_KB.set(rusage_stats.ru_maxrss);
}
} }
fn get_rusage_stats() -> libc::rusage { fn get_rusage_stats() -> libc::rusage {
@@ -292,7 +151,6 @@ macro_rules! register_int_counter_pair_vec {
} }
}}; }};
} }
/// Create an [`IntCounterPair`] and registers to default registry. /// Create an [`IntCounterPair`] and registers to default registry.
#[macro_export(local_inner_macros)] #[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair { macro_rules! register_int_counter_pair {
@@ -330,10 +188,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
/// ///
/// An error is returned if the number of label values is not the same as the /// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc. /// number of VariableLabels in Desc.
pub fn get_metric_with_label_values( pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
&self,
vals: &[&str],
) -> prometheus::Result<GenericCounterPair<P>> {
Ok(GenericCounterPair { Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?, inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?,
@@ -346,7 +201,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
self.get_metric_with_label_values(vals).unwrap() self.get_metric_with_label_values(vals).unwrap()
} }
pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals); res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals);
} }
@@ -430,180 +285,3 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop /// A guard for [`IntCounterPair`] that will decrement the gauge on drop
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>; pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
/// Associated constants describing a paired inc/dec counter family:
/// metric names and help strings for both directions, plus the label set.
pub trait CounterPairAssoc {
    const INC_NAME: &'static MetricName;
    const DEC_NAME: &'static MetricName;

    const INC_HELP: &'static str;
    const DEC_HELP: &'static str;

    type LabelGroupSet: LabelGroupSet;
}

/// A labelled vector of counter pairs (one incrementing, one decrementing
/// counter per label set), exported as two separate counter families.
/// The current value of a pair is `inc - dec`; see [`CounterPairVec::sample`].
pub struct CounterPairVec<A: CounterPairAssoc> {
    vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
}

impl<A: CounterPairAssoc> Default for CounterPairVec<A>
where
    A::LabelGroupSet: Default,
{
    fn default() -> Self {
        Self {
            vec: Default::default(),
        }
    }
}
impl<A: CounterPairAssoc> CounterPairVec<A> {
    /// Increment the pair for `labels` and return a guard that increments
    /// the dec side on drop (RAII-style "in flight" tracking).
    pub fn guard(
        &self,
        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
    ) -> MeasuredCounterPairGuard<'_, A> {
        let id = self.vec.with_labels(labels);
        self.vec.get_metric(id).inc.inc();
        MeasuredCounterPairGuard { vec: &self.vec, id }
    }

    /// Increment the "inc" side of the pair for `labels`.
    pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
        let id = self.vec.with_labels(labels);
        self.vec.get_metric(id).inc.inc();
    }

    /// Increment the "dec" side of the pair for `labels`. Both sides are
    /// monotonic counters; the pair's value is their difference.
    pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
        let id = self.vec.with_labels(labels);
        self.vec.get_metric(id).dec.inc();
    }

    /// Remove the metric for `labels`, returning its final state if present.
    pub fn remove_metric(
        &self,
        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
    ) -> Option<MeasuredCounterPairState> {
        let id = self.vec.with_labels(labels);
        self.vec.remove_metric(id)
    }

    /// Current value of the pair (`inc - dec`), saturating at zero in case
    /// the two relaxed loads race with concurrent updates.
    pub fn sample(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) -> u64 {
        let id = self.vec.with_labels(labels);
        let metric = self.vec.get_metric(id);

        let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed);
        let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed);
        inc.saturating_sub(dec)
    }
}
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
where
    T: ::measured::metric::group::Encoding,
    A: CounterPairAssoc,
    ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
{
    /// Encode the pair as two separate counter families (dec, then inc).
    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
        // write decrement first to avoid a race condition where inc - dec < 0
        T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
        self.vec
            .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;

        T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
        self.vec
            .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;

        Ok(())
    }
}
/// State of one counter pair: two monotonically increasing counters whose
/// difference is the pair's current value.
#[derive(MetricGroup, Default)]
pub struct MeasuredCounterPairState {
    pub inc: CounterState,
    pub dec: CounterState,
}

impl measured::metric::MetricType for MeasuredCounterPairState {
    type Metadata = ();
}

/// RAII guard returned by [`CounterPairVec::guard`]; increments the dec side
/// of the pair for its label id when dropped.
pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
    vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
    id: measured::metric::LabelId<A::LabelGroupSet>,
}

impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
    fn drop(&mut self) {
        self.vec.get_metric(self.id).dec.inc();
    }
}
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
struct Inc<T>(T);
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
struct Dec<T>(T);

impl<T: Encoding> Encoding for Inc<T> {
    type Err = T::Err;

    // Plain encoding operations are delegated unchanged to the wrapped encoder.
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
    fn write_metric_value(
        &mut self,
        name: impl MetricNameEncoder,
        labels: impl LabelGroup,
        value: MetricValue,
    ) -> Result<(), Self::Err> {
        self.0.write_metric_value(name, labels, value)
    }
}

/// Write only the inc counter to the encoder.
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
where
    CounterState: MetricEncoding<T>,
{
    fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
        CounterState::write_type(name, &mut enc.0)
    }
    fn collect_into(
        &self,
        metadata: &(),
        labels: impl LabelGroup,
        name: impl MetricNameEncoder,
        enc: &mut Inc<T>,
    ) -> Result<(), T::Err> {
        self.inc.collect_into(metadata, labels, name, &mut enc.0)
    }
}

impl<T: Encoding> Encoding for Dec<T> {
    type Err = T::Err;

    // Plain encoding operations are delegated unchanged to the wrapped encoder.
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
    fn write_metric_value(
        &mut self,
        name: impl MetricNameEncoder,
        labels: impl LabelGroup,
        value: MetricValue,
    ) -> Result<(), Self::Err> {
        self.0.write_metric_value(name, labels, value)
    }
}

/// Write the dec counter to the encoder
impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
where
    CounterState: MetricEncoding<T>,
{
    fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
        CounterState::write_type(name, &mut enc.0)
    }
    fn collect_into(
        &self,
        metadata: &(),
        labels: impl LabelGroup,
        name: impl MetricNameEncoder,
        enc: &mut Dec<T>,
    ) -> Result<(), T::Err> {
        self.dec.collect_into(metadata, labels, name, &mut enc.0)
    }
}

View File

@@ -1,31 +0,0 @@
use std::collections::HashMap;
use const_format::formatcp;
#[cfg(test)]
mod tests;
/// Default Postgres listen address/port for locally-run services.
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
/// Default HTTP API listen address/port.
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");

// Certain metadata (e.g. externally-addressable name, AZ) is delivered
// as a separate structure. This information is not needed by the pageserver
// itself, it is only used for registering the pageserver with the control
// plane and/or storage controller.
//
#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
pub struct NodeMetadata {
    // Serialized as "host"/"port" for backward compatibility with the
    // original on-disk format (see the test module).
    #[serde(rename = "host")]
    pub postgres_host: String,
    #[serde(rename = "port")]
    pub postgres_port: u16,
    pub http_host: String,
    pub http_port: u16,

    // Deployment tools may write fields to the metadata file beyond what we
    // use in this type: this type intentionally only names the fields that it
    // requires.
    #[serde(flatten)]
    pub other: HashMap<String, serde_json::Value>,
}

View File

@@ -1,22 +0,0 @@
use super::*;
#[test]
fn test_node_metadata_v1_backward_compatibilty() {
    // A V1-era payload: only the four named fields, nothing extra.
    let payload = serde_json::json!({
        "host": "localhost",
        "port": 23,
        "http_host": "localhost",
        "http_port": 42,
    });
    let bytes = serde_json::to_vec(&payload).unwrap();

    let parsed: NodeMetadata = serde_json::from_slice(&bytes).unwrap();
    let expected = NodeMetadata {
        postgres_host: "localhost".to_string(),
        postgres_port: 23,
        http_host: "localhost".to_string(),
        http_port: 42,
        other: HashMap::new(),
    };
    assert_eq!(parsed, expected)
}

View File

@@ -2,9 +2,9 @@ use std::str::FromStr;
/// Request/response types for the storage controller /// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server /// API (`/control/v1` prefix). Implemented by the server
/// in [`storage_controller::http`] /// in [`attachment_service::http`]
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId}; use utils::id::NodeId;
use crate::{ use crate::{
models::{ShardParameters, TenantConfig}, models::{ShardParameters, TenantConfig},
@@ -42,12 +42,6 @@ pub struct NodeConfigureRequest {
pub scheduling: Option<NodeSchedulingPolicy>, pub scheduling: Option<NodeSchedulingPolicy>,
} }
/// Request body for updating a tenant's placement and/or scheduling policy.
// NOTE(review): presumably a `None` field means "leave unchanged" — confirm
// against the HTTP handler that consumes this type.
#[derive(Serialize, Deserialize)]
pub struct TenantPolicyRequest {
    pub placement: Option<PlacementPolicy>,
    pub scheduling: Option<ShardSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard { pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId, pub shard_id: TenantShardId,
@@ -68,27 +62,12 @@ pub struct TenantLocateResponse {
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponse { pub struct TenantDescribeResponse {
pub tenant_id: TenantId,
pub shards: Vec<TenantDescribeResponseShard>, pub shards: Vec<TenantDescribeResponseShard>,
pub stripe_size: ShardStripeSize, pub stripe_size: ShardStripeSize,
pub policy: PlacementPolicy, pub policy: PlacementPolicy,
pub config: TenantConfig, pub config: TenantConfig,
} }
/// Response type describing a single node: its id, availability and
/// scheduling state, and its HTTP/Postgres listen addresses.
#[derive(Serialize, Deserialize)]
pub struct NodeDescribeResponse {
    pub id: NodeId,

    pub availability: NodeAvailabilityWrapper,
    pub scheduling: NodeSchedulingPolicy,

    pub listen_http_addr: String,
    pub listen_http_port: u16,

    pub listen_pg_addr: String,
    pub listen_pg_port: u16,
}
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponseShard { pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId, pub tenant_shard_id: TenantShardId,
@@ -104,8 +83,6 @@ pub struct TenantDescribeResponseShard {
pub is_pending_compute_notification: bool, pub is_pending_compute_notification: bool,
/// A shard split is currently underway /// A shard split is currently underway
pub is_splitting: bool, pub is_splitting: bool,
pub scheduling_policy: ShardSchedulingPolicy,
} }
/// Explicitly migrating a particular shard is a low level operation /// Explicitly migrating a particular shard is a low level operation
@@ -120,7 +97,7 @@ pub struct TenantShardMigrateRequest {
/// Utilisation score indicating how good a candidate a pageserver /// Utilisation score indicating how good a candidate a pageserver
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`]. /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
/// Lower values are better. /// Lower values are better.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)] #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
pub struct UtilizationScore(pub u64); pub struct UtilizationScore(pub u64);
impl UtilizationScore { impl UtilizationScore {
@@ -129,7 +106,7 @@ impl UtilizationScore {
} }
} }
#[derive(Serialize, Deserialize, Clone, Copy, Debug)] #[derive(Serialize, Clone, Copy)]
#[serde(into = "NodeAvailabilityWrapper")] #[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability { pub enum NodeAvailability {
// Normal, happy state // Normal, happy state
@@ -152,7 +129,7 @@ impl Eq for NodeAvailability {}
// This wrapper provides serde functionality and it should only be used to // This wrapper provides serde functionality and it should only be used to
// communicate with external callers which don't know or care about the // communicate with external callers which don't know or care about the
// utilisation score of the pageserver it is targeting. // utilisation score of the pageserver it is targeting.
#[derive(Serialize, Deserialize, Clone, Copy, Debug)] #[derive(Serialize, Deserialize, Clone)]
pub enum NodeAvailabilityWrapper { pub enum NodeAvailabilityWrapper {
Active, Active,
Offline, Offline,
@@ -178,33 +155,22 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
} }
} }
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] impl FromStr for NodeAvailability {
pub enum ShardSchedulingPolicy { type Err = anyhow::Error;
// Normal mode: the tenant's scheduled locations may be updated at will, including
// for non-essential optimization.
Active,
// Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy. fn from_str(s: &str) -> Result<Self, Self::Err> {
// For example, this still permits a node's attachment location to change to a secondary in match s {
// response to a node failure, or to assign a new secondary if a node was removed. // This is used when parsing node configuration requests from neon-local.
Essential, // Assume the worst possible utilisation score
// and let it get updated via the heartbeats.
// No scheduling: leave the shard running wherever it currently is. Even if the shard is "active" => Ok(Self::Active(UtilizationScore::worst())),
// unavailable, it will not be rescheduled to another node. "offline" => Ok(Self::Offline),
Pause, _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
// No reconciling: we will make no location_conf API calls to pageservers at all. If the
// shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
Stop,
}
impl Default for ShardSchedulingPolicy {
fn default() -> Self {
Self::Active
} }
} }
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy { pub enum NodeSchedulingPolicy {
Active, Active,
Filling, Filling,

View File

@@ -1,6 +1,5 @@
use anyhow::{bail, Result}; use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE}; use byteorder::{ByteOrder, BE};
use bytes::BufMut;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId}; use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -22,107 +21,15 @@ pub struct Key {
pub field6: u32, pub field6: u32,
} }
/// The storage key size.
pub const KEY_SIZE: usize = 18; pub const KEY_SIZE: usize = 18;
/// The metadata key size: 2 bytes smaller than the full storage key because
/// field2 only carries 16 significant bits in this encoding.
/// See [`Key::to_i128`] for more information on the encoding.
pub const METADATA_KEY_SIZE: usize = 16;

/// First prefix byte (inclusive) of the metadata key range. A key is a
/// metadata key iff its first byte lies in
/// `[METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX)`.
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
/// First prefix byte past the metadata key range (exclusive bound).
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;

/// The (reserved) key prefix of relation sizes.
pub const RELATION_SIZE_PREFIX: u8 = 0x61;

/// The key prefix of AUX file keys.
pub const AUX_KEY_PREFIX: u8 = 0x62;

/// Check whether a raw key's first byte falls in the metadata prefix range.
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
    let prefix = key[0];
    METADATA_KEY_BEGIN_PREFIX <= prefix && prefix < METADATA_KEY_END_PREFIX
}
impl Key { impl Key {
/// Check if the key falls in the range of metadata keys, i.e. its first
/// byte (`field1`) is in `[METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX)`.
pub const fn is_metadata_key(&self) -> bool {
    self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
}
/// Decode a 16-byte metadata key into a storage [`Key`].
///
/// Layout (big-endian): byte 0 -> field1, bytes 1..3 -> field2 (only 16
/// bits are stored), bytes 3..7 -> field3, bytes 7..11 -> field4,
/// byte 11 -> field5, bytes 12..16 -> field6.
///
/// Panics if the prefix byte is not in the metadata key range.
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
    assert!(is_metadata_key_slice(key), "key not in metadata key range");
    Key {
        field1: key[0],
        field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
        field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
        field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
        field5: key[11],
        field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
    }
}

/// Same as [`Key::from_metadata_key_fixed_size`], for a dynamically sized
/// slice; panics unless the slice is exactly [`METADATA_KEY_SIZE`] bytes.
pub fn from_metadata_key(key: &[u8]) -> Self {
    Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
}
/// Serialize this key into its 16-byte metadata representation, the inverse
/// of [`Key::from_metadata_key_fixed_size`]. Always writes exactly
/// [`METADATA_KEY_SIZE`] bytes.
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
    writer.put_u8(self.field1);
    // Metadata keys only ever use the low 16 bits of field2.
    assert!(self.field2 <= 0xFFFF);
    writer.put_u16(self.field2 as u16);
    writer.put_u32(self.field3);
    writer.put_u32(self.field4);
    writer.put_u8(self.field5);
    writer.put_u32(self.field6);
}
/// Get the half-open range covering all metadata keys
/// (`field1` in `[METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX)`).
pub const fn metadata_key_range() -> Range<Self> {
    Key {
        field1: METADATA_KEY_BEGIN_PREFIX,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }..Key {
        field1: METADATA_KEY_END_PREFIX,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }
}

/// Get the half-open range of AUX file keys (prefix byte `AUX_KEY_PREFIX`).
pub fn metadata_aux_key_range() -> Range<Self> {
    Key {
        field1: AUX_KEY_PREFIX,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }..Key {
        field1: AUX_KEY_PREFIX + 1,
        field2: 0,
        field3: 0,
        field4: 0,
        field5: 0,
        field6: 0,
    }
}
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
/// As long as Neon does not support tablespace (because of lack of access to local file system), /// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16 /// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 { pub fn to_i128(&self) -> i128 {
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
(((self.field1 & 0x7F) as i128) << 120) (((self.field1 & 0xf) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104) | (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72) | ((self.field3 as i128) << 72)
| ((self.field4 as i128) << 40) | ((self.field4 as i128) << 40)
@@ -132,7 +39,7 @@ impl Key {
pub const fn from_i128(x: i128) -> Self { pub const fn from_i128(x: i128) -> Self {
Key { Key {
field1: ((x >> 120) & 0x7F) as u8, field1: ((x >> 120) & 0xf) as u8,
field2: ((x >> 104) & 0xFFFF) as u32, field2: ((x >> 104) & 0xFFFF) as u32,
field3: (x >> 72) as u32, field3: (x >> 72) as u32,
field4: (x >> 40) as u32, field4: (x >> 40) as u32,
@@ -141,11 +48,11 @@ impl Key {
} }
} }
pub const fn next(&self) -> Key { pub fn next(&self) -> Key {
self.add(1) self.add(1)
} }
pub const fn add(&self, x: u32) -> Key { pub fn add(&self, x: u32) -> Key {
let mut key = *self; let mut key = *self;
let r = key.field6.overflowing_add(x); let r = key.field6.overflowing_add(x);
@@ -174,8 +81,6 @@ impl Key {
key key
} }
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::from_metadata_key`] instead.
pub fn from_slice(b: &[u8]) -> Self { pub fn from_slice(b: &[u8]) -> Self {
Key { Key {
field1: b[0], field1: b[0],
@@ -187,8 +92,6 @@ impl Key {
} }
} }
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::extract_metadata_key_to_writer`] instead.
pub fn write_to_byte_slice(&self, buf: &mut [u8]) { pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
buf[0] = self.field1; buf[0] = self.field1;
BE::write_u32(&mut buf[1..5], self.field2); BE::write_u32(&mut buf[1..5], self.field2);
@@ -572,17 +475,12 @@ pub const AUX_FILES_KEY: Key = Key {
// Reverse mappings for a few Keys. // Reverse mappings for a few Keys.
// These are needed by WAL redo manager. // These are needed by WAL redo manager.
/// Non inherited range for vectored get.
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
// AUX_FILES currently stores only data for logical replication (slots etc), and // AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline // we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these. // switch (and generally it likely should be optional), so ignore these.
#[inline(always)] #[inline(always)]
pub fn is_inherited_key(key: Key) -> bool { pub fn is_inherited_key(key: Key) -> bool {
!NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key) key != AUX_FILES_KEY
} }
#[inline(always)] #[inline(always)]
@@ -658,14 +556,11 @@ impl std::str::FromStr for Key {
mod tests { mod tests {
use std::str::FromStr; use std::str::FromStr;
use crate::key::is_metadata_key_slice;
use crate::key::Key; use crate::key::Key;
use rand::Rng; use rand::Rng;
use rand::SeedableRng; use rand::SeedableRng;
use super::AUX_KEY_PREFIX;
#[test] #[test]
fn display_fromstr_bijection() { fn display_fromstr_bijection() {
let mut rng = rand::rngs::StdRng::seed_from_u64(42); let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -681,16 +576,4 @@ mod tests {
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
} }
#[test]
fn test_metadata_keys() {
    // An AUX key whose 15 payload bytes are all 0xFF.
    let mut raw_key = vec![0xFFu8; 16];
    raw_key[0] = AUX_KEY_PREFIX;

    // Round-trip through the storage-key encoding and back.
    let key = Key::from_metadata_key(&raw_key);
    let mut round_tripped = Vec::new();
    key.extract_metadata_key_to_writer(&mut round_tripped);

    assert_eq!(raw_key, round_tripped);
    assert!(key.is_metadata_key());
    assert!(is_metadata_key_slice(&raw_key));
}
} }

View File

@@ -1,10 +1,7 @@
use postgres_ffi::BLCKSZ; use postgres_ffi::BLCKSZ;
use std::ops::Range; use std::ops::Range;
use crate::{ use crate::key::Key;
key::Key,
shard::{ShardCount, ShardIdentity},
};
use itertools::Itertools; use itertools::Itertools;
/// ///
@@ -17,279 +14,44 @@ pub struct KeySpace {
pub ranges: Vec<Range<Key>>, pub ranges: Vec<Range<Key>>,
} }
/// A wrapper type for sparse keyspaces.
// NOTE(review): "sparse" presumably means not every key inside the ranges is
// expected to exist — confirm against the consumers of this type.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct SparseKeySpace(pub KeySpace);

/// Represents a contiguous half-open range of the keyspace, masked according to a particular
/// ShardNumber's stripes: within this range of keys, only some "belong" to the current
/// shard.
///
/// When we iterate over keys within this object, we will skip any keys that don't belong
/// to this shard.
///
/// The start + end keys may not belong to the shard: these specify where layer files should
/// start  + end, but we will never actually read/write those keys.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ShardedRange<'a> {
    pub shard_identity: &'a ShardIdentity,
    pub range: Range<Key>,
}
// Calculate the size of a range within the blocks of the same relation, or spanning only the
// top page in the previous relation's space.
//
// When the range starts at the previous relation's logical-size slot
// (field6 == 0xffffffff) the distance wraps into the next relation's block
// space, hence the `+ 1`.
fn contiguous_range_len(range: &Range<Key>) -> u32 {
    debug_assert!(is_contiguous_range(range));
    if range.start.field6 == 0xffffffff {
        range.end.field6 + 1
    } else {
        range.end.field6 - range.start.field6
    }
}

/// Return true if this key range includes only keys in the same relation's data blocks, or
/// just spanning one relation and the logical size (0xffffffff) block of the relation before it.
///
/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not
/// be on our shard.  Later in ShardedRange we do the extra work to figure out how much
/// of a given contiguous range is present on one shard.
///
/// This matters, because:
/// - Within such ranges, keys are used contiguously.  Outside such ranges it is sparse.
/// - Within such ranges, we may calculate distances using simple subtraction of field6.
fn is_contiguous_range(range: &Range<Key>) -> bool {
    range.start.field1 == range.end.field1
        && range.start.field2 == range.end.field2
        && range.start.field3 == range.end.field3
        && range.start.field4 == range.end.field4
        && (range.start.field5 == range.end.field5
            || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5))
}
impl<'a> ShardedRange<'a> {
pub fn new(range: Range<Key>, shard_identity: &'a ShardIdentity) -> Self {
Self {
shard_identity,
range,
}
}
/// Break up this range into chunks, each of which has at least one local key in it if the
/// total range has at least one local key.
///
/// `target_nblocks` is the desired number of shard-local blocks per fragment.
/// Returns `(local_block_count, sub_range)` pairs covering `self.range` in
/// order. A non-contiguous input range is returned as one fragment with a
/// count of `u32::MAX` (to be read as "large").
pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range<Key>)> {
    // Optimization for single-key case (e.g. logical size keys)
    if self.range.end == self.range.start.add(1) {
        return vec![(
            if self.shard_identity.is_key_disposable(&self.range.start) {
                0
            } else {
                1
            },
            self.range,
        )];
    }

    if !is_contiguous_range(&self.range) {
        // Ranges that span relations are not fragmented.  We only get these ranges as a result
        // of operations that act on existing layers, so we trust that the existing range is
        // reasonably small.
        return vec![(u32::MAX, self.range)];
    }

    let mut fragments: Vec<(u32, Range<Key>)> = Vec::new();

    // Walk a cursor from range start to end, one stripe (or partial stripe) at a time.
    let mut cursor = self.range.start;
    while cursor < self.range.end {
        let advance_by = self.distance_to_next_boundary(cursor);
        let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor);

        // If the previous fragment is undersized, then we seek to consume enough
        // blocks to complete it.
        let (want_blocks, merge_last_fragment) = match fragments.last_mut() {
            Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)),
            Some(frag) => {
                // Prev block is complete, want the full number.
                (
                    target_nblocks,
                    if is_fragment_disposable {
                        // If this current range will be empty (not shard-local data), we will merge into previous
                        Some(frag)
                    } else {
                        None
                    },
                )
            }
            None => {
                // First iteration, want the full number
                (target_nblocks, None)
            }
        };

        // Disposable stripes are consumed whole; local stripes are capped so a
        // fragment never exceeds the blocks still wanted.
        let advance_by = if is_fragment_disposable {
            advance_by
        } else {
            std::cmp::min(advance_by, want_blocks)
        };

        let next_cursor = cursor.add(advance_by);

        let this_frag = (
            if is_fragment_disposable {
                0
            } else {
                advance_by
            },
            cursor..next_cursor,
        );
        cursor = next_cursor;

        if let Some(last_fragment) = merge_last_fragment {
            // Previous fragment was short or this one is empty, merge into it
            last_fragment.0 += this_frag.0;
            last_fragment.1.end = this_frag.1.end;
        } else {
            fragments.push(this_frag);
        }
    }

    fragments
}
/// Estimate the physical pages that are within this range, on this shard.  This returns
/// u32::MAX if the range spans relations: this return value should be interpreted as "large"
/// (the result also saturates at u32::MAX for very large sharded ranges).
pub fn page_count(&self) -> u32 {
    // Special cases for single keys like logical sizes
    if self.range.end == self.range.start.add(1) {
        return if self.shard_identity.is_key_disposable(&self.range.start) {
            0
        } else {
            1
        };
    }

    // We can only do an authentic calculation of contiguous key ranges
    if !is_contiguous_range(&self.range) {
        return u32::MAX;
    }

    // Special case for single sharded tenants: our logical and physical sizes are the same
    if self.shard_identity.count < ShardCount::new(2) {
        return contiguous_range_len(&self.range);
    }

    // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs
    // to Self, and add the stripe's block count to our total if so.
    let mut result: u64 = 0;
    let mut cursor = self.range.start;
    while cursor < self.range.end {
        // Count up to the next stripe_size boundary or end of range
        let advance_by = self.distance_to_next_boundary(cursor);

        // If the blocks in this stripe belong to us, add them to our count
        if !self.shard_identity.is_key_disposable(&cursor) {
            result += advance_by as u64;
        }

        cursor = cursor.add(advance_by);
    }

    if result > u32::MAX as u64 {
        u32::MAX
    } else {
        result as u32
    }
}
/// Advance the cursor to the next potential fragment boundary: this is either
/// a stripe boundary, or the end of the range.
fn distance_to_next_boundary(&self, cursor: Key) -> u32 {
    let to_range_end = contiguous_range_len(&(cursor..self.range.end));

    // Unsharded (or single-shard) tenants have no stripe boundaries to respect.
    if self.shard_identity.count < ShardCount::new(2) {
        return to_range_end;
    }

    // field6 == 0xffffffff marks a relation's logical-size key: advance a single
    // key to wrap into the next relation's first data block.
    if cursor.field6 == 0xffffffff {
        return 1;
    }

    // Blocks remaining until the end of the stripe containing `cursor`.
    let stripe_size = self.shard_identity.stripe_size.0;
    let remaining_in_stripe = stripe_size - cursor.field6 % stripe_size;

    if cfg!(debug_assertions) {
        // We should never overflow field5 and field6 -- our callers check this earlier
        // and would have returned their u32::MAX cases if the input range violated this.
        let stepped = cursor.add(remaining_in_stripe);
        debug_assert!(
            stepped.field1 == cursor.field1
                && stepped.field2 == cursor.field2
                && stepped.field3 == cursor.field3
                && stepped.field4 == cursor.field4
                && stepped.field5 == cursor.field5
        )
    }

    std::cmp::min(remaining_in_stripe, to_range_end)
}
/// Whereas `page_count` estimates the number of pages physically in this range on this shard,
/// this function simply calculates the number of pages in the space, without accounting for those
/// pages that would not actually be stored on this node.
///
/// Don't use this function in code that works with physical entities like layer files.
pub fn raw_size(range: &Range<Key>) -> u32 {
    // Non-contiguous ranges (spanning relations) have no authentic length: report "large".
    match is_contiguous_range(range) {
        true => contiguous_range_len(range),
        false => u32::MAX,
    }
}
}
impl KeySpace { impl KeySpace {
/// Create a key space with a single range. ///
pub fn single(key_range: Range<Key>) -> Self {
Self {
ranges: vec![key_range],
}
}
/// Partition a key space into roughly chunks of roughly 'target_size' bytes /// Partition a key space into roughly chunks of roughly 'target_size' bytes
/// in each partition. /// in each partition.
/// ///
pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning { pub fn partition(&self, target_size: u64) -> KeyPartitioning {
// Assume that each value is 8k in size. // Assume that each value is 8k in size.
let target_nblocks = (target_size / BLCKSZ as u64) as u32; let target_nblocks = (target_size / BLCKSZ as u64) as usize;
let mut parts = Vec::new(); let mut parts = Vec::new();
let mut current_part = Vec::new(); let mut current_part = Vec::new();
let mut current_part_size: usize = 0; let mut current_part_size: usize = 0;
for range in &self.ranges { for range in &self.ranges {
// While doing partitioning, wrap the range in ShardedRange so that our size calculations // If appending the next contiguous range in the keyspace to the current
// will respect shard striping rather than assuming all keys within a range are present. // partition would cause it to be too large, start a new partition.
let range = ShardedRange::new(range.clone(), shard_identity); let this_size = key_range_size(range) as usize;
if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
// Chunk up the range into parts that each contain up to target_size local blocks parts.push(KeySpace {
for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) { ranges: current_part,
// If appending the next contiguous range in the keyspace to the current });
// partition would cause it to be too large, and our current partition current_part = Vec::new();
// covers at least one block that is physically present in this shard, current_part_size = 0;
// then start a new partition
if current_part_size + frag_on_shard_size as usize > target_nblocks as usize
&& current_part_size > 0
{
parts.push(KeySpace {
ranges: current_part,
});
current_part = Vec::new();
current_part_size = 0;
}
current_part.push(frag_range.start..frag_range.end);
current_part_size += frag_on_shard_size as usize;
} }
// If the next range is larger than 'target_size', split it into
// 'target_size' chunks.
let mut remain_size = this_size;
let mut start = range.start;
while remain_size > target_nblocks {
let next = start.add(target_nblocks as u32);
parts.push(KeySpace {
ranges: vec![start..next],
});
start = next;
remain_size -= target_nblocks
}
current_part.push(start..range.end);
current_part_size += remain_size;
} }
// add last partition that wasn't full yet. // add last partition that wasn't full yet.
@@ -302,10 +64,6 @@ impl KeySpace {
KeyPartitioning { parts } KeyPartitioning { parts }
} }
pub fn is_empty(&self) -> bool {
self.total_raw_size() == 0
}
/// Merge another keyspace into the current one. /// Merge another keyspace into the current one.
/// Note: the keyspaces must not ovelap (enforced via assertions) /// Note: the keyspaces must not ovelap (enforced via assertions)
pub fn merge(&mut self, other: &KeySpace) { pub fn merge(&mut self, other: &KeySpace) {
@@ -336,13 +94,12 @@ impl KeySpace {
/// Remove all keys in `other` from `self`. /// Remove all keys in `other` from `self`.
/// This can involve splitting or removing of existing ranges. /// This can involve splitting or removing of existing ranges.
/// Returns the removed keyspace pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
let (self_start, self_end) = match (self.start(), self.end()) { let (self_start, self_end) = match (self.start(), self.end()) {
(Some(start), Some(end)) => (start, end), (Some(start), Some(end)) => (start, end),
_ => { _ => {
// self is empty // self is empty
return KeySpace::default(); return;
} }
}; };
@@ -355,37 +112,30 @@ impl KeySpace {
.skip_while(|range| self_start >= range.end) .skip_while(|range| self_start >= range.end)
.take_while(|range| self_end > range.start); .take_while(|range| self_end > range.start);
let mut removed_accum = KeySpaceRandomAccum::new();
for range in other_ranges { for range in other_ranges {
while let Some(overlap_at) = self.overlaps_at(range) { while let Some(overlap_at) = self.overlaps_at(range) {
let overlapped = self.ranges[overlap_at].clone(); let overlapped = self.ranges[overlap_at].clone();
if overlapped.start < range.start && overlapped.end <= range.end { if overlapped.start < range.start && overlapped.end <= range.end {
// Higher part of the range is completely overlapped. // Higher part of the range is completely overlapped.
removed_accum.add_range(range.start..self.ranges[overlap_at].end);
self.ranges[overlap_at].end = range.start; self.ranges[overlap_at].end = range.start;
} }
if overlapped.start >= range.start && overlapped.end > range.end { if overlapped.start >= range.start && overlapped.end > range.end {
// Lower part of the range is completely overlapped. // Lower part of the range is completely overlapped.
removed_accum.add_range(self.ranges[overlap_at].start..range.end);
self.ranges[overlap_at].start = range.end; self.ranges[overlap_at].start = range.end;
} }
if overlapped.start < range.start && overlapped.end > range.end { if overlapped.start < range.start && overlapped.end > range.end {
// Middle part of the range is overlapped. // Middle part of the range is overlapped.
removed_accum.add_range(range.clone());
self.ranges[overlap_at].end = range.start; self.ranges[overlap_at].end = range.start;
self.ranges self.ranges
.insert(overlap_at + 1, range.end..overlapped.end); .insert(overlap_at + 1, range.end..overlapped.end);
} }
if overlapped.start >= range.start && overlapped.end <= range.end { if overlapped.start >= range.start && overlapped.end <= range.end {
// Whole range is overlapped // Whole range is overlapped
removed_accum.add_range(self.ranges[overlap_at].clone());
self.ranges.remove(overlap_at); self.ranges.remove(overlap_at);
} }
} }
} }
removed_accum.to_keyspace()
} }
pub fn start(&self) -> Option<Key> { pub fn start(&self) -> Option<Key> {
@@ -396,11 +146,11 @@ impl KeySpace {
self.ranges.last().map(|range| range.end) self.ranges.last().map(|range| range.end)
} }
/// The size of the keyspace in pages, before accounting for sharding #[allow(unused)]
pub fn total_raw_size(&self) -> usize { pub fn total_size(&self) -> usize {
self.ranges self.ranges
.iter() .iter()
.map(|range| ShardedRange::raw_size(range) as usize) .map(|range| key_range_size(range) as usize)
.sum() .sum()
} }
@@ -420,11 +170,6 @@ impl KeySpace {
pub fn overlaps(&self, range: &Range<Key>) -> bool { pub fn overlaps(&self, range: &Range<Key>) -> bool {
self.overlaps_at(range).is_some() self.overlaps_at(range).is_some()
} }
/// Check if the keyspace contains a key
pub fn contains(&self, key: &Key) -> bool {
self.overlaps(&(*key..key.next()))
}
} }
/// ///
@@ -439,33 +184,10 @@ pub struct KeyPartitioning {
pub parts: Vec<KeySpace>, pub parts: Vec<KeySpace>,
} }
/// Represents a partitioning of the sparse key space.
#[derive(Clone, Debug, Default)]
pub struct SparseKeyPartitioning {
pub parts: Vec<SparseKeySpace>,
}
impl KeyPartitioning { impl KeyPartitioning {
pub fn new() -> Self { pub fn new() -> Self {
KeyPartitioning { parts: Vec::new() } KeyPartitioning { parts: Vec::new() }
} }
/// Convert a key partitioning to a sparse partition.
pub fn into_sparse(self) -> SparseKeyPartitioning {
SparseKeyPartitioning {
parts: self.parts.into_iter().map(SparseKeySpace).collect(),
}
}
}
impl SparseKeyPartitioning {
/// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will
/// cause long/dead loops.
pub fn into_dense(self) -> KeyPartitioning {
KeyPartitioning {
parts: self.parts.into_iter().map(|x| x.0).collect(),
}
}
} }
/// ///
@@ -497,7 +219,7 @@ impl KeySpaceAccum {
#[inline(always)] #[inline(always)]
pub fn add_range(&mut self, range: Range<Key>) { pub fn add_range(&mut self, range: Range<Key>) {
self.size += ShardedRange::raw_size(&range) as u64; self.size += key_range_size(&range) as u64;
match self.accum.as_mut() { match self.accum.as_mut() {
Some(accum) => { Some(accum) => {
@@ -529,9 +251,7 @@ impl KeySpaceAccum {
std::mem::take(self).to_keyspace() std::mem::take(self).to_keyspace()
} }
// The total number of keys in this object, ignoring any sharding effects that might cause some of pub fn size(&self) -> u64 {
// the keys to be omitted in storage on this shard.
pub fn raw_size(&self) -> u64 {
self.size self.size
} }
} }
@@ -587,19 +307,36 @@ impl KeySpaceRandomAccum {
} }
} }
#[inline(always)]
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;
if end.field1 != start.field1
|| end.field2 != start.field2
|| end.field3 != start.field3
|| end.field4 != start.field4
{
return u32::MAX;
}
let start = (start.field5 as u64) << 32 | start.field6 as u64;
let end = (end.field5 as u64) << 32 | end.field6 as u64;
let diff = end - start;
if diff > u32::MAX as u64 {
u32::MAX
} else {
diff as u32
}
}
pub fn singleton_range(key: Key) -> Range<Key> { pub fn singleton_range(key: Key) -> Range<Key> {
key..key.next() key..key.next()
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use rand::{RngCore, SeedableRng};
use crate::{
models::ShardParameters,
shard::{ShardCount, ShardNumber},
};
use super::*; use super::*;
use std::fmt::Write; use std::fmt::Write;
@@ -642,17 +379,14 @@ mod tests {
accum.add_range(range.clone()); accum.add_range(range.clone());
} }
let expected_size: u64 = ranges let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
.iter() assert_eq!(accum.size(), expected_size);
.map(|r| ShardedRange::raw_size(r) as u64)
.sum();
assert_eq!(accum.raw_size(), expected_size);
assert_ks_eq(&accum.consume_keyspace(), ranges.clone()); assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
assert_eq!(accum.raw_size(), 0); assert_eq!(accum.size(), 0);
assert_ks_eq(&accum.consume_keyspace(), vec![]); assert_ks_eq(&accum.consume_keyspace(), vec![]);
assert_eq!(accum.raw_size(), 0); assert_eq!(accum.size(), 0);
for range in &ranges { for range in &ranges {
accum.add_range(range.clone()); accum.add_range(range.clone());
@@ -819,16 +553,7 @@ mod tests {
Key::from_i128(11)..Key::from_i128(13), Key::from_i128(11)..Key::from_i128(13),
], ],
}; };
let removed = key_space1.remove_overlapping_with(&key_space2); key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(2)..Key::from_i128(3),
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(12),
],
};
assert_eq!(removed, removed_expected);
assert_eq!( assert_eq!(
key_space1.ranges, key_space1.ranges,
vec![ vec![
@@ -858,17 +583,7 @@ mod tests {
Key::from_i128(14)..Key::from_i128(17), Key::from_i128(14)..Key::from_i128(17),
], ],
}; };
key_space1.remove_overlapping_with(&key_space2);
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(3)..Key::from_i128(5),
Key::from_i128(8)..Key::from_i128(10),
Key::from_i128(14)..Key::from_i128(15),
],
};
assert_eq!(removed, removed_expected);
assert_eq!( assert_eq!(
key_space1.ranges, key_space1.ranges,
vec![ vec![
@@ -895,11 +610,7 @@ mod tests {
Key::from_i128(15)..Key::from_i128(17), Key::from_i128(15)..Key::from_i128(17),
], ],
}; };
key_space1.remove_overlapping_with(&key_space2);
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace::default();
assert_eq!(removed, removed_expected);
assert_eq!( assert_eq!(
key_space1.ranges, key_space1.ranges,
vec![ vec![
@@ -926,17 +637,7 @@ mod tests {
let key_space2 = KeySpace { let key_space2 = KeySpace {
ranges: vec![Key::from_i128(9)..Key::from_i128(19)], ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
}; };
key_space1.remove_overlapping_with(&key_space2);
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(9)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
Key::from_i128(17)..Key::from_i128(19),
],
};
assert_eq!(removed, removed_expected);
assert_eq!( assert_eq!(
key_space1.ranges, key_space1.ranges,
vec![ vec![
@@ -949,412 +650,4 @@ mod tests {
] ]
); );
} }
#[test]
fn sharded_range_relation_gap() {
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067F00000005000040100300000000").unwrap(),
end: Key::from_hex("000000067F00000005000040130000004000").unwrap(),
},
&shard_identity,
);
// Key range spans relations, expect MAX
assert_eq!(range.page_count(), u32::MAX);
}
#[test]
fn shard_identity_keyspaces_single_key() {
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(),
end: Key::from_hex("000000067f00000001000000700100000000").unwrap(),
},
&shard_identity,
);
// Single-key range on logical size key
assert_eq!(range.page_count(), 1);
}
/// Test the helper that we use to identify ranges which go outside the data blocks of a single relation
#[test]
fn contiguous_range_check() {
assert!(!is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
..Key::from_hex("000000067f00000001000004df0100000003").unwrap())
),);
// The ranges goes all the way up to the 0xffffffff, including it: this is
// not considered a rel block range because 0xffffffff stores logical sizes,
// not blocks.
assert!(!is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
..Key::from_hex("000000067f00000001000004df0100000000").unwrap())
),);
// Keys within the normal data region of a relation
assert!(is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df0000000000").unwrap()
..Key::from_hex("000000067f00000001000004df0000000080").unwrap())
),);
// The logical size key of one forkno, then some blocks in the next
assert!(is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap()
..Key::from_hex("000000067f00000001000004df0100000080").unwrap())
),);
}
#[test]
fn shard_identity_keyspaces_forkno_gap() {
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(),
end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(),
},
&shard_identity,
);
// Range spanning the end of one forkno and the start of the next: we do not attempt to
// calculate a valid size, because we have no way to know if they keys between start
// and end are actually in use.
assert_eq!(range.page_count(), u32::MAX);
}
#[test]
fn shard_identity_keyspaces_one_relation() {
for shard_number in 0..4 {
let shard_identity = ShardIdentity::new(
ShardNumber(shard_number),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(),
end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(),
},
&shard_identity,
);
// Very simple case: range covering block zero of one relation, where that block maps to shard zero
if shard_number == 0 {
assert_eq!(range.page_count(), 1);
} else {
// Other shards should perceive the range's size as zero
assert_eq!(range.page_count(), 0);
}
}
}
/// Test helper: construct a ShardedRange and call fragment() on it, returning
/// the total page count in the range and the fragments.
fn do_fragment(
range_start: Key,
range_end: Key,
shard_identity: &ShardIdentity,
target_nblocks: u32,
) -> (u32, Vec<(u32, Range<Key>)>) {
let range = ShardedRange::new(
Range {
start: range_start,
end: range_end,
},
shard_identity,
);
let page_count = range.page_count();
let fragments = range.fragment(target_nblocks);
// Invariant: we always get at least one fragment
assert!(!fragments.is_empty());
// Invariant: the first/last fragment start/end should equal the input start/end
assert_eq!(fragments.first().unwrap().1.start, range_start);
assert_eq!(fragments.last().unwrap().1.end, range_end);
if page_count > 0 {
// Invariant: every fragment must contain at least one shard-local page, if the
// total range contains at least one shard-local page
let all_nonzero = fragments.iter().all(|f| f.0 > 0);
if !all_nonzero {
eprintln!("Found a zero-length fragment: {:?}", fragments);
}
assert!(all_nonzero);
} else {
// A range with no shard-local pages should always be returned as a single fragment
assert_eq!(fragments, vec![(0, range_start..range_end)]);
}
// Invariant: fragments must be ordered and non-overlapping
let mut last: Option<Range<Key>> = None;
for frag in &fragments {
if let Some(last) = last {
assert!(frag.1.start >= last.end);
assert!(frag.1.start > last.start);
}
last = Some(frag.1.clone())
}
// Invariant: fragments respect target_nblocks
for frag in &fragments {
assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks);
}
(page_count, fragments)
}
/// Really simple tests for fragment(), on a range that just contains a single stripe
/// for a single tenant.
#[test]
fn sharded_range_fragment_simple() {
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
// A range which we happen to know covers exactly one stripe which belongs to this shard
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
// Ask for stripe_size blocks, we get the whole stripe
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 32768),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for more, we still get the whole stripe
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 10000000),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for target_nblocks of half the stripe size, we get two halves
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 16384),
(
32768,
vec![
(16384, input_start..input_start.add(16384)),
(16384, input_start.add(16384)..input_end)
]
)
);
}
#[test]
fn sharded_range_fragment_multi_stripe() {
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
// A range which covers multiple stripes, exactly one of which belongs to the current shard.
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
// Ask for all the blocks, get a fragment that covers the whole range but reports
// its size to be just the blocks belonging to our shard.
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 131072),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for a sub-stripe quantity
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 16000),
(
32768,
vec![
(16000, input_start..input_start.add(16000)),
(16000, input_start.add(16000)..input_start.add(32000)),
(768, input_start.add(32000)..input_end),
]
)
);
// Try on a range that starts slightly after our owned stripe
assert_eq!(
do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
(32767, vec![(32767, input_start.add(1)..input_end)])
);
}
/// Test our calculations work correctly when we start a range from the logical size key of
/// a previous relation.
#[test]
fn sharded_range_fragment_starting_from_logical_size() {
let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
// Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x10000),
(0x8001, vec![(0x8001, input_start..input_end)])
);
// Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
// store all logical sizes)
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x10000),
(0x1, vec![(0x1, input_start..input_end)])
);
}
/// Test that ShardedRange behaves properly when used on un-sharded data
#[test]
fn sharded_range_fragment_unsharded() {
let shard_identity = ShardIdentity::unsharded();
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x8000),
(
0x10000,
vec![
(0x8000, input_start..input_start.add(0x8000)),
(0x8000, input_start.add(0x8000)..input_start.add(0x10000))
]
)
);
}
#[test]
fn sharded_range_fragment_cross_relation() {
let shard_identity = ShardIdentity::unsharded();
// A range that spans relations: expect fragmentation to give up and return a u32::MAX size
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x8000),
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
);
// Same, but using a sharded identity
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x8000),
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
);
}
#[test]
fn sharded_range_fragment_tiny_nblocks() {
let shard_identity = ShardIdentity::unsharded();
// A range that spans relations: expect fragmentation to give up and return a u32::MAX size
let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap();
let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 16),
(
0x38,
vec![
(16, input_start..input_start.add(16)),
(16, input_start.add(16)..input_start.add(32)),
(16, input_start.add(32)..input_start.add(48)),
(8, input_start.add(48)..input_end),
]
)
);
}
#[test]
fn sharded_range_fragment_fuzz() {
// Use a fixed seed: we don't want to explicitly pick values, but we do want
// the test to be reproducible.
let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef);
for _i in 0..1000 {
let shard_identity = if prng.next_u32() % 2 == 0 {
ShardIdentity::unsharded()
} else {
let shard_count = prng.next_u32() % 127 + 1;
ShardIdentity::new(
ShardNumber((prng.next_u32() % shard_count) as u8),
ShardCount::new(shard_count as u8),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap()
};
let target_nblocks = prng.next_u32() % 65536 + 1;
let start_offset = prng.next_u32() % 16384;
// Try ranges up to 4GiB in size, that are always at least 1
let range_size = prng.next_u32() % 8192 + 1;
// A range that spans relations: expect fragmentation to give up and return a u32::MAX size
let input_start = Key::from_hex("000000067F00000001000004E10000000000")
.unwrap()
.add(start_offset);
let input_end = input_start.add(range_size);
// This test's main success conditions are the invariants baked into do_fragment
let (_total_size, fragments) =
do_fragment(input_start, input_end, &shard_identity, target_nblocks);
// Pick a random key within the range and check it appears in the output
let example_key = input_start.add(prng.next_u32() % range_size);
// Panic on unwrap if it isn't found
let example_key_frag = fragments
.iter()
.find(|f| f.1.contains(&example_key))
.unwrap();
// Check that the fragment containing our random key has a nonzero size if
// that key is shard-local
let example_key_local = !shard_identity.is_key_disposable(&example_key);
if example_key_local {
assert!(example_key_frag.0 > 0);
}
}
}
} }

View File

@@ -1,5 +1,6 @@
#![deny(unsafe_code)] #![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)] #![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp;
pub mod controller_api; pub mod controller_api;
pub mod key; pub mod key;
@@ -10,4 +11,7 @@ pub mod shard;
/// Public API types /// Public API types
pub mod upcall_api; pub mod upcall_api;
pub mod config; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");

View File

@@ -1,4 +1,3 @@
pub mod detach_ancestor;
pub mod partitioning; pub mod partitioning;
pub mod utilization; pub mod utilization;
@@ -9,8 +8,6 @@ use std::{
collections::HashMap, collections::HashMap,
io::{BufRead, Read}, io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize}, num::{NonZeroU64, NonZeroUsize},
str::FromStr,
sync::atomic::AtomicUsize,
time::{Duration, SystemTime}, time::{Duration, SystemTime},
}; };
@@ -23,7 +20,6 @@ use utils::{
history_buffer::HistoryBufferWithDropCounter, history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId}, id::{NodeId, TenantId, TimelineId},
lsn::Lsn, lsn::Lsn,
serde_system_time,
}; };
use crate::controller_api::PlacementPolicy; use crate::controller_api::PlacementPolicy;
@@ -305,107 +301,6 @@ pub struct TenantConfig {
pub heatmap_period: Option<String>, pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>, pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>, pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub switch_aux_file_policy: Option<AuxFilePolicy>,
}
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
/// tenant config. When the first aux file written, the policy will be persisted in the
/// `index_part.json` file and has a limited migration path.
///
/// Currently, we only allow the following migration path:
///
/// Unset -> V1
/// -> V2
/// -> CrossValidation -> V2
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AuxFilePolicy {
/// V1 aux file policy: store everything in AUX_FILE_KEY
V1,
/// V2 aux file policy: store in the AUX_FILE keyspace
V2,
/// Cross validation runs both formats on the write path and does validation
/// on the read path.
CrossValidation,
}
impl AuxFilePolicy {
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
matches!(
(from, to),
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
)
}
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
pub fn default_tenant_config() -> Self {
Self::V1
}
}
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
pub struct AtomicAuxFilePolicy(AtomicUsize);
impl AtomicAuxFilePolicy {
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
Self(AtomicUsize::new(
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
))
}
pub fn load(&self) -> Option<AuxFilePolicy> {
match self.0.load(std::sync::atomic::Ordering::Acquire) {
0 => None,
other => Some(AuxFilePolicy::from_usize(other)),
}
}
pub fn store(&self, policy: Option<AuxFilePolicy>) {
self.0.store(
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
std::sync::atomic::Ordering::Release,
);
}
}
impl AuxFilePolicy {
pub fn to_usize(self) -> usize {
match self {
Self::V1 => 1,
Self::CrossValidation => 2,
Self::V2 => 3,
}
}
pub fn try_from_usize(this: usize) -> Option<Self> {
match this {
1 => Some(Self::V1),
2 => Some(Self::CrossValidation),
3 => Some(Self::V2),
_ => None,
}
}
pub fn from_usize(this: usize) -> Self {
Self::try_from_usize(this).unwrap()
}
}
impl FromStr for AuxFilePolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.to_lowercase();
if s == "v1" {
Ok(Self::V1)
} else if s == "v2" {
Ok(Self::V2)
} else if s == "crossvalidation" || s == "cross_validation" {
Ok(Self::CrossValidation)
} else {
anyhow::bail!("cannot parse {} to aux file policy", s)
}
}
} }
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -532,6 +427,7 @@ pub struct StatusResponse {
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest { pub struct TenantLocationConfigRequest {
pub tenant_id: Option<TenantShardId>,
#[serde(flatten)] #[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
} }
@@ -680,9 +576,6 @@ pub struct TimelineInfo {
pub state: TimelineState, pub state: TimelineState,
pub walreceiver_status: String, pub walreceiver_status: String,
/// The last aux file policy being used on this timeline
pub last_aux_file_policy: Option<AuxFilePolicy>,
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -824,16 +717,6 @@ impl HistoricLayerInfo {
}; };
*field = value; *field = value;
} }
pub fn layer_file_size(&self) -> u64 {
match self {
HistoricLayerInfo::Delta {
layer_file_size, ..
} => *layer_file_size,
HistoricLayerInfo::Image {
layer_file_size, ..
} => *layer_file_size,
}
}
} }
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
@@ -841,16 +724,6 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
pub max_concurrent_downloads: NonZeroUsize, pub max_concurrent_downloads: NonZeroUsize,
} }
#[derive(Debug, Serialize, Deserialize)]
pub struct IngestAuxFilesRequest {
pub aux_files: HashMap<String, String>,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct ListAuxFilesRequest {
pub lsn: Lsn,
}
#[derive(Debug, Serialize, Deserialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DownloadRemoteLayersTaskInfo { pub struct DownloadRemoteLayersTaskInfo {
pub task_id: String, pub task_id: String,
@@ -872,15 +745,10 @@ pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>, pub gc_horizon: Option<u64>,
} }
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
pub pid: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus { pub struct WalRedoManagerStatus {
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>, pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
pub process: Option<WalRedoManagerProcessStatus>, pub pid: Option<u32>,
} }
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
@@ -889,7 +757,11 @@ pub struct WalRedoManagerStatus {
#[derive(Default, Debug, Serialize, Deserialize, Clone)] #[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress { pub struct SecondaryProgress {
/// The remote storage LastModified time of the heatmap object we last downloaded. /// The remote storage LastModified time of the heatmap object we last downloaded.
pub heatmap_mtime: Option<serde_system_time::SystemTime>, #[serde(
serialize_with = "opt_ser_rfc3339_millis",
deserialize_with = "opt_deser_rfc3339_millis"
)]
pub heatmap_mtime: Option<SystemTime>,
/// The number of layers currently on-disk /// The number of layers currently on-disk
pub layers_downloaded: usize, pub layers_downloaded: usize,
@@ -902,64 +774,27 @@ pub struct SecondaryProgress {
pub bytes_total: u64, pub bytes_total: u64,
} }
#[derive(Serialize, Deserialize, Debug)] fn opt_ser_rfc3339_millis<S: serde::Serializer>(
pub struct TenantScanRemoteStorageShard { ts: &Option<SystemTime>,
pub tenant_shard_id: TenantShardId, serializer: S,
pub generation: Option<u32>, ) -> Result<S::Ok, S::Error> {
} match ts {
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
#[derive(Serialize, Deserialize, Debug, Default)] None => serializer.serialize_none(),
pub struct TenantScanRemoteStorageResponse {
pub shards: Vec<TenantScanRemoteStorageShard>,
}
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "snake_case")]
pub enum TenantSorting {
ResidentSize,
MaxLogicalSize,
}
impl Default for TenantSorting {
fn default() -> Self {
Self::ResidentSize
} }
} }
#[derive(Serialize, Deserialize, Debug, Clone)] fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
pub struct TopTenantShardsRequest { where
// How would you like to sort the tenants? D: serde::de::Deserializer<'de>,
pub order_by: TenantSorting, {
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
// How many results? match s {
pub limit: usize, None => Ok(None),
Some(s) => humantime::parse_rfc3339(&s)
// Omit tenants with more than this many shards (e.g. if this is the max number of shards .map_err(serde::de::Error::custom)
// that the caller would ever split to) .map(Some),
pub where_shards_lt: Option<ShardCount>, }
// Omit tenants where the ordering metric is less than this (this is an optimization to
// let us quickly exclude numerous tiny shards)
pub where_gt: Option<u64>,
}
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
pub struct TopTenantShardItem {
pub id: TenantShardId,
/// Total size of layers on local disk for all timelines in this tenant
pub resident_size: u64,
/// Total size of layers in remote storage for all timelines in this tenant
pub physical_size: u64,
/// The largest logical size of a timeline within this tenant
pub max_logical_size: u64,
}
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TopTenantShardsResponse {
pub shards: Vec<TopTenantShardItem>,
} }
pub mod virtual_file { pub mod virtual_file {
@@ -1029,72 +864,39 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
} }
} }
// In the V2 protocol version, a GetPage request contains two LSN values:
//
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
// "get the latest version present". It's used by the primary server, which knows that no one else
// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
//
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
// modified between 'not_modified_since' and the request LSN. It's always correct to set
// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
// request without waiting for 'request_lsn' to arrive.
//
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
// standby to request a page at a particular non-latest LSN, and also include the
// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
// difference in the responses between V1 and V2.
//
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
// maps the old format requests to the new format.
//
#[derive(Clone, Copy)]
pub enum PagestreamProtocolVersion {
V1,
V2,
}
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub struct PagestreamExistsRequest { pub struct PagestreamExistsRequest {
pub request_lsn: Lsn, pub latest: bool,
pub not_modified_since: Lsn, pub lsn: Lsn,
pub rel: RelTag, pub rel: RelTag,
} }
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub struct PagestreamNblocksRequest { pub struct PagestreamNblocksRequest {
pub request_lsn: Lsn, pub latest: bool,
pub not_modified_since: Lsn, pub lsn: Lsn,
pub rel: RelTag, pub rel: RelTag,
} }
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetPageRequest { pub struct PagestreamGetPageRequest {
pub request_lsn: Lsn, pub latest: bool,
pub not_modified_since: Lsn, pub lsn: Lsn,
pub rel: RelTag, pub rel: RelTag,
pub blkno: u32, pub blkno: u32,
} }
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub struct PagestreamDbSizeRequest { pub struct PagestreamDbSizeRequest {
pub request_lsn: Lsn, pub latest: bool,
pub not_modified_since: Lsn, pub lsn: Lsn,
pub dbnode: u32, pub dbnode: u32,
} }
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetSlruSegmentRequest { pub struct PagestreamGetSlruSegmentRequest {
pub request_lsn: Lsn, pub latest: bool,
pub not_modified_since: Lsn, pub lsn: Lsn,
pub kind: u8, pub kind: u8,
pub segno: u32, pub segno: u32,
} }
@@ -1141,16 +943,14 @@ pub struct TenantHistorySize {
} }
impl PagestreamFeMessage { impl PagestreamFeMessage {
/// Serialize a compute -> pageserver message. This is currently only used in testing
/// tools. Always uses protocol version 2.
pub fn serialize(&self) -> Bytes { pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new(); let mut bytes = BytesMut::new();
match self { match self {
Self::Exists(req) => { Self::Exists(req) => {
bytes.put_u8(0); bytes.put_u8(0);
bytes.put_u64(req.request_lsn.0); bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.not_modified_since.0); bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode); bytes.put_u32(req.rel.relnode);
@@ -1159,8 +959,8 @@ impl PagestreamFeMessage {
Self::Nblocks(req) => { Self::Nblocks(req) => {
bytes.put_u8(1); bytes.put_u8(1);
bytes.put_u64(req.request_lsn.0); bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.not_modified_since.0); bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode); bytes.put_u32(req.rel.relnode);
@@ -1169,8 +969,8 @@ impl PagestreamFeMessage {
Self::GetPage(req) => { Self::GetPage(req) => {
bytes.put_u8(2); bytes.put_u8(2);
bytes.put_u64(req.request_lsn.0); bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.not_modified_since.0); bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode); bytes.put_u32(req.rel.relnode);
@@ -1180,15 +980,15 @@ impl PagestreamFeMessage {
Self::DbSize(req) => { Self::DbSize(req) => {
bytes.put_u8(3); bytes.put_u8(3);
bytes.put_u64(req.request_lsn.0); bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.not_modified_since.0); bytes.put_u64(req.lsn.0);
bytes.put_u32(req.dbnode); bytes.put_u32(req.dbnode);
} }
Self::GetSlruSegment(req) => { Self::GetSlruSegment(req) => {
bytes.put_u8(4); bytes.put_u8(4);
bytes.put_u64(req.request_lsn.0); bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.not_modified_since.0); bytes.put_u64(req.lsn.0);
bytes.put_u8(req.kind); bytes.put_u8(req.kind);
bytes.put_u32(req.segno); bytes.put_u32(req.segno);
} }
@@ -1197,40 +997,18 @@ impl PagestreamFeMessage {
bytes.into() bytes.into()
} }
pub fn parse<R: std::io::Read>( pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
body: &mut R, // TODO these gets can fail
protocol_version: PagestreamProtocolVersion,
) -> anyhow::Result<PagestreamFeMessage> {
// these correspond to the NeonMessageTag enum in pagestore_client.h // these correspond to the NeonMessageTag enum in pagestore_client.h
// //
// TODO: consider using protobuf or serde bincode for less error prone // TODO: consider using protobuf or serde bincode for less error prone
// serialization. // serialization.
let msg_tag = body.read_u8()?; let msg_tag = body.read_u8()?;
let (request_lsn, not_modified_since) = match protocol_version {
PagestreamProtocolVersion::V2 => (
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
PagestreamProtocolVersion::V1 => {
// In the old protocol, each message starts with a boolean 'latest' flag,
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
// 'not_modified_since', used in the new protocol version.
let latest = body.read_u8()? != 0;
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
if latest {
(Lsn::MAX, request_lsn) // get latest version
} else {
(request_lsn, request_lsn) // get version at specified LSN
}
}
};
// The rest of the messages are the same between V1 and V2
match msg_tag { match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn, latest: body.read_u8()? != 0,
not_modified_since, lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag { rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?, spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?, dbnode: body.read_u32::<BigEndian>()?,
@@ -1239,8 +1017,8 @@ impl PagestreamFeMessage {
}, },
})), })),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn, latest: body.read_u8()? != 0,
not_modified_since, lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag { rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?, spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?, dbnode: body.read_u32::<BigEndian>()?,
@@ -1249,8 +1027,8 @@ impl PagestreamFeMessage {
}, },
})), })),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn, latest: body.read_u8()? != 0,
not_modified_since, lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag { rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?, spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?, dbnode: body.read_u32::<BigEndian>()?,
@@ -1260,14 +1038,14 @@ impl PagestreamFeMessage {
blkno: body.read_u32::<BigEndian>()?, blkno: body.read_u32::<BigEndian>()?,
})), })),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn, latest: body.read_u8()? != 0,
not_modified_since, lsn: Lsn::from(body.read_u64::<BigEndian>()?),
dbnode: body.read_u32::<BigEndian>()?, dbnode: body.read_u32::<BigEndian>()?,
})), })),
4 => Ok(PagestreamFeMessage::GetSlruSegment( 4 => Ok(PagestreamFeMessage::GetSlruSegment(
PagestreamGetSlruSegmentRequest { PagestreamGetSlruSegmentRequest {
request_lsn, latest: body.read_u8()? != 0,
not_modified_since, lsn: Lsn::from(body.read_u64::<BigEndian>()?),
kind: body.read_u8()?, kind: body.read_u8()?,
segno: body.read_u32::<BigEndian>()?, segno: body.read_u32::<BigEndian>()?,
}, },
@@ -1395,8 +1173,8 @@ mod tests {
// Test serialization/deserialization of PagestreamFeMessage // Test serialization/deserialization of PagestreamFeMessage
let messages = vec![ let messages = vec![
PagestreamFeMessage::Exists(PagestreamExistsRequest { PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn: Lsn(4), latest: true,
not_modified_since: Lsn(3), lsn: Lsn(4),
rel: RelTag { rel: RelTag {
forknum: 1, forknum: 1,
spcnode: 2, spcnode: 2,
@@ -1405,8 +1183,8 @@ mod tests {
}, },
}), }),
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn: Lsn(4), latest: false,
not_modified_since: Lsn(4), lsn: Lsn(4),
rel: RelTag { rel: RelTag {
forknum: 1, forknum: 1,
spcnode: 2, spcnode: 2,
@@ -1415,8 +1193,8 @@ mod tests {
}, },
}), }),
PagestreamFeMessage::GetPage(PagestreamGetPageRequest { PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn: Lsn(4), latest: true,
not_modified_since: Lsn(3), lsn: Lsn(4),
rel: RelTag { rel: RelTag {
forknum: 1, forknum: 1,
spcnode: 2, spcnode: 2,
@@ -1426,16 +1204,14 @@ mod tests {
blkno: 7, blkno: 7,
}), }),
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn: Lsn(4), latest: true,
not_modified_since: Lsn(3), lsn: Lsn(4),
dbnode: 7, dbnode: 7,
}), }),
]; ];
for msg in messages { for msg in messages {
let bytes = msg.serialize(); let bytes = msg.serialize();
let reconstructed = let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
.unwrap();
assert!(msg == reconstructed); assert!(msg == reconstructed);
} }
} }
@@ -1594,59 +1370,4 @@ mod tests {
assert_eq!(actual, expected, "example on {line}"); assert_eq!(actual, expected, "example on {line}");
} }
} }
#[test]
fn test_aux_file_migration_path() {
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::V1
));
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::V2
));
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::CrossValidation
));
// Self-migration is not a valid migration path, and the caller should handle it by itself.
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::V2
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::CrossValidation
));
// Migrations not allowed
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::V2
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::CrossValidation
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::CrossValidation
));
// Migrations allowed
assert!(AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::V2
));
}
} }

View File

@@ -1,6 +0,0 @@
use utils::id::TimelineId;
#[derive(Default, serde::Serialize)]
pub struct AncestorDetached {
pub reparented_timelines: Vec<TimelineId>,
}

View File

@@ -1,11 +1,9 @@
use utils::lsn::Lsn; use utils::lsn::Lsn;
use crate::keyspace::SparseKeySpace;
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub struct Partitioning { pub struct Partitioning {
pub keys: crate::keyspace::KeySpace, pub keys: crate::keyspace::KeySpace,
pub sparse_keys: crate::keyspace::SparseKeySpace,
pub at_lsn: Lsn, pub at_lsn: Lsn,
} }
@@ -34,8 +32,6 @@ impl serde::Serialize for Partitioning {
let mut map = serializer.serialize_map(Some(2))?; let mut map = serializer.serialize_map(Some(2))?;
map.serialize_key("keys")?; map.serialize_key("keys")?;
map.serialize_value(&KeySpace(&self.keys))?; map.serialize_value(&KeySpace(&self.keys))?;
map.serialize_key("sparse_keys")?;
map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
map.serialize_key("at_lsn")?; map.serialize_key("at_lsn")?;
map.serialize_value(&WithDisplay(&self.at_lsn))?; map.serialize_value(&WithDisplay(&self.at_lsn))?;
map.end() map.end()
@@ -103,7 +99,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
#[derive(serde::Deserialize)] #[derive(serde::Deserialize)]
struct De { struct De {
keys: KeySpace, keys: KeySpace,
sparse_keys: KeySpace,
#[serde_as(as = "serde_with::DisplayFromStr")] #[serde_as(as = "serde_with::DisplayFromStr")]
at_lsn: Lsn, at_lsn: Lsn,
} }
@@ -112,7 +107,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
Ok(Self { Ok(Self {
at_lsn: de.at_lsn, at_lsn: de.at_lsn,
keys: de.keys.0, keys: de.keys.0,
sparse_keys: SparseKeySpace(de.sparse_keys.0),
}) })
} }
} }
@@ -139,12 +133,6 @@ mod tests {
"030000000000000000000000000000000003" "030000000000000000000000000000000003"
] ]
], ],
"sparse_keys": [
[
"620000000000000000000000000000000000",
"620000000000000000000000000000000003"
]
],
"at_lsn": "0/2240160" "at_lsn": "0/2240160"
} }
"#; "#;

View File

@@ -1,4 +1,4 @@
use utils::serde_system_time::SystemTime; use std::time::SystemTime;
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant. /// the next tenant.
@@ -21,9 +21,28 @@ pub struct PageserverUtilization {
/// When was this snapshot captured, pageserver local time. /// When was this snapshot captured, pageserver local time.
/// ///
/// Use millis to give confidence that the value is regenerated often enough. /// Use millis to give confidence that the value is regenerated often enough.
#[serde(
serialize_with = "ser_rfc3339_millis",
deserialize_with = "deser_rfc3339_millis"
)]
pub captured_at: SystemTime, pub captured_at: SystemTime,
} }
fn ser_rfc3339_millis<S: serde::Serializer>(
ts: &SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
/// ///
/// Instead of newtype, use this because a newtype would get require handling deserializing values /// Instead of newtype, use this because a newtype would get require handling deserializing values
@@ -50,9 +69,7 @@ mod tests {
disk_usage_bytes: u64::MAX, disk_usage_bytes: u64::MAX,
free_space_bytes: 0, free_space_bytes: 0,
utilization_score: u64::MAX, utilization_score: u64::MAX,
captured_at: SystemTime( captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
),
}; };
let s = serde_json::to_string(&doc).unwrap(); let s = serde_json::to_string(&doc).unwrap();

View File

@@ -5,99 +5,21 @@ use crate::{
models::ShardParameters, models::ShardParameters,
}; };
use hex::FromHex; use hex::FromHex;
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use utils::id::TenantId; use utils::id::TenantId;
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
///
/// This module contains a variety of types used to represent the concept of sharding
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
/// we provide an summary here.
///
/// Types used to describe shards:
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
/// a shard suffix.
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
/// tenant, such as layer files.
/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
/// four hex digits. An unsharded tenant is `0000`.
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
///
/// Types used to describe the parameters for data distribution in a sharded tenant:
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
/// multiple shards. Its value is given in 8kiB pages.
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
/// always zero: this is provided for future upgrades that might introduce different
/// data distribution schemes.
///
/// Examples:
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
/// and their slugs are 0004, 0104, 0204, and 0304.
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8); pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(u8); pub struct ShardCount(u8);
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
/// and to check whether that [`ShardNumber`] is the same as the current shard.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
/// Formatting helper, for generating the `shard_id` label in traces.
struct ShardSlug<'a>(&'a TenantShardId);
/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardCount { impl ShardCount {
pub const MAX: Self = Self(u8::MAX); pub const MAX: Self = Self(u8::MAX);
/// The internal value of a ShardCount may be zero, which means "1 shard, but use /// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known /// legacy format for TenantShardId that excludes the shard suffix", also known
/// as [`TenantShardId::unsharded`]. /// as `TenantShardId::unsharded`.
/// ///
/// This method returns the actual number of shards, i.e. if our internal value is /// This method returns the actual number of shards, i.e. if our internal value is
/// zero, we return 1 (unsharded tenants have 1 shard). /// zero, we return 1 (unsharded tenants have 1 shard).
@@ -116,16 +38,13 @@ impl ShardCount {
self.0 self.0
} }
/// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
/// uses the legacy format for `TenantShardId`. See also the documentation for
/// [`Self::count`].
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.0 == 0 self.0 == 0
} }
/// `v` may be zero, or the number of shards in the tenant. `v` is what /// `v` may be zero, or the number of shards in the tenant. `v` is what
/// [`Self::literal`] would return. /// [`Self::literal`] would return.
pub const fn new(val: u8) -> Self { pub fn new(val: u8) -> Self {
Self(val) Self(val)
} }
} }
@@ -134,6 +53,33 @@ impl ShardNumber {
pub const MAX: Self = Self(u8::MAX); pub const MAX: Self = Self(u8::MAX);
} }
/// TenantShardId identify the units of work for the Pageserver.
///
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
///
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// Historically, tenants could not have multiple shards, and were identified
/// by TenantId. To support this, TenantShardId has a special legacy
/// mode where `shard_count` is equal to zero: this represents a single-sharded
/// tenant which should be written as a TenantId with no suffix.
///
/// The human-readable encoding of TenantShardId, such as used in API URLs,
/// is both forward and backward compatible: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
///
/// Note that the binary encoding is _not_ backward compatible, because
/// at the time sharding is introduced, there are no existing binary structures
/// containing TenantId that we need to handle.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl TenantShardId { impl TenantShardId {
pub fn unsharded(tenant_id: TenantId) -> Self { pub fn unsharded(tenant_id: TenantId) -> Self {
Self { Self {
@@ -165,13 +111,10 @@ impl TenantShardId {
} }
/// Convenience for code that has special behavior on the 0th shard. /// Convenience for code that has special behavior on the 0th shard.
pub fn is_shard_zero(&self) -> bool { pub fn is_zero(&self) -> bool {
self.shard_number == ShardNumber(0) self.shard_number == ShardNumber(0)
} }
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
} }
@@ -207,6 +150,9 @@ impl TenantShardId {
} }
} }
/// Formatting helper
struct ShardSlug<'a>(&'a TenantShardId);
impl<'a> std::fmt::Display for ShardSlug<'a> { impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!( write!(
@@ -276,6 +222,16 @@ impl From<[u8; 18]> for TenantShardId {
} }
} }
/// For use within the context of a particular tenant, when we need to know which
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardIndex { impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self { pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self { Self {
@@ -290,9 +246,6 @@ impl ShardIndex {
} }
} }
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
} }
@@ -360,8 +313,6 @@ impl Serialize for TenantShardId {
if serializer.is_human_readable() { if serializer.is_human_readable() {
serializer.collect_str(self) serializer.collect_str(self)
} else { } else {
// Note: while human encoding of [`TenantShardId`] is backward and forward
// compatible, this binary encoding is not.
let mut packed: [u8; 18] = [0; 18]; let mut packed: [u8; 18] = [0; 18];
packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
packed[16] = self.shard_number.0; packed[16] = self.shard_number.0;
@@ -439,6 +390,16 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
/// Default stripe size in pages: 256MiB divided by 8kiB page size. /// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
/// The ShardIdentity contains the information needed for one member of map
/// to resolve a key to a shard, and then check whether that shard is ==self.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
#[derive(thiserror::Error, Debug, PartialEq, Eq)] #[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ShardConfigError { pub enum ShardConfigError {
#[error("Invalid shard count")] #[error("Invalid shard count")]
@@ -453,7 +414,7 @@ impl ShardIdentity {
/// An identity with number=0 count=0 is a "none" identity, which represents legacy /// An identity with number=0 count=0 is a "none" identity, which represents legacy
/// tenants. Modern single-shard tenants should not use this: they should /// tenants. Modern single-shard tenants should not use this: they should
/// have number=0 count=1. /// have number=0 count=1.
pub const fn unsharded() -> Self { pub fn unsharded() -> Self {
Self { Self {
number: ShardNumber(0), number: ShardNumber(0),
count: ShardCount(0), count: ShardCount(0),
@@ -478,9 +439,6 @@ impl ShardIdentity {
} }
} }
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.number == ShardNumber(0) && self.count == ShardCount(0) self.number == ShardNumber(0) && self.count == ShardCount(0)
} }
@@ -529,8 +487,6 @@ impl ShardIdentity {
} }
/// Return true if the key should be ingested by this shard /// Return true if the key should be ingested by this shard
///
/// Shards must ingest _at least_ keys which return true from this check.
pub fn is_key_local(&self, key: &Key) -> bool { pub fn is_key_local(&self, key: &Key) -> bool {
assert!(!self.is_broken()); assert!(!self.is_broken());
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
@@ -541,9 +497,7 @@ impl ShardIdentity {
} }
/// Return true if the key should be discarded if found in this shard's /// Return true if the key should be discarded if found in this shard's
/// data store, e.g. during compaction after a split. /// data store, e.g. during compaction after a split
///
/// Shards _may_ drop keys which return false here, but are not obliged to.
pub fn is_key_disposable(&self, key: &Key) -> bool { pub fn is_key_disposable(&self, key: &Key) -> bool {
if key_is_shard0(key) { if key_is_shard0(key) {
// Q: Why can't we dispose of shard0 content if we're not shard 0? // Q: Why can't we dispose of shard0 content if we're not shard 0?
@@ -569,7 +523,7 @@ impl ShardIdentity {
/// Convenience for checking if this identity is the 0th shard in a tenant, /// Convenience for checking if this identity is the 0th shard in a tenant,
/// for special cases on shard 0 such as ingesting relation sizes. /// for special cases on shard 0 such as ingesting relation sizes.
pub fn is_shard_zero(&self) -> bool { pub fn is_zero(&self) -> bool {
self.number == ShardNumber(0) self.number == ShardNumber(0)
} }
} }
@@ -652,13 +606,7 @@ fn key_is_shard0(key: &Key) -> bool {
// relation pages are distributed to shards other than shard zero. Everything else gets // relation pages are distributed to shards other than shard zero. Everything else gets
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup // stored on shard 0. This guarantees that shard 0 can independently serve basebackup
// requests, and any request other than those for particular blocks in relations. // requests, and any request other than those for particular blocks in relations.
// !is_rel_block_key(key)
// The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
// type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
// because they must be included in basebackups.
let is_initfork = key.field5 == INIT_FORKNUM;
!is_rel_block_key(key) || is_initfork
} }
/// Provide the same result as the function in postgres `hashfn.h` with the same name /// Provide the same result as the function in postgres `hashfn.h` with the same name

View File

@@ -118,9 +118,7 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
// Likewise for these, although the assumption that these don't change is a little more iffy. // Likewise for these, although the assumption that these don't change is a little more iffy.
pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
pub use v14::bindings::{PageHeaderData, XLogRecord}; pub use v14::bindings::{PageHeaderData, XLogRecord};
pub use v14::xlog_utils::{ pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
};
pub use v14::bindings::{CheckPoint, ControlFileData}; pub use v14::bindings::{CheckPoint, ControlFileData};

View File

@@ -331,10 +331,7 @@ impl CheckPoint {
/// Returns 'true' if the XID was updated. /// Returns 'true' if the XID was updated.
pub fn update_next_xid(&mut self, xid: u32) -> bool { pub fn update_next_xid(&mut self, xid: u32) -> bool {
// nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround. // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
let mut new_xid = std::cmp::max( let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
xid.wrapping_add(1),
pg_constants::FIRST_NORMAL_TRANSACTION_ID,
);
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
new_xid = new_xid =
@@ -370,16 +367,8 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE); let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);
let first_page_only = seg_off < XLOG_BLCKSZ; let first_page_only = seg_off < XLOG_BLCKSZ;
// If first records starts in the middle of the page, pretend in page header let (shdr_rem_len, infoflags) = if first_page_only {
// there is a fake record which ends where first real record starts. This (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
// makes pg_waldump etc happy.
let (shdr_rem_len, infoflags) = if first_page_only && seg_off > 0 {
assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD);
// xlp_rem_len doesn't include page header, hence the subtraction.
(
seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD,
pg_constants::XLP_FIRST_IS_CONTRECORD,
)
} else { } else {
(0, 0) (0, 0)
}; };
@@ -408,22 +397,20 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
if !first_page_only { if !first_page_only {
let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize; let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
// see comments above about XLP_FIRST_IS_CONTRECORD and xlp_rem_len.
let (xlp_rem_len, xlp_info) = if page_off > 0 {
assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
(
(page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32,
pg_constants::XLP_FIRST_IS_CONTRECORD,
)
} else {
(0, 0)
};
let header = XLogPageHeaderData { let header = XLogPageHeaderData {
xlp_magic: XLOG_PAGE_MAGIC as u16, xlp_magic: XLOG_PAGE_MAGIC as u16,
xlp_info, xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
pg_constants::XLP_FIRST_IS_CONTRECORD
} else {
0
},
xlp_tli: PG_TLI, xlp_tli: PG_TLI,
xlp_pageaddr: lsn.page_lsn().0, xlp_pageaddr: lsn.page_lsn().0,
xlp_rem_len, xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
page_off as u32
} else {
0u32
},
..Default::default() // Put 0 in padding fields. ..Default::default() // Put 0 in padding fields.
}; };
let hdr_bytes = header.encode()?; let hdr_bytes = header.encode()?;

View File

@@ -4,9 +4,7 @@ use log::*;
use postgres::types::PgLsn; use postgres::types::PgLsn;
use postgres::Client; use postgres::Client;
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
use postgres_ffi::{ use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::process::Command; use std::process::Command;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
@@ -264,21 +262,11 @@ fn craft_internal<C: postgres::GenericClient>(
intermediate_lsns.insert(0, initial_lsn); intermediate_lsns.insert(0, initial_lsn);
} }
// Some records may be not flushed, e.g. non-transactional logical messages. Flush now. // Some records may be not flushed, e.g. non-transactional logical messages.
// //
// If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
// returns the position just after the page header on the next page. That's where the next // because pg_current_wal_insert_lsn skips page headers.
// record will be inserted. But the page header hasn't actually been written to the WAL client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
// yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
// error. Because of that, if the insert location is just after a page header, back off to
// previous page boundary.
let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
} else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
}
client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
Ok(intermediate_lsns) Ok(intermediate_lsns)
} }
@@ -332,49 +320,38 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
client.execute("CREATE table t(x int)", &[])?; client.execute("CREATE table t(x int)", &[])?;
// Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
// will use carefully-sized logical messages to advance WAL insert location such // We will use logical message as the padding. We start with detecting how much WAL
// that there is just enough space on the page for the XLOG_SWITCH record. // it takes for one logical message, considering all alignments and headers.
loop { let base_wal_advance = {
// We start with measuring how much WAL it takes for one logical message,
// considering all alignments and headers.
let before_lsn = client.pg_current_wal_insert_lsn()?; let before_lsn = client.pg_current_wal_insert_lsn()?;
// Small non-empty message bigger than few bytes is more likely than an empty
// message to have the same format as the big padding message.
client.execute( client.execute(
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
&[], &[],
)?; )?;
let after_lsn = client.pg_current_wal_insert_lsn()?; // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
(u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
// Did the record cross a page boundary? If it did, start over. Crossing a + XLOG_SIZE_OF_XLOG_RECORD
// page boundary adds to the apparent size of the record because of the page };
// header, which throws off the calculation. let mut remaining_lsn =
if u64::from(before_lsn) / XLOG_BLCKSZ as u64 XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
!= u64::from(after_lsn) / XLOG_BLCKSZ as u64 if remaining_lsn < base_wal_advance {
{ remaining_lsn += XLOG_BLCKSZ;
continue;
}
// base_size is the size of a logical message without the payload
let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
// Is there enough space on the page for another logical message and an
// XLOG_SWITCH? If not, start over.
let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
continue;
}
// We will write another logical message, such that after the logical message
// record, there will be space for exactly one XLOG_SWITCH. How large should
// the logical message's payload be? An XLOG_SWITCH record has no data => its
// size is exactly XLOG_SIZE_OF_XLOG_RECORD.
let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
client.execute(
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
&[&(repeats as i32)],
)?;
break;
} }
let repeats = 10 + remaining_lsn - base_wal_advance;
info!(
"current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
client.pg_current_wal_insert_lsn()?,
remaining_lsn,
base_wal_advance,
repeats
);
client.execute(
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
&[&(repeats as i32)],
)?;
info!( info!(
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
client.pg_current_wal_insert_lsn()?, client.pg_current_wal_insert_lsn()?,

View File

@@ -38,7 +38,6 @@ azure_storage_blobs.workspace = true
futures-util.workspace = true futures-util.workspace = true
http-types.workspace = true http-types.workspace = true
itertools.workspace = true itertools.workspace = true
sync_wrapper = { workspace = true, features = ["futures"] }
[dev-dependencies] [dev-dependencies]
camino-tempfile.workspace = true camino-tempfile.workspace = true

View File

@@ -3,7 +3,6 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::HashMap; use std::collections::HashMap;
use std::env; use std::env;
use std::io;
use std::num::NonZeroU32; use std::num::NonZeroU32;
use std::pin::Pin; use std::pin::Pin;
use std::str::FromStr; use std::str::FromStr;
@@ -21,7 +20,6 @@ use azure_storage_blobs::blob::CopyStatus;
use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::prelude::ClientBuilder;
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
use bytes::Bytes; use bytes::Bytes;
use futures::future::Either;
use futures::stream::Stream; use futures::stream::Stream;
use futures_util::StreamExt; use futures_util::StreamExt;
use futures_util::TryStreamExt; use futures_util::TryStreamExt;
@@ -29,7 +27,6 @@ use http_types::{StatusCode, Url};
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing::debug; use tracing::debug;
use crate::RemoteStorageActivity;
use crate::{ use crate::{
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download, error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
@@ -131,12 +128,12 @@ impl AzureBlobStorage {
let kind = RequestKind::Get; let kind = RequestKind::Get;
let _permit = self.permit(kind, cancel).await?; let _permit = self.permit(kind, cancel).await?;
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let mut etag = None; let mut etag = None;
let mut last_modified = None; let mut last_modified = None;
let mut metadata = HashMap::new(); let mut metadata = HashMap::new();
// TODO give proper streaming response instead of buffering into RAM
// https://github.com/neondatabase/neon/issues/5563
let download = async { let download = async {
let response = builder let response = builder
@@ -155,46 +152,39 @@ impl AzureBlobStorage {
Err(_elapsed) => Err(DownloadError::Timeout), Err(_elapsed) => Err(DownloadError::Timeout),
}); });
let mut response = Box::pin(response); let mut response = std::pin::pin!(response);
let Some(part) = response.next().await else { let mut bufs = Vec::new();
while let Some(part) = response.next().await {
let part = part?;
if etag.is_none() {
etag = Some(part.blob.properties.etag);
}
if last_modified.is_none() {
last_modified = Some(part.blob.properties.last_modified.into());
}
if let Some(blob_meta) = part.blob.metadata {
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
}
let data = part
.data
.collect()
.await
.map_err(|e| DownloadError::Other(e.into()))?;
bufs.push(data);
}
if bufs.is_empty() {
return Err(DownloadError::Other(anyhow::anyhow!( return Err(DownloadError::Other(anyhow::anyhow!(
"Azure GET response contained no response body" "Azure GET response contained no buffers"
))); )));
};
let part = part?;
if etag.is_none() {
etag = Some(part.blob.properties.etag);
} }
if last_modified.is_none() {
last_modified = Some(part.blob.properties.last_modified.into());
}
if let Some(blob_meta) = part.blob.metadata {
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
}
// unwrap safety: if these were None, bufs would be empty and we would have returned an error already // unwrap safety: if these were None, bufs would be empty and we would have returned an error already
let etag = etag.unwrap(); let etag = etag.unwrap();
let last_modified = last_modified.unwrap(); let last_modified = last_modified.unwrap();
let tail_stream = response
.map(|part| match part {
Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))),
Err(e) => {
Either::Right(futures::stream::once(async { Err(io::Error::other(e)) }))
}
})
.flatten();
let stream = part
.data
.map(|r| r.map_err(io::Error::other))
.chain(sync_wrapper::SyncStream::new(tail_stream));
//.chain(SyncStream::from_pin(Box::pin(tail_stream)));
let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream);
Ok(Download { Ok(Download {
download_stream: Box::pin(download_stream), download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
etag, etag,
last_modified, last_modified,
metadata: Some(StorageMetadata(metadata)), metadata: Some(StorageMetadata(metadata)),
@@ -203,10 +193,7 @@ impl AzureBlobStorage {
tokio::select! { tokio::select! {
bufs = download => bufs, bufs = download => bufs,
cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { _ = cancel.cancelled() => Err(DownloadError::Cancelled),
TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
},
} }
} }
@@ -526,10 +513,6 @@ impl RemoteStorage for AzureBlobStorage {
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
Err(TimeTravelError::Unimplemented) Err(TimeTravelError::Unimplemented)
} }
fn activity(&self) -> RemoteStorageActivity {
self.concurrency_limiter.activity()
}
} }
pin_project_lite::pin_project! { pin_project_lite::pin_project! {

View File

@@ -21,13 +21,11 @@ use std::{
fmt::Debug, fmt::Debug,
num::{NonZeroU32, NonZeroUsize}, num::{NonZeroU32, NonZeroUsize},
pin::Pin, pin::Pin,
str::FromStr,
sync::Arc, sync::Arc,
time::{Duration, SystemTime}, time::{Duration, SystemTime},
}; };
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use aws_sdk_s3::types::StorageClass;
use camino::{Utf8Path, Utf8PathBuf}; use camino::{Utf8Path, Utf8PathBuf};
use bytes::Bytes; use bytes::Bytes;
@@ -55,11 +53,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/> /// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
/// Set this limit analogously to the S3 limit /// We set this a little bit low as we currently buffer the entire file into RAM
/// ///
/// Here, a limit of max 20k concurrent connections was noted. /// Here, a limit of max 20k concurrent connections was noted.
/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections> /// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
/// No limits on the client side, which currenltly means 1000 for AWS S3. /// No limits on the client side, which currenltly means 1000 for AWS S3.
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax> /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None; pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
@@ -136,11 +134,6 @@ impl RemotePath {
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> { pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
self.0.strip_prefix(&p.0) self.0.strip_prefix(&p.0)
} }
pub fn add_trailing_slash(&self) -> Self {
// Unwrap safety inputs are guararnteed to be valid UTF-8
Self(format!("{}/", self.0).try_into().unwrap())
}
} }
/// We don't need callers to be able to pass arbitrary delimiters: just control /// We don't need callers to be able to pass arbitrary delimiters: just control
@@ -164,21 +157,47 @@ pub struct Listing {
/// providing basic CRUD operations for storage files. /// providing basic CRUD operations for storage files.
#[allow(async_fn_in_trait)] #[allow(async_fn_in_trait)]
pub trait RemoteStorage: Send + Sync + 'static { pub trait RemoteStorage: Send + Sync + 'static {
/// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2. /// Lists all top level subdirectories for a given prefix
/// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`) /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
/// /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
/// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not /// so this method doesnt need to.
/// from the absolute root of the bucket. async fn list_prefixes(
/// &self,
/// `mode` configures whether to use a delimiter. Without a delimiter all keys prefix: Option<&RemotePath>,
/// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of cancel: &CancellationToken,
/// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are ) -> Result<Vec<RemotePath>, DownloadError> {
/// returned in `keys` (). let result = self
/// .list(prefix, ListingMode::WithDelimiter, None, cancel)
/// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function .await?
/// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on .prefixes;
/// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. Ok(result)
}
/// Lists all files in directory "recursively"
/// (not really recursively, because AWS has a flat namespace)
/// Note: This is subtely different than list_prefixes,
/// because it is for listing files instead of listing
/// names sharing common prefixes.
/// For example,
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
/// whereas,
/// list_prefixes("foo/bar/") = ["cat", "dog"]
/// See `test_real_s3.rs` for more details.
/// ///
/// max_keys limits max number of keys returned; None means unlimited.
async fn list_files(
&self,
prefix: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
let result = self
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
.await?
.keys;
Ok(result)
}
async fn list( async fn list(
&self, &self,
prefix: Option<&RemotePath>, prefix: Option<&RemotePath>,
@@ -263,17 +282,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
done_if_after: SystemTime, done_if_after: SystemTime,
cancel: &CancellationToken, cancel: &CancellationToken,
) -> Result<(), TimeTravelError>; ) -> Result<(), TimeTravelError>;
/// Query how busy we currently are: may be used by callers which wish to politely
/// back off if there are already a lot of operations underway.
fn activity(&self) -> RemoteStorageActivity;
}
pub struct RemoteStorageActivity {
pub read_available: usize,
pub read_total: usize,
pub write_available: usize,
pub write_total: usize,
} }
/// DownloadStream is sensitive to the timeout and cancellation used with the original /// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -328,6 +336,41 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
} }
} }
// A function for listing all the files in a "directory"
// Example:
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
//
// max_keys limits max number of keys returned; None means unlimited.
pub async fn list_files(
&self,
folder: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
match self {
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
}
}
// lists common *prefixes*, if any of files
// Example:
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
pub async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
match self {
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
}
}
/// See [`RemoteStorage::upload`] /// See [`RemoteStorage::upload`]
pub async fn upload( pub async fn upload(
&self, &self,
@@ -455,15 +498,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
} }
} }
} }
pub fn activity(&self) -> RemoteStorageActivity {
match self {
Self::LocalFs(s) => s.activity(),
Self::AwsS3(s) => s.activity(),
Self::AzureBlob(s) => s.activity(),
Self::Unreliable(s) => s.activity(),
}
}
} }
impl GenericRemoteStorage { impl GenericRemoteStorage {
@@ -531,16 +565,6 @@ impl GenericRemoteStorage {
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
pub struct StorageMetadata(HashMap<String, String>); pub struct StorageMetadata(HashMap<String, String>);
impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
fn from(arr: [(&str, &str); N]) -> Self {
let map: HashMap<String, String> = arr
.iter()
.map(|(k, v)| (k.to_string(), v.to_string()))
.collect();
Self(map)
}
}
/// External backup storage configuration, enough for creating a client for that storage. /// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig { pub struct RemoteStorageConfig {
@@ -585,7 +609,6 @@ pub struct S3Config {
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
pub concurrency_limit: NonZeroUsize, pub concurrency_limit: NonZeroUsize,
pub max_keys_per_list_response: Option<i32>, pub max_keys_per_list_response: Option<i32>,
pub upload_storage_class: Option<StorageClass>,
} }
impl Debug for S3Config { impl Debug for S3Config {
@@ -714,18 +737,6 @@ impl RemoteStorageConfig {
endpoint, endpoint,
concurrency_limit, concurrency_limit,
max_keys_per_list_response, max_keys_per_list_response,
upload_storage_class: toml
.get("upload_storage_class")
.map(|prefix_in_bucket| -> anyhow::Result<_> {
let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
let storage_class = StorageClass::from_str(&s).expect("infallible");
#[allow(deprecated)]
if matches!(storage_class, StorageClass::Unknown(_)) {
bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
}
Ok(storage_class)
})
.transpose()?,
}) })
} }
(_, _, _, Some(_), None) => { (_, _, _, Some(_), None) => {
@@ -794,9 +805,6 @@ struct ConcurrencyLimiter {
// The helps to ensure we don't exceed the thresholds. // The helps to ensure we don't exceed the thresholds.
write: Arc<Semaphore>, write: Arc<Semaphore>,
read: Arc<Semaphore>, read: Arc<Semaphore>,
write_total: usize,
read_total: usize,
} }
impl ConcurrencyLimiter { impl ConcurrencyLimiter {
@@ -825,21 +833,10 @@ impl ConcurrencyLimiter {
Arc::clone(self.for_kind(kind)).acquire_owned().await Arc::clone(self.for_kind(kind)).acquire_owned().await
} }
fn activity(&self) -> RemoteStorageActivity {
RemoteStorageActivity {
read_available: self.read.available_permits(),
read_total: self.read_total,
write_available: self.write.available_permits(),
write_total: self.write_total,
}
}
fn new(limit: usize) -> ConcurrencyLimiter { fn new(limit: usize) -> ConcurrencyLimiter {
Self { Self {
read: Arc::new(Semaphore::new(limit)), read: Arc::new(Semaphore::new(limit)),
write: Arc::new(Semaphore::new(limit)), write: Arc::new(Semaphore::new(limit)),
read_total: limit,
write_total: limit,
} }
} }
} }

View File

@@ -5,9 +5,11 @@
//! volume is mounted to the local FS. //! volume is mounted to the local FS.
use std::{ use std::{
collections::HashSet, borrow::Cow,
future::Future,
io::ErrorKind, io::ErrorKind,
num::NonZeroU32, num::NonZeroU32,
pin::Pin,
time::{Duration, SystemTime, UNIX_EPOCH}, time::{Duration, SystemTime, UNIX_EPOCH},
}; };
@@ -20,11 +22,11 @@ use tokio::{
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
}; };
use tokio_util::{io::ReaderStream, sync::CancellationToken}; use tokio_util::{io::ReaderStream, sync::CancellationToken};
use utils::crashsafe::path_with_suffix_extension; use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
use crate::{ use crate::{
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity, Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
}; };
use super::{RemoteStorage, StorageMetadata}; use super::{RemoteStorage, StorageMetadata};
@@ -91,47 +93,7 @@ impl LocalFs {
#[cfg(test)] #[cfg(test)]
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> { async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
use std::{future::Future, pin::Pin}; Ok(get_all_files(&self.storage_root, true)
fn get_all_files<'a, P>(
directory_path: P,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
where
P: AsRef<Utf8Path> + Send + Sync + 'a,
{
Box::pin(async move {
let directory_path = directory_path.as_ref();
if directory_path.exists() {
if directory_path.is_dir() {
let mut paths = Vec::new();
let mut dir_contents = fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path =
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
anyhow::Error::msg(format!(
"non-Unicode path: {}",
pb.to_string_lossy()
))
})?;
if file_type.is_symlink() {
tracing::debug!("{entry_path:?} is a symlink, skipping")
} else if file_type.is_dir() {
paths.extend(get_all_files(&entry_path).await?.into_iter())
} else {
paths.push(entry_path);
}
}
Ok(paths)
} else {
bail!("Path {directory_path:?} is not a directory")
}
} else {
Ok(Vec::new())
}
})
}
Ok(get_all_files(&self.storage_root)
.await? .await?
.into_iter() .into_iter()
.map(|path| { .map(|path| {
@@ -158,14 +120,6 @@ impl LocalFs {
// S3 object list prefixes can be arbitrary strings, but when reading // S3 object list prefixes can be arbitrary strings, but when reading
// the local filesystem we need a directory to start calling read_dir on. // the local filesystem we need a directory to start calling read_dir on.
let mut initial_dir = full_path.clone(); let mut initial_dir = full_path.clone();
// If there's no trailing slash, we have to start looking from one above: even if
// `initial_dir` is a directory, we should still list any prefixes in the parent
// that start with the same string.
if !full_path.to_string().ends_with('/') {
initial_dir.pop();
}
loop { loop {
// Did we make it to the root? // Did we make it to the root?
if initial_dir.parent().is_none() { if initial_dir.parent().is_none() {
@@ -341,66 +295,61 @@ impl RemoteStorage for LocalFs {
let op = async { let op = async {
let mut result = Listing::default(); let mut result = Listing::default();
// Filter out directories: in S3 directories don't exist, only the keys within them do. if let ListingMode::NoDelimiter = mode {
let keys = self let keys = self
.list_recursive(prefix) .list_recursive(prefix)
.await
.map_err(DownloadError::Other)?;
result.keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
if let Some(max_keys) = max_keys {
result.keys.truncate(max_keys.get() as usize);
}
return Ok(result);
}
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await .await
.map_err(DownloadError::Other)?; .map_err(DownloadError::Other)?;
let keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
if let ListingMode::NoDelimiter = mode { // filter out empty directories to mirror s3 behavior.
result.keys = keys; for prefix in prefixes_to_filter {
} else { if prefix.is_dir()
let mut prefixes = HashSet::new(); && is_directory_empty(&prefix)
for key in keys { .await
// If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. .map_err(DownloadError::Other)?
let relative_key = if let Some(prefix) = prefix { {
let mut prefix = prefix.clone(); continue;
// We only strip the dirname of the prefix, so that when we strip it from the start of keys we }
// end up with full file/dir names.
let prefix_full_local_path = prefix.with_base(&self.storage_root); let stripped = prefix
let has_slash = prefix.0.to_string().ends_with('/'); .strip_prefix(&self.storage_root)
let strip_prefix = if prefix_full_local_path.is_dir() && has_slash { .context("Failed to strip prefix")
prefix .and_then(RemotePath::new)
} else { .expect(
prefix.0.pop(); "We list files for storage root, hence should be able to remote the prefix",
prefix );
};
if prefix.is_dir() {
RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap() result.prefixes.push(stripped);
} else { } else {
key result.keys.push(stripped);
};
let relative_key = format!("{}", relative_key);
if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
let first_part = relative_key
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
.next()
.unwrap()
.to_owned();
prefixes.insert(first_part);
} else {
result
.keys
.push(RemotePath::from_string(&relative_key).unwrap());
}
} }
result.prefixes = prefixes
.into_iter()
.map(|s| RemotePath::from_string(&s).unwrap())
.collect();
} }
if let Some(max_keys) = max_keys {
result.keys.truncate(max_keys.get() as usize);
}
Ok(result) Ok(result)
}; };
@@ -605,22 +554,56 @@ impl RemoteStorage for LocalFs {
) -> Result<(), TimeTravelError> { ) -> Result<(), TimeTravelError> {
Err(TimeTravelError::Unimplemented) Err(TimeTravelError::Unimplemented)
} }
fn activity(&self) -> RemoteStorageActivity {
// LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
RemoteStorageActivity {
read_available: 16,
read_total: 16,
write_available: 16,
write_total: 16,
}
}
} }
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
path_with_suffix_extension(original_path, "metadata") path_with_suffix_extension(original_path, "metadata")
} }
fn get_all_files<'a, P>(
directory_path: P,
recursive: bool,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
where
P: AsRef<Utf8Path> + Send + Sync + 'a,
{
Box::pin(async move {
let directory_path = directory_path.as_ref();
if directory_path.exists() {
if directory_path.is_dir() {
let mut paths = Vec::new();
let mut dir_contents = fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path =
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
anyhow::Error::msg(format!(
"non-Unicode path: {}",
pb.to_string_lossy()
))
})?;
if file_type.is_symlink() {
debug!("{entry_path:?} is a symlink, skipping")
} else if file_type.is_dir() {
if recursive {
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
} else {
paths.push(entry_path)
}
} else {
paths.push(entry_path);
}
}
Ok(paths)
} else {
bail!("Path {directory_path:?} is not a directory")
}
} else {
Ok(Vec::new())
}
})
}
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> { async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
let target_dir = match target_file_path.parent() { let target_dir = match target_file_path.parent() {
Some(parent_dir) => parent_dir, Some(parent_dir) => parent_dir,
@@ -940,18 +923,13 @@ mod fs_tests {
// No delimiter: should recursively list everything // No delimiter: should recursively list everything
let (storage, cancel) = create_storage()?; let (storage, cancel) = create_storage()?;
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?; let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
let child_sibling =
upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?; let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
let listing = storage let listing = storage
.list(None, ListingMode::NoDelimiter, None, &cancel) .list(None, ListingMode::NoDelimiter, None, &cancel)
.await?; .await?;
assert!(listing.prefixes.is_empty()); assert!(listing.prefixes.is_empty());
assert_eq!( assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
listing.keys.into_iter().collect::<HashSet<_>>(),
HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
);
// Delimiter: should only go one deep // Delimiter: should only go one deep
let listing = storage let listing = storage
@@ -964,25 +942,7 @@ mod fs_tests {
); );
assert!(listing.keys.is_empty()); assert!(listing.keys.is_empty());
// Delimiter & prefix with a trailing slash // Delimiter & prefix
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(
listing.keys,
[RemotePath::from_string("uncle").unwrap()].to_vec()
);
assert_eq!(
listing.prefixes,
[RemotePath::from_string("parent").unwrap()].to_vec()
);
// Delimiter and prefix without a trailing slash
let listing = storage let listing = storage
.list( .list(
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -991,66 +951,12 @@ mod fs_tests {
&cancel, &cancel,
) )
.await?; .await?;
assert_eq!(listing.keys, [].to_vec());
assert_eq!( assert_eq!(
listing.prefixes, listing.prefixes,
[RemotePath::from_string("grandparent").unwrap()].to_vec() [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
); .to_vec()
// Delimiter and prefix that's partway through a path component
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(listing.keys, [].to_vec());
assert_eq!(
listing.prefixes,
[RemotePath::from_string("grandparent").unwrap()].to_vec()
);
Ok(())
}
#[tokio::test]
async fn list_part_component() -> anyhow::Result<()> {
// No delimiter: should recursively list everything
let (storage, cancel) = create_storage()?;
// Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
// of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
// a freeform prefix.
let _child_a =
upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
let _child_b =
upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
// Delimiter and prefix that's partway through a path component
let listing = storage
.list(
Some(
&RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(listing.keys, [].to_vec());
let mut found_prefixes = listing.prefixes.clone();
found_prefixes.sort();
assert_eq!(
found_prefixes,
[
RemotePath::from_string("tenant").unwrap(),
RemotePath::from_string("tenant-01").unwrap(),
]
.to_vec()
); );
assert_eq!(listing.keys, [uncle.clone()].to_vec());
Ok(()) Ok(())
} }

View File

@@ -27,10 +27,10 @@ use aws_config::{
}; };
use aws_credential_types::provider::SharedCredentialsProvider; use aws_credential_types::provider::SharedCredentialsProvider;
use aws_sdk_s3::{ use aws_sdk_s3::{
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
error::SdkError, error::SdkError,
operation::get_object::GetObjectError, operation::get_object::GetObjectError,
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
Client, Client,
}; };
use aws_smithy_async::rt::sleep::TokioSleep; use aws_smithy_async::rt::sleep::TokioSleep;
@@ -47,8 +47,8 @@ use utils::backoff;
use super::StorageMetadata; use super::StorageMetadata;
use crate::{ use crate::{
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config, Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
}; };
pub(super) mod metrics; pub(super) mod metrics;
@@ -62,7 +62,6 @@ pub struct S3Bucket {
bucket_name: String, bucket_name: String,
prefix_in_bucket: Option<String>, prefix_in_bucket: Option<String>,
max_keys_per_list_response: Option<i32>, max_keys_per_list_response: Option<i32>,
upload_storage_class: Option<StorageClass>,
concurrency_limiter: ConcurrencyLimiter, concurrency_limiter: ConcurrencyLimiter,
// Per-request timeout. Accessible for tests. // Per-request timeout. Accessible for tests.
pub timeout: Duration, pub timeout: Duration,
@@ -75,13 +74,13 @@ struct GetObjectRequest {
} }
impl S3Bucket { impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> { pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
tracing::debug!( tracing::debug!(
"Creating s3 remote storage for S3 bucket {}", "Creating s3 remote storage for S3 bucket {}",
remote_storage_config.bucket_name aws_config.bucket_name
); );
let region = Some(Region::new(remote_storage_config.bucket_region.clone())); let region = Some(Region::new(aws_config.bucket_region.clone()));
let provider_conf = ProviderConfig::without_region().with_region(region.clone()); let provider_conf = ProviderConfig::without_region().with_region(region.clone());
@@ -113,38 +112,6 @@ impl S3Bucket {
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new()); let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults(
#[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
BehaviorVersion::v2023_11_09(),
)
.region(region)
.identity_cache(IdentityCache::lazy().build())
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
s.spawn(|| {
// TODO: make this function async.
tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap()
.block_on(sdk_config_loader.load())
})
.join()
.unwrap()
});
let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config);
// Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions.
// (In case we ever re-use the `sdk_config` for more than just the S3 client in the future)
if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() {
s3_config_builder = s3_config_builder
.endpoint_url(custom_endpoint)
.force_path_style(true);
}
// We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
// responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
// attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
@@ -152,36 +119,41 @@ impl S3Bucket {
retry_config retry_config
.set_max_attempts(Some(1)) .set_max_attempts(Some(1))
.set_mode(Some(RetryMode::Adaptive)); .set_mode(Some(RetryMode::Adaptive));
s3_config_builder = s3_config_builder.retry_config(retry_config.build());
let s3_config = s3_config_builder.build(); let mut config_builder = Builder::default()
let client = aws_sdk_s3::Client::from_conf(s3_config); .behavior_version(BehaviorVersion::v2023_11_09())
.region(region)
.identity_cache(IdentityCache::lazy().build())
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
.retry_config(retry_config.build())
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
let prefix_in_bucket = remote_storage_config if let Some(custom_endpoint) = aws_config.endpoint.clone() {
.prefix_in_bucket config_builder = config_builder
.as_deref() .endpoint_url(custom_endpoint)
.map(|prefix| { .force_path_style(true);
let mut prefix = prefix; }
while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix = &prefix[1..]
}
let mut prefix = prefix.to_string(); let client = Client::from_conf(config_builder.build());
while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix.pop();
}
prefix
});
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
let mut prefix = prefix;
while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix = &prefix[1..]
}
let mut prefix = prefix.to_string();
while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix.pop();
}
prefix
});
Ok(Self { Ok(Self {
client, client,
bucket_name: remote_storage_config.bucket_name.clone(), bucket_name: aws_config.bucket_name.clone(),
max_keys_per_list_response: remote_storage_config.max_keys_per_list_response, max_keys_per_list_response: aws_config.max_keys_per_list_response,
prefix_in_bucket, prefix_in_bucket,
concurrency_limiter: ConcurrencyLimiter::new( concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
remote_storage_config.concurrency_limit.get(),
),
upload_storage_class: remote_storage_config.upload_storage_class.clone(),
timeout, timeout,
}) })
} }
@@ -206,7 +178,10 @@ impl S3Bucket {
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
let path_string = path.get_path().as_str(); let path_string = path
.get_path()
.as_str()
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
match &self.prefix_in_bucket { match &self.prefix_in_bucket {
Some(prefix) => prefix.clone() + "/" + path_string, Some(prefix) => prefix.clone() + "/" + path_string,
None => path_string.to_string(), None => path_string.to_string(),
@@ -496,11 +471,16 @@ impl RemoteStorage for S3Bucket {
// get the passed prefix or if it is not set use prefix_in_bucket value // get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix let list_prefix = prefix
.map(|p| self.relative_path_to_s3_object(p)) .map(|p| self.relative_path_to_s3_object(p))
.or_else(|| { .or_else(|| self.prefix_in_bucket.clone())
self.prefix_in_bucket.clone().map(|mut s| { .map(|mut p| {
s.push(REMOTE_STORAGE_PREFIX_SEPARATOR); // required to end with a separator
s // otherwise request will return only the entry of a prefix
}) if matches!(mode, ListingMode::WithDelimiter)
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
}
p
}); });
let _permit = self.permit(kind, cancel).await?; let _permit = self.permit(kind, cancel).await?;
@@ -569,15 +549,11 @@ impl RemoteStorage for S3Bucket {
} }
} }
// S3 gives us prefixes like "foo/", we return them like "foo" result.prefixes.extend(
result.prefixes.extend(prefixes.iter().filter_map(|o| { prefixes
Some( .iter()
self.s3_object_to_relative_path( .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
o.prefix()? );
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
),
)
}));
continuation_token = match response.next_continuation_token { continuation_token = match response.next_continuation_token {
Some(new_token) => Some(new_token), Some(new_token) => Some(new_token),
@@ -610,7 +586,6 @@ impl RemoteStorage for S3Bucket {
.bucket(self.bucket_name.clone()) .bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to)) .key(self.relative_path_to_s3_object(to))
.set_metadata(metadata.map(|m| m.0)) .set_metadata(metadata.map(|m| m.0))
.set_storage_class(self.upload_storage_class.clone())
.content_length(from_size_bytes.try_into()?) .content_length(from_size_bytes.try_into()?)
.body(bytes_stream) .body(bytes_stream)
.send(); .send();
@@ -662,7 +637,6 @@ impl RemoteStorage for S3Bucket {
.copy_object() .copy_object()
.bucket(self.bucket_name.clone()) .bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to)) .key(self.relative_path_to_s3_object(to))
.set_storage_class(self.upload_storage_class.clone())
.copy_source(copy_source) .copy_source(copy_source)
.send(); .send();
@@ -920,7 +894,6 @@ impl RemoteStorage for S3Bucket {
.copy_object() .copy_object()
.bucket(self.bucket_name.clone()) .bucket(self.bucket_name.clone())
.key(key) .key(key)
.set_storage_class(self.upload_storage_class.clone())
.copy_source(&source_id) .copy_source(&source_id)
.send(); .send();
@@ -975,10 +948,6 @@ impl RemoteStorage for S3Bucket {
} }
Ok(()) Ok(())
} }
fn activity(&self) -> RemoteStorageActivity {
self.concurrency_limiter.activity()
}
} }
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
@@ -1081,22 +1050,22 @@ mod tests {
Some("/test/prefix/"), Some("/test/prefix/"),
]; ];
let expected_outputs = [ let expected_outputs = [
vec!["", "some/path", "some/path/"], vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path/"], vec!["/", "/some/path", "/some/path"],
vec![ vec![
"test/prefix/", "test/prefix/",
"test/prefix/some/path", "test/prefix/some/path",
"test/prefix/some/path/", "test/prefix/some/path",
], ],
vec![ vec![
"test/prefix/", "test/prefix/",
"test/prefix/some/path", "test/prefix/some/path",
"test/prefix/some/path/", "test/prefix/some/path",
], ],
vec![ vec![
"test/prefix/", "test/prefix/",
"test/prefix/some/path", "test/prefix/some/path",
"test/prefix/some/path/", "test/prefix/some/path",
], ],
]; ];
@@ -1108,7 +1077,6 @@ mod tests {
endpoint: None, endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(), concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: Some(5), max_keys_per_list_response: Some(5),
upload_storage_class: None,
}; };
let storage = let storage =
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");

View File

@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;
use crate::{ use crate::{
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
RemoteStorageActivity, StorageMetadata, TimeTravelError, StorageMetadata, TimeTravelError,
}; };
pub struct UnreliableWrapper { pub struct UnreliableWrapper {
@@ -107,6 +107,27 @@ impl UnreliableWrapper {
type VoidStorage = crate::LocalFs; type VoidStorage = crate::LocalFs;
impl RemoteStorage for UnreliableWrapper { impl RemoteStorage for UnreliableWrapper {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
.map_err(DownloadError::Other)?;
self.inner.list_prefixes(prefix, cancel).await
}
async fn list_files(
&self,
folder: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
.map_err(DownloadError::Other)?;
self.inner.list_files(folder, max_keys, cancel).await
}
async fn list( async fn list(
&self, &self,
prefix: Option<&RemotePath>, prefix: Option<&RemotePath>,
@@ -213,8 +234,4 @@ impl RemoteStorage for UnreliableWrapper {
.time_travel_recover(prefix, timestamp, done_if_after, cancel) .time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await .await
} }
fn activity(&self) -> RemoteStorageActivity {
self.inner.activity()
}
} }

Some files were not shown because too many files have changed in this diff Show More