Compare commits

..

5 Commits

Author SHA1 Message Date
Vlad Lazar
10df5a650b wip 2024-02-01 16:26:43 +00:00
Vlad Lazar
d37b99c7b1 pageserver: lift ancestor timeline logic from read path
When the read path needs to follow a key into the ancestor timeline,
it needs to wait for said ancestor to become active and aware of
it's branching lsn. The logic is lifted into a separate function with
it's own new error type.

This is done because the vectored read path needs the same logic.
It's also the reason for the newly introduced error type.
2024-02-01 16:26:22 +00:00
Vlad Lazar
4c6627f3d9 pageserver_api: remove overlaps from KeySpace
This commit adds a function to `KeySpace` which updates a key key space
by removing all overlaps with a second key space. This can involve
splitting or removing of existing ranges.

The implementation is not particularly efficient: O(M * N * log(N))
where N is the number of ranges in the current key space and M is the
number of ranges in the key space we are checking against. In practice,
this shouldn't matter much since, in the short term, the only caller
of this function will be the vectored read path and the number of key
spaces invovled will be small. This follows from the upper bound placed
on the number of keys accepted by the vectored read path.

A couple other small utility functions are added. They'll be used by the
vectored search path as well.
2024-01-31 10:15:40 +00:00
Vlad Lazar
ae77e5355c pagseserver: add range layer map lookup
Use a two pointer algorithm to collect all the layers that intersect
a given range. The trailing pointer tracks the start of the current
range (current location in the key space) and the forward pointer
tracks the next coverage change.

The result of the collection is an map from layer to range which is
ordered by LSN. The ordering will come in handy to the rest of the read
path for figuring out where to continue from.
2024-01-25 12:28:32 +00:00
Vlad Lazar
493760bb7a pageserver: add ranged layer coverage for all intersected layers 2024-01-25 10:58:30 +00:00
131 changed files with 3341 additions and 5443 deletions

View File

@@ -69,15 +69,7 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
kaniko-arm:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -93,15 +85,7 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
manifest:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -115,10 +99,7 @@ jobs:
steps:
- name: Create manifest
run: |
docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
- name: Push manifest
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}

View File

@@ -21,8 +21,6 @@ env:
COPT: '-Werror'
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
jobs:
check-permissions:
@@ -46,20 +44,6 @@ jobs:
exit 1
cancel-previous-e2e-tests:
needs: [ check-permissions ]
if: github.event_name == 'pull_request'
runs-on: ubuntu-latest
steps:
- name: Cancel previous e2e-tests runs for this PR
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
gh workflow --repo neondatabase/cloud \
run cancel-previous-in-concurrency-group.yml \
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
tag:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, small ]
@@ -202,11 +186,7 @@ jobs:
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# Raise locked memory limit for tokio-epoll-uring.
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
# io_uring will account the memory of the CQ and SQ as locked.
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
options: --init
strategy:
fail-fast: false
matrix:
@@ -360,12 +340,8 @@ jobs:
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run rust tests
env:
NEXTEST_RETRIES: 3
run: |
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
@@ -443,8 +419,8 @@ jobs:
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
# Default shared memory is 64mb
options: --init --shm-size=512mb
strategy:
fail-fast: false
matrix:
@@ -472,7 +448,6 @@ jobs:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
- name: Merge and upload coverage data
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
@@ -483,13 +458,12 @@ jobs:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
# Default shared memory is 64mb
options: --init --shm-size=512mb
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
strategy:
fail-fast: false
matrix:
# the amount of groups (N) should be reflected in `extra_params: --splits N ...`
pytest_split_group: [ 1, 2, 3, 4 ]
build_type: [ release ]
steps:
@@ -503,12 +477,11 @@ jobs:
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ github.ref_name == 'main' }}
extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
@@ -722,8 +695,7 @@ jobs:
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
}
}"

View File

@@ -124,12 +124,12 @@ jobs:
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
CARGO_FEATURES: --features testing
CARGO_FLAGS: --release
CARGO_FLAGS: --locked --release
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -210,20 +210,18 @@ jobs:
- name: Run cargo build
run: |
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run cargo test
env:
NEXTEST_RETRIES: 3
run: |
cargo nextest run $CARGO_FEATURES
cargo test $CARGO_FLAGS $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_s3
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
@@ -233,7 +231,7 @@ jobs:
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_azure
cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
check-codestyle-rust-arm:
timeout-minutes: 90

View File

@@ -20,51 +20,111 @@ defaults:
run:
shell: bash -euo pipefail {0}
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
permissions: {}
jobs:
tag-image:
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
env:
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
outputs:
next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
steps:
- name: Install Crane & ECR helper
run: |
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Get source image digest
id: next-digest
run: |
NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
if [ -z "${NEXT_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
exit 1
fi
echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
- name: Get destination image digest (if already exists)
id: prev-digest
run: |
PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
if [ -z "${PREV_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
else
echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
fi
- name: Tag image
run: |
crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
rollback-tag-image:
needs: tag-image
if: ${{ !success() }}
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
env:
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
steps:
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
- name: Install Crane & ECR helper
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v2
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- uses: actions/setup-go@v5
with:
go-version: '1.21'
- name: Install crane
- name: Configure ECR login
run: |
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Copy images
- name: Restore previous tag if needed
run: |
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
if [ -z "${NEXT_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
exit 0
fi
if [ -z "${PREV_DIGEST}" ]; then
# I guess we should delete the tag here/untag the image, but crane does not support it
# - https://github.com/google/go-containerregistry/issues/999
echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
exit 0
fi
CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
else
echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
fi

150
Cargo.lock generated
View File

@@ -278,7 +278,6 @@ dependencies = [
"camino",
"clap",
"control_plane",
"diesel",
"futures",
"git-version",
"hyper",
@@ -287,6 +286,7 @@ dependencies = [
"pageserver_client",
"postgres_backend",
"postgres_connection",
"scopeguard",
"serde",
"serde_json",
"thiserror",
@@ -1328,8 +1328,6 @@ dependencies = [
"clap",
"comfy-table",
"compute_api",
"diesel",
"diesel_migrations",
"futures",
"git-version",
"hex",
@@ -1344,7 +1342,6 @@ dependencies = [
"regex",
"reqwest",
"safekeeper_api",
"scopeguard",
"serde",
"serde_json",
"serde_with",
@@ -1640,52 +1637,6 @@ dependencies = [
"rusticata-macros",
]
[[package]]
name = "diesel"
version = "2.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8"
dependencies = [
"bitflags 2.4.1",
"byteorder",
"diesel_derives",
"itoa",
"pq-sys",
"serde_json",
]
[[package]]
name = "diesel_derives"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44"
dependencies = [
"diesel_table_macro_syntax",
"proc-macro2",
"quote",
"syn 2.0.32",
]
[[package]]
name = "diesel_migrations"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac"
dependencies = [
"diesel",
"migrations_internals",
"migrations_macros",
]
[[package]]
name = "diesel_table_macro_syntax"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
dependencies = [
"syn 2.0.32",
]
[[package]]
name = "digest"
version = "0.10.7"
@@ -2612,16 +2563,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "io-uring"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460648e47a07a43110fbfa2e0b14afb2be920093c31e5dccc50e49568e099762"
dependencies = [
"bitflags 1.3.2",
"libc",
]
[[package]]
name = "ipnet"
version = "2.9.0"
@@ -2736,12 +2677,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "libm"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linux-raw-sys"
version = "0.1.4"
@@ -2838,33 +2773,9 @@ dependencies = [
"libc",
"once_cell",
"prometheus",
"rand 0.8.5",
"rand_distr",
"twox-hash",
"workspace_hack",
]
[[package]]
name = "migrations_internals"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada"
dependencies = [
"serde",
"toml",
]
[[package]]
name = "migrations_macros"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08"
dependencies = [
"migrations_internals",
"proc-macro2",
"quote",
]
[[package]]
name = "mime"
version = "0.3.17"
@@ -3066,7 +2977,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
dependencies = [
"autocfg",
"libm",
]
[[package]]
@@ -3451,7 +3361,6 @@ dependencies = [
"tenant_size_model",
"thiserror",
"tokio",
"tokio-epoll-uring",
"tokio-io-timeout",
"tokio-postgres",
"tokio-stream",
@@ -3477,6 +3386,7 @@ dependencies = [
"enum-map",
"hex",
"humantime-serde",
"itertools",
"postgres_ffi",
"rand 0.8.5",
"serde",
@@ -3874,15 +3784,6 @@ version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "pq-sys"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd"
dependencies = [
"vcpkg",
]
[[package]]
name = "pq_proto"
version = "0.1.0"
@@ -4181,16 +4082,6 @@ dependencies = [
"getrandom 0.2.11",
]
[[package]]
name = "rand_distr"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
dependencies = [
"num-traits",
"rand 0.8.5",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
@@ -5223,9 +5114,9 @@ checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
[[package]]
name = "smol_str"
version = "0.2.1"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49"
checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c"
dependencies = [
"serde",
]
@@ -5492,18 +5383,18 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.47"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f"
checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.47"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b"
checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
dependencies = [
"proc-macro2",
"quote",
@@ -5627,21 +5518,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
dependencies = [
"futures",
"once_cell",
"scopeguard",
"thiserror",
"tokio",
"tokio-util",
"tracing",
"uring-common",
]
[[package]]
name = "tokio-io-timeout"
version = "1.2.0"
@@ -6151,15 +6027,6 @@ dependencies = [
"webpki-roots 0.23.1",
]
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
dependencies = [
"io-uring",
"libc",
]
[[package]]
name = "url"
version = "2.3.1"
@@ -6721,7 +6588,6 @@ dependencies = [
"clap",
"clap_builder",
"crossbeam-utils",
"diesel",
"either",
"fail",
"futures-channel",

View File

@@ -151,7 +151,6 @@ test-context = "0.1"
thiserror = "1.0"
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.10.0"
tokio-rustls = "0.24"
@@ -165,7 +164,6 @@ tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.20.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"

View File

@@ -53,7 +53,6 @@ RUN set -e \
--bin pagectl \
--bin safekeeper \
--bin storage_broker \
--bin attachment_service \
--bin proxy \
--bin neon_local \
--locked --release \
@@ -81,7 +80,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin

View File

@@ -52,7 +52,7 @@ RUN cd postgres && \
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
# In vanilla postgres this function is limited to Postgres role superuser.
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
# so we do it here.
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
@@ -63,14 +63,14 @@ RUN cd postgres && \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
fi; \
done; \
# the second loop is for pg_stat_statement extension versions >= 1.7,
# the second loop is for pg_stat_statement extension versions >= 1.7,
# where pg_stat_statement_reset() got 3 additional arguments
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
filename=$(basename "$file"); \
if ! echo "$old_list" | grep -q -F "$filename"; then \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
fi; \
done
done
#########################################################################################
#
@@ -144,23 +144,30 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
FROM build-deps AS plv8-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
RUN apt update && \
apt install -y ninja-build python3-dev libncurses5 binutils clang
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PLV8_VERSION=3.1.5 \
export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
;; \
"v16") \
export PLV8_VERSION=3.1.8 \
export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
;; \
*) \
echo "Export the valid PG_VERSION variable" && exit 1 \
;; \
esac && \
wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
# generate and copy upgrade scripts
mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
cp upgrade/* /usr/local/pgsql/share/extension/ && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
# don't break computes with installed old version of plv8
cd /usr/local/pgsql/lib/ && \
ln -s plv8-3.1.10.so plv8-3.1.5.so && \
ln -s plv8-3.1.10.so plv8-3.1.8.so && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
@@ -520,7 +527,8 @@ RUN apt-get update && \
libboost-regex1.74-dev \
libboost-serialization1.74-dev \
libboost-system1.74-dev \
libeigen3-dev
libeigen3-dev \
libfreetype6-dev
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
@@ -545,8 +553,6 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
-D RDK_INSTALL_INTREE=OFF \
-D RDK_INSTALL_COMIC_FONTS=OFF \
-D RDK_BUILD_FREETYPE_SUPPORT=OFF \
-D CMAKE_BUILD_TYPE=Release \
. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -901,7 +907,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost* for rdkit
# libboost*, libfreetype6, and zlib1g for rdkit
# ca-certificates for communicating with s3 by compute_ctl
RUN apt update && \
apt install --no-install-recommends -y \
@@ -914,6 +920,7 @@ RUN apt update && \
libboost-serialization1.74.0 \
libboost-system1.74.0 \
libossp-uuid16 \
libfreetype6 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
@@ -925,6 +932,7 @@ RUN apt update && \
libcurl4-openssl-dev \
locales \
procps \
zlib1g \
ca-certificates && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

View File

@@ -10,8 +10,6 @@ async-trait.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true
diesel = { version = "2.1.4", features = ["postgres"]}
diesel_migrations = { version = "2.1.0", features = ["postgres"]}
futures.workspace = true
git-version.workspace = true
nix.workspace = true
@@ -21,7 +19,6 @@ hex.workspace = true
hyper.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true

View File

@@ -14,6 +14,7 @@ hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
@@ -25,8 +26,6 @@ tracing.workspace = true
# a parsing function when loading pageservers from neon_local LocalEnv
postgres_backend.workspace = true
diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }
control_plane = { path = ".." }

View File

@@ -1,6 +0,0 @@
-- This file was automatically created by Diesel to setup helper functions
-- and other internal bookkeeping. This file is safe to edit, any future
-- changes will be added to existing projects as new migrations.
DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
DROP FUNCTION IF EXISTS diesel_set_updated_at();

View File

@@ -1,36 +0,0 @@
-- This file was automatically created by Diesel to setup helper functions
-- and other internal bookkeeping. This file is safe to edit, any future
-- changes will be added to existing projects as new migrations.
-- Sets up a trigger for the given table to automatically set a column called
-- `updated_at` whenever the row is modified (unless `updated_at` was included
-- in the modified columns)
--
-- # Example
--
-- ```sql
-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
--
-- SELECT diesel_manage_updated_at('users');
-- ```
CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
BEGIN
EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
BEGIN
IF (
NEW IS DISTINCT FROM OLD AND
NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
) THEN
NEW.updated_at := current_timestamp;
END IF;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;

View File

@@ -1,12 +0,0 @@
CREATE TABLE tenant_shards (
tenant_id VARCHAR NOT NULL,
shard_number INTEGER NOT NULL,
shard_count INTEGER NOT NULL,
PRIMARY KEY(tenant_id, shard_number, shard_count),
shard_stripe_size INTEGER NOT NULL,
generation INTEGER NOT NULL,
generation_pageserver BIGINT NOT NULL,
placement_policy VARCHAR NOT NULL,
-- config is JSON encoded, opaque to the database.
config TEXT NOT NULL
);

View File

@@ -1,10 +0,0 @@
CREATE TABLE nodes (
node_id BIGINT PRIMARY KEY NOT NULL,
scheduling_policy VARCHAR NOT NULL,
listen_http_addr VARCHAR NOT NULL,
listen_http_port INTEGER NOT NULL,
listen_pg_addr VARCHAR NOT NULL,
listen_pg_port INTEGER NOT NULL
);

View File

@@ -1,5 +1,5 @@
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use crate::service::Service;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
@@ -104,34 +104,34 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
json_response(StatusCode::OK, state.service.inspect(inspect_req))
}
async fn handle_tenant_create(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
}
async fn handle_tenant_timeline_create(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
service
state.service.tenant_create(create_req).await?,
)
}
async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.tenant_timeline_create(tenant_id, create_req)
.await?,
)
}
async fn handle_tenant_locate(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
let state = get_state(&req);
json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -154,15 +154,14 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
}
async fn handle_tenant_shard_migrate(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
service
state
.service
.tenant_shard_migrate(tenant_shard_id, migrate_req)
.await?,
)
@@ -179,35 +178,6 @@ impl From<ReconcileError> for ApiError {
}
}
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
/// be allowed to run if Service has finished its initial reconciliation.
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
where
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
{
let state = get_state(&request);
let service = state.service.clone();
let startup_complete = service.startup_complete.clone();
if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
.await
.is_err()
{
// This shouldn't happen: it is the responsibilty of [`Service::startup_reconcile`] to use appropriate
// timeouts around its remote calls, to bound its runtime.
return Err(ApiError::Timeout(
"Timed out waiting for service readiness".into(),
));
}
request_span(
request,
|request| async move { handler(service, request).await },
)
.await
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
@@ -235,20 +205,14 @@ pub fn make_router(
.put("/node/:node_id/config", |r| {
request_span(r, handle_node_configure)
})
.post("/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_create)
})
.post("/v1/tenant/:tenant_id/timeline", |r| {
tenant_service_handler(r, handle_tenant_timeline_create)
.post("/tenant", |r| request_span(r, handle_tenant_create))
.post("/tenant/:tenant_id/timeline", |r| {
request_span(r, handle_tenant_timeline_create)
})
.get("/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
request_span(r, handle_tenant_locate)
})
.put("/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(r, handle_tenant_shard_migrate)
request_span(r, handle_tenant_shard_migrate)
})
// Path aliases for tests_forward_compatibility
// TODO: remove these in future PR
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
}

View File

@@ -7,7 +7,6 @@ mod node;
pub mod persistence;
mod reconciler;
mod scheduler;
mod schema;
pub mod service;
mod tenant_state;

View File

@@ -12,9 +12,9 @@ use camino::Utf8PathBuf;
use clap::Parser;
use metrics::launch_timestamp::LaunchTimestamp;
use std::sync::Arc;
use tokio::signal::unix::SignalKind;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};
use utils::{project_build_tag, project_git_version, tcp_listener};
@@ -40,10 +40,6 @@ struct Cli {
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: Utf8PathBuf,
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
#[arg(long)]
database_url: String,
}
#[tokio::main]
@@ -70,14 +66,9 @@ async fn main() -> anyhow::Result<()> {
jwt_token: args.jwt_token,
};
let json_path = if args.path.as_os_str().is_empty() {
None
} else {
Some(args.path)
};
let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
let persistence = Arc::new(Persistence::spawn(&args.path).await);
let service = Service::spawn(config, persistence.clone()).await?;
let service = Service::spawn(config, persistence).await?;
let http_listener = tcp_listener::bind(args.listen)?;
@@ -90,31 +81,20 @@ async fn main() -> anyhow::Result<()> {
let router = make_router(service, auth)
.build()
.map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
tracing::info!("Serving on {0}", args.listen);
tokio::task::spawn(server);
// Wait until we receive a signal
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?;
tokio::select! {
_ = sigint.recv() => {},
_ = sigterm.recv() => {},
_ = sigquit.recv() => {},
}
tracing::info!("Terminating on signal");
if json_path.is_some() {
// Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
// full postgres dumps around.
if let Err(e) = persistence.write_tenants_json().await {
tracing::error!("Failed to write JSON on shutdown: {e}")
ShutdownSignals::handle(|signal| match signal {
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
tracing::info!("Got {}. Terminating", signal.name());
// We're just a test helper: no graceful shutdown.
std::process::exit(0);
}
}
})?;
std::process::exit(0);
Ok(())
}

View File

@@ -1,8 +1,6 @@
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;
use crate::persistence::NodePersistence;
#[derive(Clone)]
pub(crate) struct Node {
pub(crate) id: NodeId,
@@ -36,15 +34,4 @@ impl Node {
NodeSchedulingPolicy::Pause => false,
}
}
pub(crate) fn to_persistent(&self) -> NodePersistence {
NodePersistence {
node_id: self.id.0 as i64,
scheduling_policy: self.scheduling.into(),
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port as i32,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port as i32,
}
}
}

View File

@@ -1,161 +1,182 @@
use std::collections::HashMap;
use std::str::FromStr;
use std::{collections::HashMap, str::FromStr};
use camino::Utf8Path;
use camino::Utf8PathBuf;
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use camino::{Utf8Path, Utf8PathBuf};
use control_plane::{
attachment_service::{NodeAvailability, NodeSchedulingPolicy},
local_env::LocalEnv,
};
use pageserver_api::{
models::TenantConfig,
shard::{ShardCount, ShardNumber, TenantShardId},
};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use utils::generation::Generation;
use utils::id::{NodeId, TenantId};
use tracing::info;
use utils::{
generation::Generation,
id::{NodeId, TenantId},
};
use crate::node::Node;
use crate::PlacementPolicy;
use crate::{node::Node, PlacementPolicy};
/// ## What do we store?
///
/// The attachment service does not store most of its state durably.
///
/// The essential things to store durably are:
/// - generation numbers, as these must always advance monotonically to ensure data safety.
/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
/// - Node's scheduling policies, as the source of truth for these is something external.
///
/// Other things we store durably as an implementation detail:
/// - Node's host/port: this could be avoided it we made nodes emit a self-registering heartbeat,
/// but it is operationally simpler to make this service the authority for which nodes
/// it talks to.
///
/// ## Performance/efficiency
///
/// The attachment service does not go via the database for most things: there are
/// a couple of places where we must, and where efficiency matters:
/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
/// before it can attach a tenant, so this acts as a bound on how fast things like
/// failover can happen.
/// - Pageserver re-attach: we will increment many shards' generations when this happens,
/// so it is important to avoid e.g. issuing O(N) queries.
///
/// Database calls relating to nodes have low performance requirements, as they are very rarely
/// updated, and reads of nodes are always from memory, not the database. We only require that
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
/// Placeholder for storage. This will be replaced with a database client.
pub struct Persistence {
database_url: String,
// In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of
// test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
// compatible just yet.
json_path: Option<Utf8PathBuf>,
inner: std::sync::Mutex<Inner>,
}
struct Inner {
state: PersistentState,
write_queue_tx: tokio::sync::mpsc::UnboundedSender<PendingWrite>,
}
/// Legacy format, for use in JSON compat objects in test environment
#[derive(Serialize, Deserialize)]
struct JsonPersistence {
struct PersistentState {
tenants: HashMap<TenantShardId, TenantShardPersistence>,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum DatabaseError {
#[error(transparent)]
Query(#[from] diesel::result::Error),
#[error(transparent)]
Connection(#[from] diesel::result::ConnectionError),
#[error("Logical error: {0}")]
Logical(String),
struct PendingWrite {
bytes: Vec<u8>,
done_tx: tokio::sync::oneshot::Sender<()>,
}
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
impl PersistentState {
async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
let bytes = tokio::fs::read(path).await?;
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
for (tenant_id, tenant) in &mut decoded.tenants {
// Backward compat: an old attachments.json from before PR #6251, replace
// empty strings with proper defaults.
if tenant.tenant_id.is_empty() {
tenant.tenant_id = format!("{}", tenant_id);
tenant.config = serde_json::to_string(&TenantConfig::default())?;
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
}
}
Ok(decoded)
}
async fn load_or_new(path: &Utf8Path) -> Self {
match Self::load(path).await {
Ok(s) => {
tracing::info!("Loaded state file at {}", path);
s
}
Err(e)
if e.downcast_ref::<std::io::Error>()
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
.unwrap_or(false) =>
{
tracing::info!("Will create state file at {}", path);
Self {
tenants: HashMap::new(),
}
}
Err(e) => {
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
}
}
}
}
impl Persistence {
pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
pub async fn spawn(path: &Utf8Path) -> Self {
let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
let state = PersistentState::load_or_new(path).await;
tokio::spawn(Self::writer_task(rx, path.to_owned()));
Self {
database_url,
json_path,
inner: std::sync::Mutex::new(Inner {
state,
write_queue_tx: tx,
}),
}
}
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
where
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let database_url = self.database_url.clone();
tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
// TODO: connection pooling, such as via diesel::r2d2
let mut conn = PgConnection::establish(&database_url)?;
func(&mut conn)
})
.await
.expect("Task panic")
}
/// When a node is first registered, persist it before using it for anything
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
let np = node.to_persistent();
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::insert_into(crate::schema::nodes::table)
.values(&np)
.execute(conn)?;
Ok(())
})
.await
}
/// At startup, populate the list of nodes which our shards may be placed on
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
let nodes: Vec<Node> = self
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table
.load::<NodePersistence>(conn)?
.into_iter()
.map(|n| Node {
id: NodeId(n.node_id as u64),
// At startup we consider a node offline until proven otherwise.
availability: NodeAvailability::Offline,
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
.expect("Bad scheduling policy in DB"),
listen_http_addr: n.listen_http_addr,
listen_http_port: n.listen_http_port as u16,
listen_pg_addr: n.listen_pg_addr,
listen_pg_port: n.listen_pg_port as u16,
async fn writer_task(
mut rx: tokio::sync::mpsc::UnboundedReceiver<PendingWrite>,
path: Utf8PathBuf,
) {
scopeguard::defer! {
info!("persistence writer task exiting");
};
loop {
match rx.recv().await {
Some(write) => {
tokio::task::spawn_blocking({
let path = path.clone();
move || {
let tmp_path =
utils::crashsafe::path_with_suffix_extension(&path, "___new");
utils::crashsafe::overwrite(&path, &tmp_path, &write.bytes)
}
})
.collect::<Vec<Node>>())
})
.await?;
if nodes.is_empty() {
return self.list_nodes_local_env().await;
.await
.expect("spawn_blocking")
.expect("write file");
let _ = write.done_tx.send(()); // receiver may lose interest any time
}
None => {
return;
}
}
}
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
Ok(nodes)
}
/// Shim for automated compatibility tests: load nodes from LocalEnv instead of database
pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult<Vec<Node>> {
// Enable test_backward_compatibility to work by populating our list of
/// Perform a modification on our [`PersistentState`].
/// Return a future that completes once our modification has been persisted.
/// The output of the future is the return value of the `txn`` closure.
async fn mutating_transaction<F, R>(&self, txn: F) -> R
where
F: FnOnce(&mut PersistentState) -> R,
{
let (ret, done_rx) = {
let mut inner = self.inner.lock().unwrap();
let ret = txn(&mut inner.state);
let (done_tx, done_rx) = tokio::sync::oneshot::channel();
let write = PendingWrite {
bytes: serde_json::to_vec(&inner.state).expect("Serialization error"),
done_tx,
};
inner
.write_queue_tx
.send(write)
.expect("writer task always outlives self");
(ret, done_rx)
};
// the write task can go away once we start .await'ing
let _: () = done_rx.await.expect("writer task dead, check logs");
ret
}
/// When registering a node, persist it so that on next start we will be able to
/// iterate over known nodes to synchronize their tenant shard states with our observed state.
pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
// TODO: node persitence will come with database backend
Ok(())
}
/// At startup, we populate the service's list of nodes, and use this list to call into
/// each node to do an initial reconciliation of the state of the world with our in-memory
/// observed state.
pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
let env = LocalEnv::load_config()?;
// TODO: node persitence will come with database backend
// XXX hack: enable test_backward_compatibility to work by populating our list of
// nodes from LocalEnv when it is not present in persistent storage. Otherwise at
// first startup in the compat test, we may have shards but no nodes.
use control_plane::local_env::LocalEnv;
let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?;
let mut result = Vec::new();
tracing::info!(
"Loading {} pageserver nodes from LocalEnv",
"Loaded {} pageserver nodes from LocalEnv",
env.pageservers.len()
);
let mut nodes = Vec::new();
for ps_conf in env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
let node = Node {
result.push(Node {
id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
@@ -163,96 +184,16 @@ impl Persistence {
listen_http_port: http_port.unwrap_or(80),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
};
// Synchronize database with what we learn from LocalEnv
self.insert_node(&node).await?;
nodes.push(node);
});
}
Ok(nodes)
Ok(result)
}
/// At startup, load the high level state for shards, such as their config + policy. This will
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
let loaded = self
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
})
.await?;
if loaded.is_empty() {
if let Some(path) = &self.json_path {
if tokio::fs::try_exists(path)
.await
.map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
{
tracing::info!("Importing from legacy JSON format at {path}");
return self.list_tenant_shards_json(path).await;
}
}
}
Ok(loaded)
}
/// Shim for automated compatibility tests: load tenants from a JSON file instead of database
pub(crate) async fn list_tenant_shards_json(
&self,
path: &Utf8Path,
) -> DatabaseResult<Vec<TenantShardPersistence>> {
let bytes = tokio::fs::read(path)
.await
.map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
for (tenant_id, tenant) in &mut decoded.tenants {
// Backward compat: an old attachments.json from before PR #6251, replace
// empty strings with proper defaults.
if tenant.tenant_id.is_empty() {
tenant.tenant_id = tenant_id.to_string();
tenant.config = serde_json::to_string(&TenantConfig::default())
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
}
}
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
// Synchronize database with what is in the JSON file
self.insert_tenant_shards(tenants.clone()).await?;
Ok(tenants)
}
/// For use in testing environments, where we dump out JSON on shutdown.
pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
let Some(path) = &self.json_path else {
anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
};
tracing::info!("Writing state to {path}...");
let tenants = self.list_tenant_shards().await?;
let mut tenants_map = HashMap::new();
for tsp in tenants {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
tenants_map.insert(tenant_shard_id, tsp);
}
let json = serde_json::to_string(&JsonPersistence {
tenants: tenants_map,
})?;
tokio::fs::write(path, &json).await?;
tracing::info!("Wrote {} bytes to {path}...", json.len());
Ok(())
/// At startup, we populate our map of tenant shards from persistent storage.
pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
let inner = self.inner.lock().unwrap();
Ok(inner.state.tenants.values().cloned().collect())
}
/// Tenants must be persisted before we schedule them for the first time. This enables us
@@ -260,79 +201,22 @@ impl Persistence {
pub(crate) async fn insert_tenant_shards(
&self,
shards: Vec<TenantShardPersistence>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
})?;
) -> anyhow::Result<()> {
self.mutating_transaction(|locked| {
for shard in shards {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
shard_number: ShardNumber(shard.shard_number as u8),
shard_count: ShardCount(shard.shard_count as u8),
};
locked.tenants.insert(tenant_shard_id, shard);
}
Ok(())
})
.await
}
/// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
/// the tenant from memory on this server.
#[allow(unused)]
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
Ok(())
})
.await
}
/// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
/// batched increment of the generations of all tenants whose generation_pageserver is equal to
/// the node that called /re-attach.
#[tracing::instrument(skip_all, fields(node_id))]
pub(crate) async fn re_attach(
&self,
node_id: NodeId,
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_conn(move |conn| {
let rows_updated = diesel::update(tenant_shards)
.filter(generation_pageserver.eq(node_id.0 as i64))
.set(generation.eq(generation + 1))
.execute(conn)?;
tracing::info!("Incremented {} tenants' generations", rows_updated);
// TODO: UPDATE+SELECT in one query
let updated = tenant_shards
.filter(generation_pageserver.eq(node_id.0 as i64))
.select(TenantShardPersistence::as_select())
.load(conn)?;
Ok(updated)
})
.await?;
let mut result = HashMap::new();
for tsp in updated {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
.map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
}
Ok(result)
}
/// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
/// advancing generation number. We also store the NodeId for which the generation was issued, so that in
/// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node.
@@ -341,46 +225,47 @@ impl Persistence {
tenant_shard_id: TenantShardId,
node_id: NodeId,
) -> anyhow::Result<Generation> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_conn(move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
.set((
generation.eq(generation + 1),
generation_pageserver.eq(node_id.0 as i64),
))
// TODO: only returning() the generation column
.returning(TenantShardPersistence::as_returning())
.get_result(conn)?;
self.mutating_transaction(|locked| {
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
anyhow::bail!("Tried to increment generation of unknown shard");
};
Ok(updated)
})
.await?;
shard.generation += 1;
shard.generation_pageserver = Some(node_id);
Ok(Generation::new(updated.generation as u32))
let gen = Generation::new(shard.generation);
Ok(gen)
})
.await
}
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
.set((
generation_pageserver.eq(i64::MAX),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
))
.execute(conn)?;
Ok(updated)
self.mutating_transaction(|locked| {
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
anyhow::bail!("Tried to increment generation of unknown shard");
};
shard.generation_pageserver = None;
shard.placement_policy = serde_json::to_string(&PlacementPolicy::Detached).unwrap();
Ok(())
})
.await?;
.await
}
Ok(())
pub(crate) async fn re_attach(
&self,
node_id: NodeId,
) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
self.mutating_transaction(|locked| {
let mut result = HashMap::new();
for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
if shard.generation_pageserver == Some(node_id) {
shard.generation += 1;
result.insert(*tenant_shard_id, Generation::new(shard.generation));
}
}
Ok(result)
})
.await
}
// TODO: when we start shard splitting, we must durably mark the tenant so that
@@ -400,8 +285,7 @@ impl Persistence {
}
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
#[diesel(table_name = crate::schema::tenant_shards)]
#[derive(Serialize, Deserialize, Clone)]
pub(crate) struct TenantShardPersistence {
#[serde(default)]
pub(crate) tenant_id: String,
@@ -412,28 +296,16 @@ pub(crate) struct TenantShardPersistence {
#[serde(default)]
pub(crate) shard_stripe_size: i32,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: i32,
// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: i64,
pub(crate) generation_pageserver: Option<NodeId>,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: u32,
#[serde(default)]
pub(crate) placement_policy: String,
#[serde(default)]
pub(crate) config: String,
}
/// Parts of [`crate::node::Node`] that are stored durably
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
#[diesel(table_name = crate::schema::nodes)]
pub(crate) struct NodePersistence {
pub(crate) node_id: i64,
pub(crate) scheduling_policy: String,
pub(crate) listen_http_addr: String,
pub(crate) listen_http_port: i32,
pub(crate) listen_pg_addr: String,
pub(crate) listen_pg_port: i32,
}

View File

@@ -1,27 +0,0 @@
// @generated automatically by Diesel CLI.
diesel::table! {
nodes (node_id) {
node_id -> Int8,
scheduling_policy -> Varchar,
listen_http_addr -> Varchar,
listen_http_port -> Int4,
listen_pg_addr -> Varchar,
listen_pg_port -> Int4,
}
}
diesel::table! {
tenant_shards (tenant_id, shard_number, shard_count) {
tenant_id -> Varchar,
shard_number -> Int4,
shard_count -> Int4,
shard_stripe_size -> Int4,
generation -> Int4,
generation_pageserver -> Int8,
placement_policy -> Varchar,
config -> Text,
}
}
diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);

View File

@@ -11,7 +11,6 @@ use control_plane::attachment_service::{
TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use diesel::result::DatabaseErrorKind;
use hyper::StatusCode;
use pageserver_api::{
control_api::{
@@ -27,7 +26,6 @@ use pageserver_api::{
};
use pageserver_client::mgmt_api;
use utils::{
completion::Barrier,
generation::Generation,
http::error::ApiError,
id::{NodeId, TenantId},
@@ -37,7 +35,7 @@ use utils::{
use crate::{
compute_hook::ComputeHook,
node::Node,
persistence::{DatabaseError, Persistence, TenantShardPersistence},
persistence::{Persistence, TenantShardPersistence},
scheduler::Scheduler,
tenant_state::{
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -48,10 +46,6 @@ use crate::{
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
/// How long [`Service::startup_reconcile`] is allowed to take before it should give
/// up on unresponsive pageservers and proceed.
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
// Top level state available to all HTTP handlers
struct ServiceState {
tenants: BTreeMap<TenantShardId, TenantState>,
@@ -85,27 +79,10 @@ pub struct Config {
pub jwt_token: Option<String>,
}
impl From<DatabaseError> for ApiError {
fn from(err: DatabaseError) -> ApiError {
match err {
DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
// FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
DatabaseError::Connection(_e) => ApiError::ShuttingDown,
DatabaseError::Logical(reason) => {
ApiError::InternalServerError(anyhow::anyhow!(reason))
}
}
}
}
pub struct Service {
inner: Arc<std::sync::RwLock<ServiceState>>,
config: Config,
persistence: Arc<Persistence>,
/// This waits for initial reconciliation with pageservers to complete. Until this barrier
/// passes, it isn't safe to do any actions that mutate tenants.
pub(crate) startup_complete: Barrier,
}
impl From<ReconcileWaitError> for ApiError {
@@ -119,32 +96,77 @@ impl From<ReconcileWaitError> for ApiError {
}
impl Service {
pub fn get_config(&self) -> &Config {
&self.config
}
pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
tracing::info!("Loading nodes from database...");
let mut nodes = persistence.list_nodes().await?;
tracing::info!("Loaded {} nodes from database.", nodes.len());
tracing::info!("Loading shards from database...");
let tenant_shard_persistence = persistence.list_tenant_shards().await?;
tracing::info!(
"Loaded {} shards from database.",
tenant_shard_persistence.len()
);
let mut tenants = BTreeMap::new();
for tsp in tenant_shard_persistence {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
let shard_identity = if tsp.shard_count == 0 {
ShardIdentity::unsharded()
} else {
ShardIdentity::new(
ShardNumber(tsp.shard_number as u8),
ShardCount(tsp.shard_count as u8),
ShardStripeSize(tsp.shard_stripe_size as u32),
)?
};
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
// Note that we load generation, but don't care about generation_pageserver. We will either end up finding
// our existing attached location and it will match generation_pageserver, or we will attach somewhere new
// and update generation_pageserver in the process.
generation: Generation::new(tsp.generation),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent: IntentState::new(),
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
};
tenants.insert(tenant_shard_id, new_tenant);
}
/// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
/// until this is done.
async fn startup_reconcile(&self) {
// For all tenant shards, a vector of observed states on nodes (where None means
// indeterminate, same as in [`ObservedStateLocation`])
let mut observed = HashMap::new();
let nodes = {
let locked = self.inner.read().unwrap();
locked.nodes.clone()
};
// TODO: issue these requests concurrently
for node in nodes.values() {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
for node in &mut nodes {
let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
tracing::info!("Scanning shards on node {}...", node.id);
match client.list_location_config().await {
Err(e) => {
tracing::warn!("Could not contact pageserver {} ({e})", node.id);
// TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
// pageserver is being restarted at the same time as we are
// TODO: be more tolerant, apply a generous 5-10 second timeout
// TODO: setting a node to Offline is a dramatic thing to do, and can
// prevent neon_local from starting up (it starts this service before
// any pageservers are running). It may make sense to give nodes
// a Pending state to accomodate this situation, and allow (but deprioritize)
// scheduling on Pending nodes.
//node.availability = NodeAvailability::Offline;
}
Ok(listing) => {
tracing::info!(
@@ -152,6 +174,7 @@ impl Service {
listing.tenant_shards.len(),
node.id
);
node.availability = NodeAvailability::Active;
for (tenant_shard_id, conf_opt) in listing.tenant_shards {
observed.insert(tenant_shard_id, (node.id, conf_opt));
@@ -163,46 +186,41 @@ impl Service {
let mut cleanup = Vec::new();
// Populate intent and observed states for all tenants, based on reported state on pageservers
let shard_count = {
let mut locked = self.inner.write().unwrap();
for (tenant_shard_id, (node_id, observed_loc)) in observed {
let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
cleanup.push((tenant_shard_id, node_id));
continue;
};
for (tenant_shard_id, (node_id, observed_loc)) in observed {
let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
cleanup.push((tenant_shard_id, node_id));
continue;
};
tenant_state
.observed
.locations
.insert(node_id, ObservedStateLocation { conf: observed_loc });
tenant_state
.observed
.locations
.insert(node_id, ObservedStateLocation { conf: observed_loc });
}
// State of nodes is now frozen, transform to a HashMap.
let mut nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
// Populate each tenant's intent state
let mut scheduler = Scheduler::new(&tenants, &nodes);
for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
tenant_state.intent_from_observed();
if let Err(e) = tenant_state.schedule(&mut scheduler) {
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
// not enough pageservers are available. The tenant may well still be available
// to clients.
tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
}
// Populate each tenant's intent state
let mut scheduler = Scheduler::new(&locked.tenants, &nodes);
for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() {
tenant_state.intent_from_observed();
if let Err(e) = tenant_state.schedule(&mut scheduler) {
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
// not enough pageservers are available. The tenant may well still be available
// to clients.
tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
}
}
locked.tenants.len()
};
// TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
// generation_pageserver in the database.
}
// Clean up any tenants that were found on pageservers but are not known to us.
for (tenant_shard_id, node_id) in cleanup {
// A node reported a tenant_shard_id which is unknown to us: detach it.
let node = nodes
.get(&node_id)
.get_mut(&node_id)
.expect("Always exists: only known nodes are scanned");
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
match client
.location_config(
tenant_shard_id,
@@ -234,80 +252,13 @@ impl Service {
}
}
// Finally, now that the service is up and running, launch reconcile operations for any tenants
// which require it: under normal circumstances this should only include tenants that were in some
// transient state before we restarted.
let reconcile_tasks = self.reconcile_all();
tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
}
pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
tracing::info!("Loading nodes from database...");
let nodes = persistence.list_nodes().await?;
let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
tracing::info!("Loaded {} nodes from database.", nodes.len());
tracing::info!("Loading shards from database...");
let tenant_shard_persistence = persistence.list_tenant_shards().await?;
tracing::info!(
"Loaded {} shards from database.",
tenant_shard_persistence.len()
);
let mut tenants = BTreeMap::new();
for tsp in tenant_shard_persistence {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
let shard_identity = if tsp.shard_count == 0 {
ShardIdentity::unsharded()
} else {
ShardIdentity::new(
ShardNumber(tsp.shard_number as u8),
ShardCount(tsp.shard_count as u8),
ShardStripeSize(tsp.shard_stripe_size as u32),
)?
};
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
// it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new();
if tsp.generation_pageserver != i64::MAX {
intent.attached = Some(NodeId(tsp.generation_pageserver as u64))
}
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: Generation::new(tsp.generation as u32),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
};
tenants.insert(tenant_shard_id, new_tenant);
}
let (startup_completion, startup_complete) = utils::completion::channel();
let shard_count = tenants.len();
let this = Arc::new(Self {
inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
result_tx, nodes, tenants,
))),
config,
persistence,
startup_complete,
});
let result_task_this = this.clone();
@@ -365,13 +316,11 @@ impl Service {
}
});
let startup_reconcile_this = this.clone();
tokio::task::spawn(async move {
// Block the [`Service::startup_complete`] barrier until we're done
let _completion = startup_completion;
startup_reconcile_this.startup_reconcile().await
});
// Finally, now that the service is up and running, launch reconcile operations for any tenants
// which require it: under normal circumstances this should only include tenants that were in some
// transient state before we restarted.
let reconcile_tasks = this.reconcile_all();
tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
Ok(this)
}
@@ -387,6 +336,7 @@ impl Service {
let locked = self.inner.write().unwrap();
!locked.tenants.contains_key(&attach_req.tenant_shard_id)
};
if insert {
let tsp = TenantShardPersistence {
tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
@@ -394,39 +344,22 @@ impl Service {
shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
shard_stripe_size: 0,
generation: 0,
generation_pageserver: i64::MAX,
generation_pageserver: None,
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
};
match self.persistence.insert_tenant_shards(vec![tsp]).await {
Err(e) => match e {
DatabaseError::Query(diesel::result::Error::DatabaseError(
DatabaseErrorKind::UniqueViolation,
_,
)) => {
tracing::info!(
"Raced with another request to insert tenant {}",
attach_req.tenant_shard_id
)
}
_ => return Err(e.into()),
},
Ok(()) => {
tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
self.persistence.insert_tenant_shards(vec![tsp]).await?;
let mut locked = self.inner.write().unwrap();
locked.tenants.insert(
attach_req.tenant_shard_id,
TenantState::new(
attach_req.tenant_shard_id,
ShardIdentity::unsharded(),
PlacementPolicy::Single,
),
);
tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
}
}
let mut locked = self.inner.write().unwrap();
locked.tenants.insert(
attach_req.tenant_shard_id,
TenantState::new(
attach_req.tenant_shard_id,
ShardIdentity::unsharded(),
PlacementPolicy::Single,
),
);
}
let new_generation = if let Some(req_node_id) = attach_req.node_id {
@@ -573,14 +506,6 @@ impl Service {
id: req_tenant.id,
valid,
});
} else {
// After tenant deletion, we may approve any validation. This avoids
// spurious warnings on the pageserver if it has pending LSN updates
// at the point a deletion happens.
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
valid: true,
});
}
}
response
@@ -636,7 +561,7 @@ impl Service {
shard_count: tenant_shard_id.shard_count.0 as i32,
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
generation: 0,
generation_pageserver: i64::MAX,
generation_pageserver: None,
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(),
})
@@ -1042,7 +967,10 @@ impl Service {
availability: NodeAvailability::Active,
};
// TODO: idempotency if the node already exists in the database
self.persistence.insert_node(&new_node).await?;
self.persistence
.insert_node(&new_node)
.await
.map_err(ApiError::InternalServerError)?;
let mut locked = self.inner.write().unwrap();
let mut new_nodes = (*locked.nodes).clone();

View File

@@ -1,11 +1,5 @@
use crate::{background_process, local_env::LocalEnv};
use camino::{Utf8Path, Utf8PathBuf};
use diesel::{
backend::Backend,
query_builder::{AstPass, QueryFragment, QueryId},
Connection, PgConnection, QueryResult, RunQueryDsl,
};
use diesel_migrations::{HarnessWithOutput, MigrationHarness};
use camino::Utf8PathBuf;
use hyper::Method;
use pageserver_api::{
models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
@@ -13,9 +7,9 @@ use pageserver_api::{
};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{env, str::FromStr};
use tokio::process::Command;
use std::{path::PathBuf, process::Child, str::FromStr};
use tracing::instrument;
use utils::{
auth::{Claims, Scope},
@@ -25,17 +19,14 @@ use utils::{
pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: Utf8PathBuf,
path: PathBuf,
jwt_token: Option<String>,
public_key_path: Option<Utf8PathBuf>,
postgres_port: u16,
client: reqwest::Client,
}
const COMMAND: &str = "attachment_service";
const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -178,9 +169,7 @@ pub struct TenantShardMigrateResponse {}
impl AttachmentService {
pub fn from_env(env: &LocalEnv) -> Self {
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
.unwrap()
.join("attachments.json");
let path = env.base_data_dir.join("attachments.json");
// Makes no sense to construct this if pageservers aren't going to use it: assume
// pageservers have control plane API set
@@ -192,13 +181,6 @@ impl AttachmentService {
listen_url.port().unwrap()
);
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
// port, for use by our captive postgres.
let postgres_port = listen_url
.port()
.expect("Control plane API setting should always have a port")
+ 1;
// Assume all pageservers have symmetric auth configuration: this service
// expects to use one JWT token to talk to all of them.
let ps_conf = env
@@ -227,7 +209,6 @@ impl AttachmentService {
listen,
jwt_token,
public_key_path,
postgres_port,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
@@ -239,214 +220,13 @@ impl AttachmentService {
.expect("non-Unicode path")
}
/// PIDFile for the postgres instance used to store attachment service state
fn postgres_pid_file(&self) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(
self.env
.base_data_dir
.join("attachment_service_postgres.pid"),
)
.expect("non-Unicode path")
}
pub async fn start(&self) -> anyhow::Result<Child> {
let path_str = self.path.to_string_lossy();
/// In order to access database migrations, we need to find the Neon source tree
async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
// We assume that either prd or our binary is in the source tree. The former is usually
// true for automated test runners, the latter is usually true for developer workstations. Often
// both are true, which is fine.
let candidate_start_points = [
// Current working directory
Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
// Directory containing the binary we're running inside
Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
];
// For each candidate start point, search through ancestors looking for a neon.git source tree root
for start_point in &candidate_start_points {
// Start from the build dir: assumes we are running out of a built neon source tree
for path in start_point.ancestors() {
// A crude approximation: the root of the source tree is whatever contains a "control_plane"
// subdirectory.
let control_plane = path.join("control_plane");
if tokio::fs::try_exists(&control_plane).await? {
return Ok(path.to_owned());
}
}
}
// Fall-through
Err(anyhow::anyhow!(
"Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
))
}
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
///
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
/// to other versions if that one isn't found. Some automated tests create circumstances
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
for v in prefer_versions {
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
if tokio::fs::try_exists(&path).await? {
return Ok(path);
}
}
// Fall through
anyhow::bail!(
"Postgres binaries not found in {}",
self.env.pg_distrib_dir.display()
);
}
/// Readiness check for our postgres process
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
let bin_path = pg_bin_dir.join("pg_isready");
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
Ok(exitcode.success())
}
/// Create our database if it doesn't exist, and run migrations.
///
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
/// who just want to run `cargo neon_local` without knowing about diesel.
///
/// Returns the database url
pub async fn setup_database(&self) -> anyhow::Result<String> {
let database_url = format!(
"postgresql://localhost:{}/attachment_service",
self.postgres_port
);
println!("Running attachment service database setup...");
fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
let base = ::url::Url::parse(database_url).unwrap();
let database = base.path_segments().unwrap().last().unwrap().to_owned();
let mut new_url = base.join(default_database).unwrap();
new_url.set_query(base.query());
(database, new_url.into())
}
#[derive(Debug, Clone)]
pub struct CreateDatabaseStatement {
db_name: String,
}
impl CreateDatabaseStatement {
pub fn new(db_name: &str) -> Self {
CreateDatabaseStatement {
db_name: db_name.to_owned(),
}
}
}
impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
out.push_sql("CREATE DATABASE ");
out.push_identifier(&self.db_name)?;
Ok(())
}
}
impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
impl QueryId for CreateDatabaseStatement {
type QueryId = ();
const HAS_STATIC_QUERY_ID: bool = false;
}
if PgConnection::establish(&database_url).is_err() {
let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
println!("Creating database: {database}");
let mut conn = PgConnection::establish(&postgres_url)?;
CreateDatabaseStatement::new(&database).execute(&mut conn)?;
}
let mut conn = PgConnection::establish(&database_url)?;
let migrations_dir = self
.find_source_root()
.await?
.join("control_plane/attachment_service/migrations");
let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
println!("Running migrations in {}", migrations.path().display());
HarnessWithOutput::write_to_stdout(&mut conn)
.run_pending_migrations(migrations)
.map(|_| ())
.map_err(|e| anyhow::anyhow!(e))?;
println!("Migrations complete");
Ok(database_url)
}
pub async fn start(&self) -> anyhow::Result<()> {
// Start a vanilla Postgres process used by the attachment service for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
.unwrap()
.join("attachment_service_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_log_path = pg_data_path.join("postgres.log");
if !tokio::fs::try_exists(&pg_data_path).await? {
// Initialize empty database
let initdb_path = pg_bin_dir.join("initdb");
let mut child = Command::new(&initdb_path)
.args(["-D", pg_data_path.as_ref()])
.spawn()
.expect("Failed to spawn initdb");
let status = child.wait().await?;
if !status.success() {
anyhow::bail!("initdb failed with status {status}");
}
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}", self.postgres_port),
)
.await?;
};
println!("Starting attachment service database...");
let db_start_args = [
"-w",
"-D",
pg_data_path.as_ref(),
"-l",
pg_log_path.as_ref(),
"start",
];
background_process::start_process(
"attachment_service_db",
&self.env.base_data_dir,
pg_bin_dir.join("pg_ctl").as_std_path(),
db_start_args,
[],
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|| self.pg_isready(&pg_bin_dir),
)
.await?;
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
let mut args = vec![
"-l",
&self.listen,
"-p",
self.path.as_ref(),
"--database-url",
&database_url,
]
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
let mut args = vec!["-l", &self.listen, "-p", &path_str]
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
if let Some(jwt_token) = &self.jwt_token {
args.push(format!("--jwt-token={jwt_token}"));
}
@@ -455,7 +235,7 @@ impl AttachmentService {
args.push(format!("--public-key={public_key_path}"));
}
background_process::start_process(
let result = background_process::start_process(
COMMAND,
&self.env.base_data_dir,
&self.env.attachment_service_bin(),
@@ -472,46 +252,29 @@ impl AttachmentService {
}
},
)
.await?;
.await;
Ok(())
}
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
println!("Stopping attachment service database...");
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_stop_args)
.spawn()?
.wait()
for ps_conf in &self.env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
self.node_register(NodeRegisterRequest {
node_id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
if !stop_status.success() {
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_status_args)
.spawn()?
.wait()
.await?;
// pg_ctl status returns this exit code if postgres is not running: in this case it is
// fine that stop failed. Otherwise it is an error that stop failed.
const PG_STATUS_NOT_RUNNING: i32 = 3;
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
println!("Attachment service data base is already stopped");
return Ok(());
} else {
anyhow::bail!("Failed to stop attachment service database: {stop_status}")
}
}
Ok(())
result
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())
}
/// Simple HTTP request wrapper for calling into attachment service
async fn dispatch<RQ, RS>(
&self,
@@ -593,7 +356,7 @@ impl AttachmentService {
&self,
req: TenantCreateRequest,
) -> anyhow::Result<TenantCreateResponse> {
self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
self.dispatch(Method::POST, "tenant".to_string(), Some(req))
.await
}
@@ -650,7 +413,7 @@ impl AttachmentService {
) -> anyhow::Result<TimelineInfo> {
self.dispatch(
Method::POST,
format!("v1/tenant/{tenant_id}/timeline"),
format!("tenant/{tenant_id}/timeline"),
Some(req),
)
.await

View File

@@ -17,7 +17,7 @@ use std::io::Write;
use std::os::unix::prelude::AsRawFd;
use std::os::unix::process::CommandExt;
use std::path::Path;
use std::process::Command;
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};
@@ -60,7 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
envs: EI,
initial_pid_file: InitialPidFile,
process_status_check: F,
) -> anyhow::Result<()>
) -> anyhow::Result<Child>
where
F: Fn() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<bool>>,
@@ -98,7 +98,7 @@ where
InitialPidFile::Expect(path) => path,
};
let spawned_process = filled_cmd.spawn().with_context(|| {
let mut spawned_process = filled_cmd.spawn().with_context(|| {
format!("Could not spawn {process_name}, see console output and log files for details.")
})?;
let pid = spawned_process.id();
@@ -106,26 +106,12 @@ where
i32::try_from(pid)
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
);
// set up a scopeguard to kill & wait for the child in case we panic or bail below
let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
println!("SIGKILL & wait the started process");
(|| {
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e..g, walredo).
spawned_process.kill().context("SIGKILL child")?;
spawned_process.wait().context("wait() for child process")?;
anyhow::Ok(())
})()
.with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
.unwrap();
});
for retries in 0..RETRIES {
match process_started(pid, pid_file_to_check, &process_status_check).await {
Ok(true) => {
println!("\n{process_name} started and passed status check, pid: {pid}");
// leak the child process, it'll outlive this neon_local invocation
drop(scopeguard::ScopeGuard::into_inner(spawned_process));
return Ok(());
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
}
Ok(false) => {
if retries == NOTICE_AFTER_RETRIES {
@@ -140,15 +126,16 @@ where
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
}
Err(e) => {
println!("error starting process {process_name:?}: {e:#}");
println!("{process_name} failed to start: {e:#}");
if let Err(e) = spawned_process.kill() {
println!("Could not stop {process_name} subprocess: {e:#}")
};
return Err(e);
}
}
}
println!();
anyhow::bail!(
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
);
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.

View File

@@ -135,7 +135,7 @@ fn main() -> Result<()> {
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(sub_args, &env)),
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
"stop" => handle_stop_all(sub_args, &env),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
@@ -1056,9 +1056,8 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args), *register)
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
@@ -1087,7 +1086,24 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args), false)
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("migrate", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
//TODO what shutdown strategy should we use here?
if let Err(e) = pageserver.stop(false) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
@@ -1145,7 +1161,7 @@ async fn handle_attachment_service(
.map(|s| s.as_str())
== Some("immediate");
if let Err(e) = svc.stop(immediate).await {
if let Err(e) = svc.stop(immediate) {
eprintln!("stop failed: {}", e);
exit(1);
}
@@ -1241,7 +1257,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.start().await {
eprintln!("attachment_service start failed: {:#}", e);
try_stop_all(env, true).await;
try_stop_all(env, true);
exit(1);
}
}
@@ -1249,11 +1265,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver
.start(&pageserver_config_overrides(sub_match), true)
.start(&pageserver_config_overrides(sub_match))
.await
{
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true).await;
try_stop_all(env, true);
exit(1);
}
}
@@ -1262,23 +1278,23 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start(vec![]).await {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false).await;
try_stop_all(env, false);
exit(1);
}
}
Ok(())
}
async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let immediate =
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
try_stop_all(env, immediate).await;
try_stop_all(env, immediate);
Ok(())
}
async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
// Stop all endpoints
match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => {
@@ -1313,7 +1329,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
if env.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.stop(immediate).await {
if let Err(e) = attachment_service.stop(immediate) {
eprintln!("attachment service stop failed: {e:#}");
}
}
@@ -1533,11 +1549,7 @@ fn cli() -> Command {
.subcommand(Command::new("status"))
.subcommand(Command::new("start")
.about("Start local pageserver")
.arg(pageserver_config_args.clone()).arg(Arg::new("register")
.long("register")
.default_value("true").required(false)
.value_parser(value_parser!(bool))
.value_name("register"))
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("stop")
.about("Stop local pageserver")

View File

@@ -438,7 +438,7 @@ impl Endpoint {
}
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
// TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
// TODO use background_process::stop_process instead
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32);
@@ -583,21 +583,9 @@ impl Endpoint {
}
let child = cmd.spawn()?;
// set up a scopeguard to kill & wait for the child in case we panic or bail below
let child = scopeguard::guard(child, |mut child| {
println!("SIGKILL & wait the started process");
(|| {
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned
child.kill().context("SIGKILL child")?;
child.wait().context("wait() for child process")?;
anyhow::Ok(())
})()
.with_context(|| format!("scopeguard kill&wait child {child:?}"))
.unwrap();
});
// Write down the pid so we can wait for it when we want to stop
// TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
// TODO use background_process::start_process instead
let pid = child.id();
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
std::fs::write(pidfile_path, pid.to_string())?;
@@ -646,9 +634,6 @@ impl Endpoint {
std::thread::sleep(ATTEMPT_INTERVAL);
}
// disarm the scopeguard, let the child outlive this function (and neon_local invoction)
drop(scopeguard::ScopeGuard::into_inner(child));
Ok(())
}

View File

@@ -223,11 +223,7 @@ impl LocalEnv {
}
pub fn attachment_service_bin(&self) -> PathBuf {
// Irrespective of configuration, attachment service binary is always
// run from the same location as neon_local. This means that for compatibility
// tests that run old pageserver/safekeeper, they still run latest attachment service.
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
neon_local_bin_dir.join("attachment_service")
self.neon_distrib_dir.join("attachment_service")
}
pub fn safekeeper_bin(&self) -> PathBuf {

View File

@@ -11,7 +11,7 @@ use std::io;
use std::io::Write;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::Command;
use std::process::{Child, Command};
use std::time::Duration;
use anyhow::{bail, Context};
@@ -30,7 +30,6 @@ use utils::{
lsn::Lsn,
};
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
@@ -162,8 +161,8 @@ impl PageServerNode {
.expect("non-Unicode path")
}
pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
self.start_node(config_overrides, false, register).await
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, false).await
}
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -208,8 +207,7 @@ impl PageServerNode {
&self,
config_overrides: &[&str],
update_config: bool,
register: bool,
) -> anyhow::Result<()> {
) -> anyhow::Result<Child> {
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
@@ -246,26 +244,7 @@ impl PageServerNode {
}
},
)
.await?;
if register {
let attachment_service = AttachmentService::from_env(&self.env);
let (pg_host, pg_port) =
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
attachment_service
.node_register(NodeRegisterRequest {
node_id: self.conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
Ok(())
.await
}
fn pageserver_basic_args<'a>(

View File

@@ -7,6 +7,7 @@
//! ```
use std::io::Write;
use std::path::PathBuf;
use std::process::Child;
use std::{io, result};
use anyhow::Context;
@@ -103,7 +104,7 @@ impl SafekeeperNode {
.expect("non-Unicode path")
}
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
print!(
"Starting safekeeper at '{}' in '{}'",
self.pg_connection_config.raw_address(),

View File

@@ -1,9 +0,0 @@
# For documentation on how to configure this file,
# see https://diesel.rs/guides/configuring-diesel-cli
[print_schema]
file = "control_plane/attachment_service/src/schema.rs"
custom_type_derives = ["diesel::query_builder::QueryId"]
[migrations_directory]
dir = "control_plane/attachment_service/migrations"

View File

@@ -9,10 +9,5 @@ prometheus.workspace = true
libc.workspace = true
once_cell.workspace = true
chrono.workspace = true
twox-hash.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
rand = "0.8"
rand_distr = "0.4.3"

View File

@@ -1,523 +0,0 @@
//! HyperLogLog is an algorithm for the count-distinct problem,
//! approximating the number of distinct elements in a multiset.
//! Calculating the exact cardinality of the distinct elements
//! of a multiset requires an amount of memory proportional to
//! the cardinality, which is impractical for very large data sets.
//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm,
//! use significantly less memory than this, but can only approximate the cardinality.
use std::{
collections::HashMap,
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
sync::{atomic::AtomicU8, Arc, RwLock},
};
use prometheus::{
core::{self, Describer},
proto, Opts,
};
use twox_hash::xxh3;
/// Create an [`HyperLogLogVec`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_hll_vec {
($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{
let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap();
$crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec)
}};
($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
$crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
}};
}
/// Create an [`HyperLogLog`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_hll {
($N:literal, $OPTS:expr $(,)?) => {{
let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap();
$crate::register(Box::new(hll.clone())).map(|_| hll)
}};
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
$crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
}};
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLogVec<const N: usize> {
core: Arc<HyperLogLogVecCore<N>>,
}
struct HyperLogLogVecCore<const N: usize> {
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
pub desc: core::Desc,
pub opts: Opts,
}
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
for child in self.core.children.read().unwrap().values() {
child.core.collect_into(&mut metrics);
}
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogVec<N> {
/// Create a new [`HyperLogLogVec`] based on the provided
/// [`Opts`] and partitioned by the given label names. At least one label name must be
/// provided.
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
let opts = opts.variable_labels(variable_names);
let desc = opts.describe()?;
let v = HyperLogLogVecCore {
children: RwLock::new(HashMap::default()),
desc,
opts,
};
Ok(Self { core: Arc::new(v) })
}
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
self.core.get_metric_with_label_values(vals)
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
self.get_metric_with_label_values(vals).unwrap()
}
}
impl<const N: usize> HyperLogLogVecCore<N> {
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let h = self.hash_label_values(vals)?;
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
return Ok(metric);
}
self.get_or_create_metric(h, vals)
}
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
if vals.len() != self.desc.variable_labels.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: self.desc.variable_labels.len(),
got: vals.len(),
});
}
let mut h = xxh3::Hash64::default();
for val in vals {
h.write(val.as_bytes());
}
Ok(h.finish())
}
fn get_or_create_metric(
&self,
hash: u64,
label_values: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let mut children = self.children.write().unwrap();
// Check exist first.
if let Some(metric) = children.get(&hash).cloned() {
return Ok(metric);
}
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
children.insert(hash, metric.clone());
Ok(metric)
}
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLog<const N: usize> {
core: Arc<HyperLogLogCore<N>>,
}
impl<const N: usize> HyperLogLog<N> {
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let opts = Opts::new(name, help);
Self::with_opts(opts)
}
/// Create a [`HyperLogLog`] with the `opts` options.
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
Self::with_opts_and_label_values(&opts, &[])
}
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
let desc = opts.describe()?;
let labels = make_label_pairs(&desc, label_values)?;
let v = HyperLogLogCore {
shards: [0; N].map(AtomicU8::new),
desc,
labels,
};
Ok(Self { core: Arc::new(v) })
}
pub fn measure(&self, item: &impl Hash) {
// changing the hasher will break compatibility with previous measurements.
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
}
fn record(&self, hash: u64) {
let p = N.ilog2() as u8;
let j = hash & (N as u64 - 1);
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
}
}
struct HyperLogLogCore<const N: usize> {
shards: [AtomicU8; N],
desc: core::Desc,
labels: Vec<proto::LabelPair>,
}
impl<const N: usize> core::Collector for HyperLogLog<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
self.core.collect_into(&mut metrics);
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogCore<N> {
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
self.shards.iter().enumerate().for_each(|(i, x)| {
let mut shard_label = proto::LabelPair::default();
shard_label.set_name("hll_shard".to_owned());
shard_label.set_value(format!("{i}"));
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
// This seems like it would be a race condition,
// but HLL is not impacted by a write in one shard happening in between.
// This is because in PromQL we will be implementing a harmonic mean of all buckets.
// we will also merge samples in a time series using `max by (hll_shard)`.
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
let mut m = proto::Metric::default();
let mut c = proto::Gauge::default();
c.set_value(v as f64);
m.set_gauge(c);
let mut labels = Vec::with_capacity(self.labels.len() + 1);
labels.extend_from_slice(&self.labels);
labels.push(shard_label);
m.set_label(labels);
metrics.push(m);
})
}
}
fn make_label_pairs(
desc: &core::Desc,
label_values: &[&str],
) -> prometheus::Result<Vec<proto::LabelPair>> {
if desc.variable_labels.len() != label_values.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: desc.variable_labels.len(),
got: label_values.len(),
});
}
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
if total_len == 0 {
return Ok(vec![]);
}
if desc.variable_labels.is_empty() {
return Ok(desc.const_label_pairs.clone());
}
let mut label_pairs = Vec::with_capacity(total_len);
for (i, n) in desc.variable_labels.iter().enumerate() {
let mut label_pair = proto::LabelPair::default();
label_pair.set_name(n.clone());
label_pair.set_value(label_values[i].to_owned());
label_pairs.push(label_pair);
}
for label_pair in &desc.const_label_pairs {
label_pairs.push(label_pair.clone());
}
label_pairs.sort();
Ok(label_pairs)
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use prometheus::{proto, Opts};
use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::{Distribution, Zipf};
use crate::HyperLogLogVec;
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
let mut metrics = vec![];
hll.core
.children
.read()
.unwrap()
.values()
.for_each(|c| c.core.collect_into(&mut metrics));
metrics
}
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
let mut buckets = [0.0; 32];
for metric in metrics.chunks_exact(32) {
if filter(&metric[0]) {
for (i, m) in metric.iter().enumerate() {
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
}
}
}
buckets
.into_iter()
.map(|f| 2.0f64.powf(-f))
.sum::<f64>()
.recip()
* 0.697
* 32.0
* 32.0
}
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
let mut set_a = HashSet::new();
let mut set_b = HashSet::new();
for x in iter.by_ref().take(n) {
set_a.insert(x.to_bits());
hll.with_label_values(&["a"]).measure(&x.to_bits());
}
for x in iter.by_ref().take(n) {
set_b.insert(x.to_bits());
hll.with_label_values(&["b"]).measure(&x.to_bits());
}
let merge = &set_a | &set_b;
let metrics = collect(&hll);
let len = get_cardinality(&metrics, |_| true);
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
}
#[test]
fn test_cardinality_small() {
let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap());
assert_eq!(actual, [46, 30, 32]);
assert!(51.3 < estimate[0] && estimate[0] < 51.4);
assert!(44.0 < estimate[1] && estimate[1] < 44.1);
assert!(39.0 < estimate[2] && estimate[2] < 39.1);
}
#[test]
fn test_cardinality_medium() {
let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap());
assert_eq!(actual, [2529, 1618, 1629]);
assert!(2309.1 < estimate[0] && estimate[0] < 2309.2);
assert!(1566.6 < estimate[1] && estimate[1] < 1566.7);
assert!(1629.5 < estimate[2] && estimate[2] < 1629.6);
}
#[test]
fn test_cardinality_large() {
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap());
assert_eq!(actual, [129077, 79579, 79630]);
assert!(126067.2 < estimate[0] && estimate[0] < 126067.3);
assert!(83076.8 < estimate[1] && estimate[1] < 83076.9);
assert!(64251.2 < estimate[2] && estimate[2] < 64251.3);
}
#[test]
fn test_cardinality_small2() {
let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap());
assert_eq!(actual, [92, 58, 60]);
assert!(116.1 < estimate[0] && estimate[0] < 116.2);
assert!(81.7 < estimate[1] && estimate[1] < 81.8);
assert!(69.3 < estimate[2] && estimate[2] < 69.4);
}
#[test]
fn test_cardinality_medium2() {
let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap());
assert_eq!(actual, [8201, 5131, 5051]);
assert!(6846.4 < estimate[0] && estimate[0] < 6846.5);
assert!(5239.1 < estimate[1] && estimate[1] < 5239.2);
assert!(4292.8 < estimate[2] && estimate[2] < 4292.9);
}
#[test]
fn test_cardinality_large2() {
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap());
assert_eq!(actual, [777847, 482069, 482246]);
assert!(699437.4 < estimate[0] && estimate[0] < 699437.5);
assert!(374948.9 < estimate[1] && estimate[1] < 374949.0);
assert!(434609.7 < estimate[2] && estimate[2] < 434609.8);
}
}

View File

@@ -28,9 +28,7 @@ use prometheus::{Registry, Result};
pub mod launch_timestamp;
mod wrappers;
pub use wrappers::{CountedReader, CountedWriter};
mod hll;
pub mod metric_vec_duration;
pub use hll::{HyperLogLog, HyperLogLogVec};
pub type UIntGauge = GenericGauge<AtomicU64>;
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;

View File

@@ -20,6 +20,7 @@ strum_macros.workspace = true
hex.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
itertools.workspace = true
workspace_hack.workspace = true

View File

@@ -2,6 +2,7 @@ use postgres_ffi::BLCKSZ;
use std::ops::Range;
use crate::key::Key;
use itertools::Itertools;
///
/// Represents a set of Keys, in a compact form.
@@ -63,16 +64,93 @@ impl KeySpace {
KeyPartitioning { parts }
}
pub fn merge(&mut self, other: &KeySpace) {
let all_ranges = self
.ranges
.iter()
.merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start);
let mut accum = KeySpaceAccum::new();
let mut prev: Option<&Range<Key>> = None;
for range in all_ranges {
if let Some(prev) = prev {
let overlap =
std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end);
assert!(
!overlap,
"Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}",
prev, range
);
}
accum.add_range(range.clone());
prev = Some(range);
}
self.ranges = accum.to_keyspace().ranges;
}
/// Update the keyspace such that it doesn't contain any range
/// that is overlapping with `other`. This can involve splitting or
/// removing of existing ranges.
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
for range in &other.ranges {
while let Some(overlap_at) = self.overlaps_at(range) {
let overlapped = self.ranges[overlap_at].clone();
if overlapped.start < range.start && overlapped.end <= range.end {
// Higher part of the range is completely overlapped.
self.ranges[overlap_at].end = range.start;
}
if overlapped.start >= range.start && overlapped.end > range.end {
// Lower part of the range is completely overlapped.
self.ranges[overlap_at].start = range.end;
}
if overlapped.start < range.start && overlapped.end > range.end {
// Middle part of the range is overlapped.
self.ranges[overlap_at].end = range.start;
self.ranges
.insert(overlap_at + 1, range.end..overlapped.end);
}
if overlapped.start >= range.start && overlapped.end <= range.end {
// Whole range is overlapped
self.ranges.remove(overlap_at);
}
}
}
}
pub fn start(&self) -> Option<Key> {
self.ranges.first().map(|range| range.start)
}
pub fn end(&self) -> Option<Key> {
self.ranges.last().map(|range| range.end)
}
#[allow(unused)]
pub fn total_size(&self) -> usize {
self.ranges
.iter()
.map(|range| key_range_size(range) as usize)
.sum()
}
fn overlaps_at(&self, range: &Range<Key>) -> Option<usize> {
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
Ok(0) => None,
Err(0) => None,
Ok(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
Err(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
_ => None,
}
}
///
/// Check if key space contains overlapping range
///
pub fn overlaps(&self, range: &Range<Key>) -> bool {
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
Ok(0) => false,
Err(0) => false,
Ok(index) => self.ranges[index - 1].end > range.start,
Err(index) => self.ranges[index - 1].end > range.start,
}
self.overlaps_at(range).is_some()
}
}
@@ -441,4 +519,104 @@ mod tests {
// xxxxxxxxxxx
assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently!
}
#[test]
fn test_remove_overlapping_with() {
// Test full overlaps
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(4),
Key::from_i128(5)..Key::from_i128(8),
Key::from_i128(10)..Key::from_i128(12),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(2)..Key::from_i128(3),
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(13),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(2),
Key::from_i128(3)..Key::from_i128(4),
Key::from_i128(5)..Key::from_i128(6),
Key::from_i128(7)..Key::from_i128(8),
Key::from_i128(10)..Key::from_i128(11)
]
);
// Test partial ovelaps
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(3)..Key::from_i128(6),
Key::from_i128(8)..Key::from_i128(11),
Key::from_i128(14)..Key::from_i128(17),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(3),
Key::from_i128(7)..Key::from_i128(8),
Key::from_i128(12)..Key::from_i128(14),
]
);
// Test no overlaps
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(12),
Key::from_i128(15)..Key::from_i128(17),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
]
);
// Test one range overlaps multiple
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
],
};
let key_space2 = KeySpace {
ranges: vec![Key::from_i128(4)..Key::from_i128(14)],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(4),
Key::from_i128(14)..Key::from_i128(15),
]
);
}
}

View File

@@ -8,7 +8,6 @@ use std::pin::Pin;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::time::SystemTime;
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result;
@@ -24,7 +23,6 @@ use futures::stream::Stream;
use futures_util::StreamExt;
use http_types::{StatusCode, Url};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::s3_bucket::RequestKind;
@@ -185,6 +183,7 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
}
}
#[async_trait::async_trait]
impl RemoteStorage for AzureBlobStorage {
async fn list(
&self,
@@ -372,20 +371,6 @@ impl RemoteStorage for AzureBlobStorage {
copy_status = status;
}
}
async fn time_travel_recover(
&self,
_prefix: Option<&RemotePath>,
_timestamp: SystemTime,
_done_if_after: SystemTime,
_cancel: CancellationToken,
) -> anyhow::Result<()> {
// TODO use Azure point in time recovery feature for this
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
Err(anyhow::anyhow!(
"time travel recovery for azure blob storage is not implemented"
))
}
}
pin_project_lite::pin_project! {

View File

@@ -25,7 +25,6 @@ use bytes::Bytes;
use futures::stream::Stream;
use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use toml_edit::Item;
use tracing::info;
@@ -143,7 +142,7 @@ pub struct Listing {
/// Storage (potentially remote) API to manage its state.
/// This storage tries to be unaware of any layered repository context,
/// providing basic CRUD operations for storage files.
#[allow(async_fn_in_trait)]
#[async_trait::async_trait]
pub trait RemoteStorage: Send + Sync + 'static {
/// Lists all top level subdirectories for a given prefix
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
@@ -211,15 +210,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
/// Copy a remote object inside a bucket from one path to another.
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
/// Resets the content of everything with the given prefix to the given state
async fn time_travel_recover(
&self,
prefix: Option<&RemotePath>,
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()>;
}
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -272,15 +262,14 @@ impl std::error::Error for DownloadError {}
/// Every storage, currently supported.
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
#[derive(Clone)]
// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
pub enum GenericRemoteStorage {
LocalFs(LocalFs),
AwsS3(Arc<S3Bucket>),
AzureBlob(Arc<AzureBlobStorage>),
Unreliable(Other),
Unreliable(Arc<UnreliableWrapper>),
}
impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
impl GenericRemoteStorage {
pub async fn list(
&self,
prefix: Option<&RemotePath>,
@@ -397,33 +386,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
Self::Unreliable(s) => s.copy(from, to).await,
}
}
pub async fn time_travel_recover(
&self,
prefix: Option<&RemotePath>,
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => {
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
Self::AwsS3(s) => {
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
Self::AzureBlob(s) => {
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
Self::Unreliable(s) => {
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
}
}
}
impl GenericRemoteStorage {
@@ -711,7 +673,6 @@ impl ConcurrencyLimiter {
RequestKind::List => &self.read,
RequestKind::Delete => &self.write,
RequestKind::Copy => &self.write,
RequestKind::TimeTravel => &self.write,
}
}

View File

@@ -4,7 +4,7 @@
//! This storage used in tests, but can also be used in cases when a certain persistent
//! volume is mounted to the local FS.
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
use anyhow::{bail, ensure, Context};
use bytes::Bytes;
@@ -14,7 +14,7 @@ use tokio::{
fs,
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
use tokio_util::{io::ReaderStream, sync::CancellationToken};
use tokio_util::io::ReaderStream;
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
@@ -157,6 +157,7 @@ impl LocalFs {
}
}
#[async_trait::async_trait]
impl RemoteStorage for LocalFs {
async fn list(
&self,
@@ -422,17 +423,6 @@ impl RemoteStorage for LocalFs {
})?;
Ok(())
}
#[allow(clippy::diverging_sub_expression)]
async fn time_travel_recover(
&self,
_prefix: Option<&RemotePath>,
_timestamp: SystemTime,
_done_if_after: SystemTime,
_cancel: CancellationToken,
) -> anyhow::Result<()> {
unimplemented!()
}
}
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {

View File

@@ -6,14 +6,12 @@
use std::{
borrow::Cow,
collections::HashMap,
pin::Pin,
sync::Arc,
task::{Context, Poll},
time::SystemTime,
};
use anyhow::{anyhow, Context as _};
use anyhow::Context as _;
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider,
@@ -29,19 +27,17 @@ use aws_sdk_s3::{
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
operation::get_object::GetObjectError,
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
types::{Delete, ObjectIdentifier},
Client,
};
use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_types::body::SdkBody;
use aws_smithy_types::byte_stream::ByteStream;
use aws_smithy_types::{body::SdkBody, DateTime};
use bytes::Bytes;
use futures::stream::Stream;
use hyper::Body;
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use utils::backoff;
use super::StorageMetadata;
use crate::{
@@ -274,59 +270,6 @@ impl S3Bucket {
}
}
}
async fn delete_oids(
&self,
kind: RequestKind,
delete_objects: &[ObjectIdentifier],
) -> anyhow::Result<()> {
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
let started_at = start_measuring_requests(kind);
let resp = self
.client
.delete_objects()
.bucket(self.bucket_name.clone())
.delete(
Delete::builder()
.set_objects(Some(chunk.to_vec()))
.build()?,
)
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
let resp = resp?;
metrics::BUCKET_METRICS
.deleted_objects_total
.inc_by(chunk.len() as u64);
if let Some(errors) = resp.errors {
// Log a bounded number of the errors within the response:
// these requests can carry 1000 keys so logging each one
// would be too verbose, especially as errors may lead us
// to retry repeatedly.
const LOG_UP_TO_N_ERRORS: usize = 10;
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
tracing::warn!(
"DeleteObjects key {} failed: {}: {}",
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
e.message.as_ref().map(Cow::from).unwrap_or("".into())
);
}
return Err(anyhow::format_err!(
"Failed to delete {} objects",
errors.len()
));
}
}
Ok(())
}
}
pin_project_lite::pin_project! {
@@ -430,6 +373,7 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
}
}
#[async_trait::async_trait]
impl RemoteStorage for S3Bucket {
async fn list(
&self,
@@ -625,195 +569,64 @@ impl RemoteStorage for S3Bucket {
delete_objects.push(obj_id);
}
self.delete_oids(kind, &delete_objects).await
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
let started_at = start_measuring_requests(kind);
let resp = self
.client
.delete_objects()
.bucket(self.bucket_name.clone())
.delete(
Delete::builder()
.set_objects(Some(chunk.to_vec()))
.build()?,
)
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
match resp {
Ok(resp) => {
metrics::BUCKET_METRICS
.deleted_objects_total
.inc_by(chunk.len() as u64);
if let Some(errors) = resp.errors {
// Log a bounded number of the errors within the response:
// these requests can carry 1000 keys so logging each one
// would be too verbose, especially as errors may lead us
// to retry repeatedly.
const LOG_UP_TO_N_ERRORS: usize = 10;
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
tracing::warn!(
"DeleteObjects key {} failed: {}: {}",
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
e.message.as_ref().map(Cow::from).unwrap_or("".into())
);
}
return Err(anyhow::format_err!(
"Failed to delete {} objects",
errors.len()
));
}
}
Err(e) => {
return Err(e.into());
}
}
}
Ok(())
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let paths = std::array::from_ref(path);
self.delete_objects(paths).await
}
async fn time_travel_recover(
&self,
prefix: Option<&RemotePath>,
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::TimeTravel;
let _guard = self.permit(kind).await;
let timestamp = DateTime::from(timestamp);
let done_if_after = DateTime::from(done_if_after);
tracing::info!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
// get the passed prefix or if it is not set use prefix_in_bucket value
let prefix = prefix
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
let warn_threshold = 3;
let max_retries = 10;
let is_permanent = |_e: &_| false;
let mut key_marker = None;
let mut version_id_marker = None;
let mut versions_and_deletes = Vec::new();
loop {
let response = backoff::retry(
|| async {
Ok(self
.client
.list_object_versions()
.bucket(self.bucket_name.clone())
.set_prefix(prefix.clone())
.set_key_marker(key_marker.clone())
.set_version_id_marker(version_id_marker.clone())
.send()
.await?)
},
is_permanent,
warn_threshold,
max_retries,
"listing object versions for time_travel_recover",
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
)
.await?;
tracing::trace!(
" Got List response version_id_marker={:?}, key_marker={:?}",
response.version_id_marker,
response.key_marker
);
let versions = response.versions.unwrap_or_default();
let delete_markers = response.delete_markers.unwrap_or_default();
let new_versions = versions.into_iter().map(VerOrDelete::Version);
let new_deletes = delete_markers.into_iter().map(VerOrDelete::DeleteMarker);
let new_versions_and_deletes = new_versions.chain(new_deletes);
versions_and_deletes.extend(new_versions_and_deletes);
fn none_if_empty(v: Option<String>) -> Option<String> {
v.filter(|v| !v.is_empty())
}
version_id_marker = none_if_empty(response.next_version_id_marker);
key_marker = none_if_empty(response.next_key_marker);
if version_id_marker.is_none() {
// The final response is not supposed to be truncated
if response.is_truncated.unwrap_or_default() {
anyhow::bail!(
"Received truncated ListObjectVersions response for prefix={prefix:?}"
);
}
break;
}
}
// Work on the list of references instead of the objects directly,
// otherwise we get lifetime errors in the sort_by_key call below.
let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
versions_and_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
let mut vds_for_key = HashMap::<_, Vec<_>>::new();
for vd in &versions_and_deletes {
let last_modified = vd.last_modified();
let version_id = vd.version_id();
let key = vd.key();
let (Some(last_modified), Some(version_id), Some(key)) =
(last_modified, version_id, key)
else {
anyhow::bail!(
"One (or more) of last_modified, key, and id is None. \
Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
last_modified, key, version_id,
);
};
if version_id == "null" {
anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
indicating either disabled versioning, or legacy objects with null version id values");
}
tracing::trace!(
"Parsing version key={key} version_id={version_id} is_delete={}",
matches!(vd, VerOrDelete::DeleteMarker(_))
);
vds_for_key
.entry(key)
.or_default()
.push((vd, last_modified, version_id));
}
for (key, versions) in vds_for_key {
let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
if last_last_modified > &&done_if_after {
tracing::trace!("Key {key} has version later than done_if_after, skipping");
continue;
}
// the version we want to restore to.
let version_to_restore_to =
match versions.binary_search_by_key(&timestamp, |tpl| *tpl.1) {
Ok(v) => v,
Err(e) => e,
};
if version_to_restore_to == versions.len() {
tracing::trace!("Key {key} has no changes since timestamp, skipping");
continue;
}
let mut do_delete = false;
if version_to_restore_to == 0 {
// All versions more recent, so the key didn't exist at the specified time point.
tracing::trace!(
"All {} versions more recent for {key}, deleting",
versions.len()
);
do_delete = true;
} else {
match &versions[version_to_restore_to - 1] {
(VerOrDelete::Version(_), _last_modified, version_id) => {
tracing::trace!("Copying old version {version_id} for {key}...");
// Restore the state to the last version by copying
let source_id =
format!("{}/{key}?versionId={version_id}", self.bucket_name);
backoff::retry(
|| async {
Ok(self
.client
.copy_object()
.bucket(self.bucket_name.clone())
.key(key)
.copy_source(&source_id)
.send()
.await?)
},
is_permanent,
warn_threshold,
max_retries,
"listing object versions for time_travel_recover",
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
)
.await?;
}
(VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
do_delete = true;
}
}
};
if do_delete {
if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
// Key has since been deleted (but there was some history), no need to do anything
tracing::trace!("Key {key} already deleted, skipping.");
} else {
tracing::trace!("Deleting {key}...");
let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?;
self.delete_oids(kind, &[oid]).await?;
}
}
}
Ok(())
}
}
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
@@ -838,32 +651,6 @@ fn start_measuring_requests(
})
}
enum VerOrDelete {
Version(ObjectVersion),
DeleteMarker(DeleteMarkerEntry),
}
impl VerOrDelete {
fn last_modified(&self) -> Option<&DateTime> {
match self {
VerOrDelete::Version(v) => v.last_modified(),
VerOrDelete::DeleteMarker(v) => v.last_modified(),
}
}
fn version_id(&self) -> Option<&str> {
match self {
VerOrDelete::Version(v) => v.version_id(),
VerOrDelete::DeleteMarker(v) => v.version_id(),
}
}
fn key(&self) -> Option<&str> {
match self {
VerOrDelete::Version(v) => v.key(),
VerOrDelete::DeleteMarker(v) => v.key(),
}
}
}
#[cfg(test)]
mod tests {
use camino::Utf8Path;

View File

@@ -12,7 +12,6 @@ pub(crate) enum RequestKind {
Delete = 2,
List = 3,
Copy = 4,
TimeTravel = 5,
}
use RequestKind::*;
@@ -25,7 +24,6 @@ impl RequestKind {
Delete => "delete_object",
List => "list_objects",
Copy => "copy_object",
TimeTravel => "time_travel_recover",
}
}
const fn as_index(&self) -> usize {
@@ -33,7 +31,7 @@ impl RequestKind {
}
}
pub(super) struct RequestTyped<C>([C; 6]);
pub(super) struct RequestTyped<C>([C; 5]);
impl<C> RequestTyped<C> {
pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -42,8 +40,8 @@ impl<C> RequestTyped<C> {
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
use RequestKind::*;
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
let arr = std::array::from_fn::<C, 6, _>(|index| {
let mut it = [Get, Put, Delete, List, Copy].into_iter();
let arr = std::array::from_fn::<C, 5, _>(|index| {
let next = it.next().unwrap();
assert_eq!(index, next.as_index());
f(next)

View File

@@ -3,19 +3,16 @@
//! testing purposes.
use bytes::Bytes;
use futures::stream::Stream;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Mutex;
use std::time::SystemTime;
use std::{collections::hash_map::Entry, sync::Arc};
use tokio_util::sync::CancellationToken;
use crate::{
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
StorageMetadata,
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
};
pub struct UnreliableWrapper {
inner: GenericRemoteStorage<Arc<VoidStorage>>,
inner: crate::GenericRemoteStorage,
// This many attempts of each operation will fail, then we let it succeed.
attempts_to_fail: u64,
@@ -32,21 +29,11 @@ enum RemoteOp {
Download(RemotePath),
Delete(RemotePath),
DeleteObjects(Vec<RemotePath>),
TimeTravelRecover(Option<RemotePath>),
}
impl UnreliableWrapper {
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
assert!(attempts_to_fail > 0);
let inner = match inner {
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s),
GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s),
// We could also make this a no-op, as in, extract the inner of the passed generic remote storage
GenericRemoteStorage::Unreliable(_s) => {
panic!("Can't wrap unreliable wrapper unreliably")
}
};
UnreliableWrapper {
inner,
attempts_to_fail,
@@ -97,9 +84,7 @@ impl UnreliableWrapper {
}
}
// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage.
type VoidStorage = crate::LocalFs;
#[async_trait::async_trait]
impl RemoteStorage for UnreliableWrapper {
async fn list_prefixes(
&self,
@@ -184,17 +169,4 @@ impl RemoteStorage for UnreliableWrapper {
self.attempt(RemoteOp::Upload(to.clone()))?;
self.inner.copy_object(from, to).await
}
async fn time_travel_recover(
&self,
prefix: Option<&RemotePath>,
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()> {
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?;
self.inner
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
}

View File

@@ -1,21 +1,15 @@
use std::collections::HashSet;
use std::env;
use std::fmt::{Debug, Display};
use std::num::NonZeroUsize;
use std::ops::ControlFlow;
use std::sync::Arc;
use std::time::{Duration, UNIX_EPOCH};
use std::{collections::HashSet, time::SystemTime};
use std::time::UNIX_EPOCH;
use crate::common::{download_to_vec, upload_stream};
use anyhow::Context;
use camino::Utf8Path;
use futures_util::Future;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
};
use test_context::test_context;
use test_context::AsyncTestContext;
use tokio_util::sync::CancellationToken;
use tracing::info;
mod common;
@@ -24,160 +18,11 @@ mod common;
mod tests_s3;
use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
use utils::backoff;
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test";
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorage::Enabled(ctx) => ctx,
MaybeEnabledStorage::Disabled => return Ok(()),
};
// Our test depends on discrepancies in the clock between S3 and the environment the tests
// run in. Therefore, wait a little bit before and after. The alternative would be
// to take the time from S3 response headers.
const WAIT_TIME: Duration = Duration::from_millis(3_000);
async fn retry<T, O, F, E>(op: O) -> Result<T, E>
where
E: Display + Debug + 'static,
O: FnMut() -> F,
F: Future<Output = Result<T, E>>,
{
let warn_threshold = 3;
let max_retries = 10;
backoff::retry(
op,
|_e| false,
warn_threshold,
max_retries,
"test retry",
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await
}
async fn time_point() -> SystemTime {
tokio::time::sleep(WAIT_TIME).await;
let ret = SystemTime::now();
tokio::time::sleep(WAIT_TIME).await;
ret
}
async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
Ok(retry(|| client.list_files(None))
.await
.context("list root files failure")?
.into_iter()
.collect::<HashSet<_>>())
}
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
retry(|| {
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None)
})
.await?;
let t0_files = list_files(&ctx.client).await?;
let t0 = time_point().await;
println!("at t0: {t0_files:?}");
let old_data = "remote blob data2";
retry(|| {
let (data, len) = upload_stream(old_data.as_bytes().into());
ctx.client.upload(data, len, &path2, None)
})
.await?;
let t1_files = list_files(&ctx.client).await?;
let t1 = time_point().await;
println!("at t1: {t1_files:?}");
// A little check to ensure that our clock is not too far off from the S3 clock
{
let dl = retry(|| ctx.client.download(&path2)).await?;
let last_modified = dl.last_modified.unwrap();
let half_wt = WAIT_TIME.mul_f32(0.5);
let t0_hwt = t0 + half_wt;
let t1_hwt = t1 - half_wt;
if !(t0_hwt..=t1_hwt).contains(&last_modified) {
panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \
This likely means a large lock discrepancy between S3 and the local clock.");
}
}
retry(|| {
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None)
})
.await?;
let new_data = "new remote blob data2";
retry(|| {
let (data, len) = upload_stream(new_data.as_bytes().into());
ctx.client.upload(data, len, &path2, None)
})
.await?;
retry(|| ctx.client.delete(&path1)).await?;
let t2_files = list_files(&ctx.client).await?;
let t2 = time_point().await;
println!("at t2: {t2_files:?}");
// No changes after recovery to t2 (no-op)
let t_final = time_point().await;
ctx.client
.time_travel_recover(None, t2, t_final, CancellationToken::new())
.await?;
let t2_files_recovered = list_files(&ctx.client).await?;
println!("after recovery to t2: {t2_files_recovered:?}");
assert_eq!(t2_files, t2_files_recovered);
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
assert_eq!(path2_recovered_t2, new_data.as_bytes());
// after recovery to t1: path1 is back, path2 has the old content
let t_final = time_point().await;
ctx.client
.time_travel_recover(None, t1, t_final, CancellationToken::new())
.await?;
let t1_files_recovered = list_files(&ctx.client).await?;
println!("after recovery to t1: {t1_files_recovered:?}");
assert_eq!(t1_files, t1_files_recovered);
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
assert_eq!(path2_recovered_t1, old_data.as_bytes());
// after recovery to t0: everything is gone except for path1
let t_final = time_point().await;
ctx.client
.time_travel_recover(None, t0, t_final, CancellationToken::new())
.await?;
let t0_files_recovered = list_files(&ctx.client).await?;
println!("after recovery to t0: {t0_files_recovered:?}");
assert_eq!(t0_files, t0_files_recovered);
// cleanup
let paths = &[path1, path2, path3];
retry(|| ctx.client.delete_objects(paths)).await?;
Ok(())
}
struct EnabledS3 {
client: Arc<GenericRemoteStorage>,
base_prefix: &'static str,

View File

@@ -1,7 +1,7 @@
use std::{
borrow::Cow,
fs::{self, File},
io,
io::{self, Write},
};
use camino::{Utf8Path, Utf8PathBuf};
@@ -112,6 +112,48 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
tokio::fs::File::open(path.as_ref()).await?.sync_all().await
}
/// Writes a file to the specified `final_path` in a crash safe fasion
///
/// The file is first written to the specified tmp_path, and in a second
/// step, the tmp path is renamed to the final path. As renames are
/// atomic, a crash during the write operation will never leave behind a
/// partially written file.
///
/// NB: an async variant of this code exists in Pageserver's VirtualFile.
pub fn overwrite(
final_path: &Utf8Path,
tmp_path: &Utf8Path,
content: &[u8],
) -> std::io::Result<()> {
let Some(final_path_parent) = final_path.parent() else {
return Err(std::io::Error::from_raw_os_error(
nix::errno::Errno::EINVAL as i32,
));
};
std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
let mut file = std::fs::OpenOptions::new()
.write(true)
// Use `create_new` so that, if we race with ourselves or something else,
// we bail out instead of causing damage.
.create_new(true)
.open(tmp_path)?;
file.write_all(content)?;
file.sync_all()?;
drop(file); // before the rename, that's important!
// renames are atomic
std::fs::rename(tmp_path, final_path)?;
// Only open final path parent dirfd now, so that this operation only
// ever holds one VirtualFile fd at a time. That's important because
// the current `find_victim_slot` impl might pick the same slot for both
// VirtualFile., and it eventually does a blocking write lock instead of
// try_lock.
let final_parent_dirfd = std::fs::OpenOptions::new()
.read(true)
.open(final_path_parent)?;
final_parent_dirfd.sync_all()?;
Ok(())
}
#[cfg(test)]
mod tests {

View File

@@ -131,9 +131,7 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
_ => info!("Error processing HTTP request: {api_error:#}"),
_ => error!("Error processing HTTP request: {api_error:#}"),
}
api_error.into_response()

View File

@@ -61,7 +61,6 @@ sync_wrapper.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-epoll-uring.workspace = true
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
tokio-stream.workspace = true

View File

@@ -18,7 +18,7 @@ use pageserver::tenant::block_io::FileBlockReader;
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
use pageserver::tenant::storage_layer::range_overlaps;
use pageserver::virtual_file::{self, VirtualFile};
use pageserver::virtual_file::VirtualFile;
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
pageserver::virtual_file::init(10);
pageserver::page_cache::init(100);
let mut total_delta_layers = 0usize;

View File

@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
virtual_file::init(10);
page_cache::init(100);
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
@@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
new_tenant_id,
new_timeline_id,
} => {
pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
pageserver::virtual_file::init(10);
pageserver::page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

View File

@@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
virtual_file::init(10);
page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
dump_layerfile_from_path(path, true, &ctx).await

View File

@@ -262,10 +262,26 @@ where
let blocks = self
.timeline
.get_vectored(&part.ranges, self.lsn, self.ctx)
.await?;
.await;
let blocks = match blocks {
Ok(blocks) => blocks,
Err(err) => {
error!("get_vectored failed: {}", err);
return Err(anyhow!(err));
}
};
for (key, block) in blocks {
slru_builder.add_block(&key, block?).await?;
let block = match block {
Ok(block) => block,
Err(err) => {
error!("get_vectored failed on key {}: {}", key, err);
return Err(anyhow!(err));
}
};
slru_builder.add_block(&key, block).await?;
}
}

View File

@@ -130,7 +130,7 @@ fn main() -> anyhow::Result<()> {
let scenario = failpoint_support::init();
// Basic initialization of things that don't change after startup
virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
virtual_file::init(conf.max_file_descriptors);
page_cache::init(conf.page_cache_size);
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;

View File

@@ -36,7 +36,6 @@ use crate::tenant::config::TenantConfOpt;
use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
};
use crate::virtual_file;
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
@@ -44,8 +43,6 @@ use crate::{
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE;
pub mod defaults {
use crate::tenant::config::defaults::*;
use const_format::formatcp;
@@ -82,8 +79,6 @@ pub mod defaults {
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
///
/// Default built-in configuration file.
///
@@ -119,8 +114,6 @@ pub mod defaults {
#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -254,8 +247,6 @@ pub struct PageServerConf {
/// Maximum number of WAL records to be ingested and committed at the same time
pub ingest_batch_size: u64,
pub virtual_file_io_engine: virtual_file::IoEngineKind,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -340,8 +331,6 @@ struct PageServerConfigBuilder {
secondary_download_concurrency: BuilderValue<usize>,
ingest_batch_size: BuilderValue<u64>,
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
}
impl Default for PageServerConfigBuilder {
@@ -417,8 +406,6 @@ impl Default for PageServerConfigBuilder {
secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
}
}
}
@@ -575,10 +562,6 @@ impl PageServerConfigBuilder {
self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
}
pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) {
self.virtual_file_io_engine = BuilderValue::Set(value);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_warmup = self
.concurrent_tenant_warmup
@@ -686,9 +669,6 @@ impl PageServerConfigBuilder {
ingest_batch_size: self
.ingest_batch_size
.ok_or(anyhow!("missing ingest_batch_size"))?,
virtual_file_io_engine: self
.virtual_file_io_engine
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
})
}
}
@@ -940,9 +920,6 @@ impl PageServerConf {
builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
},
"ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
"virtual_file_io_engine" => {
builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1016,7 +993,6 @@ impl PageServerConf {
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
}
}
}
@@ -1249,7 +1225,6 @@ background_task_maximum_delay = '334 s'
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -1313,7 +1288,6 @@ background_task_maximum_delay = '334 s'
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: 100,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -97,86 +97,23 @@ pub enum EvictionOrder {
/// Order the layers to be evicted by how recently they have been accessed relatively within
/// the set of resident layers of a tenant.
///
/// This strategy will evict layers more fairly but is untested.
RelativeAccessed {
/// Determines if the tenant with most layers should lose first.
///
/// Having this enabled is currently the only reasonable option, because the order in which
/// we read tenants is deterministic. If we find the need to use this as `false`, we need
/// to ensure nondeterminism by adding in a random number to break the
/// `relative_last_activity==0.0` ties.
#[serde(default = "default_highest_layer_count_loses_first")]
#[serde(default)]
highest_layer_count_loses_first: bool,
},
}
fn default_highest_layer_count_loses_first() -> bool {
true
}
impl EvictionOrder {
fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
use EvictionOrder::*;
/// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
/// counts should be the first ones to have their layers evicted.
fn highest_layer_count_loses_first(&self) -> bool {
match self {
AbsoluteAccessed => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.last_activity_ts)
});
}
RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
}),
}
}
/// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants
/// layers in **most** recently used order.
fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
use EvictionOrder::*;
match self {
AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
RelativeAccessed {
EvictionOrder::AbsoluteAccessed => false,
EvictionOrder::RelativeAccessed {
highest_layer_count_loses_first,
} => {
// keeping the -1 or not decides if every tenant should lose their least recently accessed
// layer OR if this should happen in the order of having highest layer count:
let fudge = if *highest_layer_count_loses_first {
// relative_last_activity vs. tenant layer count:
// - 0.1..=1.0 (10 layers)
// - 0.01..=1.0 (100 layers)
// - 0.001..=1.0 (1000 layers)
//
// leading to evicting less of the smallest tenants.
0
} else {
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
// be that less than 10k layer evictions is enough, so we would not need to evict from
// all tenants.
//
// as the tenant ordering is now deterministic this could hit the same tenants
// disproportionetly on multiple invocations. alternative could be to remember how many
// layers did we evict last time from this tenant, and inject that as an additional
// fudge here.
1
};
let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
let divider = total as f32;
// most recently used is always (total - 0) / divider == 1.0
// least recently used depends on the fudge:
// - (total - 1) - (total - 1) / total => 0 / total
// - total - (total - 1) / total => 1 / total
let distance = (total - index) as f32;
finite_f32::FiniteF32::try_from_normalized(distance / divider)
.unwrap_or_else(|val| {
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
finite_f32::FiniteF32::ZERO
})
}
} => *highest_layer_count_loses_first,
}
}
}
@@ -452,6 +389,52 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
let selection = select_victims(&candidates, usage_pre);
let mut candidates = candidates;
let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
// we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
// for comparison here. this is a temporary measure to develop alternatives.
use std::fmt::Write;
let mut summary_buf = String::with_capacity(256);
{
let absolute_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{absolute_summary}").expect("string grows");
info!("absolute accessed selection summary: {summary_buf}");
}
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
});
let selection = select_victims(&candidates, usage_pre);
{
summary_buf.clear();
let relative_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{relative_summary}").expect("string grows");
info!("relative accessed selection summary: {summary_buf}");
}
selection
} else {
selection
};
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
// phase2: evict layers
@@ -852,12 +835,54 @@ async fn collect_eviction_candidates(
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
let mut cumsum: i128 = 0;
let total = tenant_candidates.len();
// keeping the -1 or not decides if every tenant should lose their least recently accessed
// layer OR if this should happen in the order of having highest layer count:
let fudge = if eviction_order.highest_layer_count_loses_first() {
// relative_age vs. tenant layer count:
// - 0.1..=1.0 (10 layers)
// - 0.01..=1.0 (100 layers)
// - 0.001..=1.0 (1000 layers)
//
// leading to evicting less of the smallest tenants.
0
} else {
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
// be that less than 10k layer evictions is enough, so we would not need to evict from
// all tenants.
//
// as the tenant ordering is now deterministic this could hit the same tenants
// disproportionetly on multiple invocations. alternative could be to remember how many
// layers did we evict last time from this tenant, and inject that as an additional
// fudge here.
1
};
let total = tenant_candidates
.len()
.checked_sub(fudge)
.filter(|&x| x > 0)
// support 0 or 1 resident layer tenants as well
.unwrap_or(1);
let divider = total as f32;
for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
// as we iterate this reverse sorted list, the most recently accessed layer will always
// be 1.0; this is for us to evict it last.
candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);
candidate.relative_last_activity = if matches!(
eviction_order,
EvictionOrder::RelativeAccessed { .. }
) {
// another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
// similarly for u16. unsure how it would help.
finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
.unwrap_or_else(|val| {
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
finite_f32::FiniteF32::ZERO
})
} else {
finite_f32::FiniteF32::ZERO
};
let partition = if cumsum > min_resident_size as i128 {
MinResidentSizePartition::Above
@@ -902,7 +927,10 @@ async fn collect_eviction_candidates(
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
eviction_order.sort(&mut candidates);
// always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
// will sort later by candidate.relative_last_activity to get compare evictions.
candidates
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
Ok(EvictionCandidates::Finished(candidates))
}
@@ -1042,12 +1070,6 @@ pub(crate) mod finite_f32 {
}
}
impl From<FiniteF32> for f32 {
fn from(value: FiniteF32) -> f32 {
value.0
}
}
impl FiniteF32 {
pub const ZERO: FiniteF32 = FiniteF32(0.0);
@@ -1060,9 +1082,136 @@ pub(crate) mod finite_f32 {
Err(value)
}
}
}
}
pub fn into_inner(self) -> f32 {
self.into()
mod summary {
use super::finite_f32::FiniteF32;
use super::{EvictionCandidate, LayerCount};
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use std::time::SystemTime;
#[derive(Debug, Default)]
pub(super) struct EvictionSummary {
evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
total: LayerCount,
last_absolute: Option<SystemTime>,
last_relative: Option<FiniteF32>,
}
impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
let mut summary = EvictionSummary::default();
for item in iter {
let counts = summary
.evicted_per_tenant
.entry(*item.layer.get_tenant_shard_id())
.or_default();
let sz = item.layer.get_file_size();
counts.file_sizes += sz;
counts.count += 1;
summary.total.file_sizes += sz;
summary.total.count += 1;
summary.last_absolute = Some(item.last_activity_ts);
summary.last_relative = Some(item.relative_last_activity);
}
summary
}
}
struct SiBytesAmount(u64);
impl std::fmt::Display for SiBytesAmount {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.0 < 1024 {
return write!(f, "{}B", self.0);
}
let mut tmp = self.0;
let mut ch = 0;
let suffixes = b"KMGTPE";
while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
tmp /= 1024;
ch += 1;
}
let ch = suffixes[ch] as char;
write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
}
}
impl std::fmt::Display for EvictionSummary {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// wasteful, but it's for testing
let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
for (tenant_shard_id, count) in &self.evicted_per_tenant {
sorted
.entry(count.count)
.or_default()
.push((*tenant_shard_id, count.file_sizes));
}
let total_file_sizes = SiBytesAmount(self.total.file_sizes);
writeln!(
f,
"selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
self.total.count, self.last_absolute, self.last_relative,
)?;
for (count, per_tenant) in sorted.iter().rev().take(10) {
write!(f, "- {count} layers: ")?;
if per_tenant.len() < 3 {
for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
let bytes = SiBytesAmount(*bytes);
write!(f, "{tenant_shard_id} ({bytes})")?;
}
} else {
let num_tenants = per_tenant.len();
let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
let total_bytes = SiBytesAmount(total_bytes);
let layers = num_tenants * count;
write!(
f,
"{num_tenants} tenants {total_bytes} in total {layers} layers",
)?;
}
writeln!(f)?;
}
if sorted.len() > 10 {
let (rem_count, rem_bytes) = sorted
.iter()
.rev()
.map(|(count, per_tenant)| {
(
count,
per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
)
})
.fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
let rem_bytes = SiBytesAmount(rem_bytes);
writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
}
Ok(())
}
}
}
@@ -1187,40 +1336,3 @@ mod filesystem_level_usage {
assert!(!usage.has_pressure());
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn relative_equal_bounds() {
let order = EvictionOrder::RelativeAccessed {
highest_layer_count_loses_first: false,
};
let len = 10;
let v = (0..len)
.map(|i| order.relative_last_activity(len, i).into_inner())
.collect::<Vec<_>>();
assert_eq!(v.first(), Some(&1.0));
assert_eq!(v.last(), Some(&0.0));
assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
}
#[test]
fn relative_spare_bounds() {
let order = EvictionOrder::RelativeAccessed {
highest_layer_count_loses_first: true,
};
let len = 10;
let v = (0..len)
.map(|i| order.relative_last_activity(len, i).into_inner())
.collect::<Vec<_>>();
assert_eq!(v.first(), Some(&1.0));
assert_eq!(v.last(), Some(&0.1));
assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
}
}

View File

@@ -187,7 +187,6 @@ impl From<TenantSlotUpsertError> for ApiError {
match e {
InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
MapState(e) => e.into(),
ShuttingDown(_) => ApiError::ShuttingDown,
}
}
}
@@ -496,10 +495,6 @@ async fn timeline_create_handler(
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
Err(_) if tenant.cancel.is_cancelled() => {
// In case we get some ugly error type during shutdown, cast it into a clean 503.
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
}
Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
json_response(StatusCode::CONFLICT, ())
}
@@ -1262,9 +1257,19 @@ async fn tenant_create_handler(
};
// We created the tenant. Existing API semantics are that the tenant
// is Active when this function returns.
new_tenant
if let res @ Err(_) = new_tenant
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
.await?;
.await
{
// This shouldn't happen because we just created the tenant directory
// in upsert_location, and there aren't any remote timelines
// to load, so, nothing can really fail during load.
// Don't do cleanup because we don't know how we got here.
// The tenant will likely be in `Broken` state and subsequent
// calls will fail.
res.context("created tenant failed to become active")
.map_err(ApiError::InternalServerError)?;
}
json_response(
StatusCode::CREATED,

View File

@@ -1,4 +1,3 @@
#![recursion_limit = "300"]
#![deny(clippy::undocumented_unsafe_blocks)]
mod auth;

View File

@@ -150,43 +150,6 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) struct GetVectoredLatency {
map: EnumMap<TaskKind, Option<Histogram>>,
}
impl GetVectoredLatency {
// Only these task types perform vectored gets. Filter all other tasks out to reduce total
// cardinality of the metric.
const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler];
pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
self.map[task_kind].as_ref()
}
}
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
let inner = register_histogram_vec!(
"pageserver_get_vectored_seconds",
"Time spent in get_vectored",
&["task_kind"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric");
GetVectoredLatency {
map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
let task_kind = task_kind.into();
Some(inner.with_label_values(&[task_kind]))
} else {
None
}
})),
}
});
pub(crate) struct PageCacheMetricsForTaskKind {
pub read_accesses_materialized_page: IntCounter,
pub read_accesses_immutable: IntCounter,
@@ -969,7 +932,6 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
#[cfg(not(test))]
pub(crate) mod virtual_file_descriptor_cache {
use super::*;
@@ -989,20 +951,6 @@ pub(crate) mod virtual_file_descriptor_cache {
// ```
}
#[cfg(not(test))]
pub(crate) mod virtual_file_io_engine {
use super::*;
pub(crate) static KIND: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_virtual_file_io_engine_kind",
"The configured io engine for VirtualFile",
&["kind"],
)
.unwrap()
});
}
#[derive(Debug)]
struct GlobalAndPerTimelineHistogram {
global: Histogram,

View File

@@ -322,8 +322,8 @@ enum PageStreamError {
Shutdown,
/// Something went wrong reading a page: this likely indicates a pageserver bug
#[error("Read error")]
Read(#[source] PageReconstructError),
#[error("Read error: {0}")]
Read(PageReconstructError),
/// Ran out of time waiting for an LSN
#[error("LSN timeout: {0}")]
@@ -332,11 +332,11 @@ enum PageStreamError {
/// The entity required to serve the request (tenant or timeline) is not found,
/// or is not found in a suitable state to serve a request.
#[error("Not found: {0}")]
NotFound(Cow<'static, str>),
NotFound(std::borrow::Cow<'static, str>),
/// Request asked for something that doesn't make sense, like an invalid LSN
#[error("Bad request: {0}")]
BadRequest(Cow<'static, str>),
BadRequest(std::borrow::Cow<'static, str>),
}
impl From<PageReconstructError> for PageStreamError {
@@ -667,10 +667,7 @@ impl PageServerHandler {
// print the all details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error
// here includes cancellation which is not an error.
let full = utils::error::report_compact_sources(&e);
span.in_scope(|| {
error!("error reading relation or page version: {full:#}")
});
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})

View File

@@ -627,15 +627,9 @@ impl Tenant {
deletion_queue_client,
));
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
// we shut down while attaching.
let Ok(attach_gate_guard) = tenant.gate.enter() else {
// We just created the Tenant: nothing else can have shut it down yet
unreachable!();
};
// Do all the hard work in the background
let tenant_clone = Arc::clone(&tenant);
let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
task_mgr::spawn(
&tokio::runtime::Handle::current(),
@@ -645,8 +639,6 @@ impl Tenant {
"attach tenant",
false,
async move {
let _gate_guard = attach_gate_guard;
// Is this tenant being spawned as part of process startup?
let starting_up = init_order.is_some();
scopeguard::defer! {
@@ -821,7 +813,7 @@ impl Tenant {
SpawnMode::Create => None,
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
};
match tenant_clone.attach(preload, mode, &ctx).await {
match tenant_clone.attach(preload, &ctx).await {
Ok(()) => {
info!("attach finished, activating");
if let Some(t)= attach_timer {t.observe_duration();}
@@ -908,20 +900,15 @@ impl Tenant {
async fn attach(
self: &Arc<Tenant>,
preload: Option<TenantPreload>,
mode: SpawnMode,
ctx: &RequestContext,
) -> anyhow::Result<()> {
span::debug_assert_current_span_has_tenant_id();
failpoint_support::sleep_millis_async!("before-attaching-tenant");
let preload = match (preload, mode) {
(Some(p), _) => p,
(None, SpawnMode::Create) => TenantPreload {
deleting: false,
timelines: HashMap::new(),
},
(None, SpawnMode::Normal) => {
let preload = match preload {
Some(p) => p,
None => {
// Deprecated dev mode: load from local disk state instead of remote storage
// https://github.com/neondatabase/neon/issues/5624
return self.load_local(ctx).await;
@@ -1696,13 +1683,9 @@ impl Tenant {
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
if !self.is_active() {
if matches!(self.current_state(), TenantState::Stopping { .. }) {
return Err(CreateTimelineError::ShuttingDown);
} else {
return Err(CreateTimelineError::Other(anyhow::anyhow!(
"Cannot create timelines on inactive tenant"
)));
}
return Err(CreateTimelineError::Other(anyhow::anyhow!(
"Cannot create timelines on inactive tenant"
)));
}
let _gate = self
@@ -4052,7 +4035,7 @@ pub(crate) mod harness {
.instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
.await?;
tenant
.attach(Some(preload), SpawnMode::Normal, ctx)
.attach(Some(preload), ctx)
.instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
.await?;
}
@@ -4130,11 +4113,13 @@ mod tests {
use crate::keyspace::KeySpaceAccum;
use crate::repository::{Key, Value};
use crate::tenant::harness::*;
use crate::tenant::timeline::GetVectoredImpl;
use crate::DEFAULT_PG_VERSION;
use crate::METADATA_FILE_NAME;
use bytes::BytesMut;
use hex_literal::hex;
use once_cell::sync::Lazy;
use pageserver_api::keyspace::KeySpace;
use rand::{thread_rng, Rng};
use tokio_util::sync::CancellationToken;
@@ -4827,6 +4812,61 @@ mod tests {
Ok(())
}
async fn bulk_insert_compact_gc(
timeline: Arc<Timeline>,
ctx: &RequestContext,
mut lsn: Lsn,
repeat: usize,
key_count: usize,
) -> anyhow::Result<()> {
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut blknum = 0;
// Enforce that key range is monotonously increasing
let mut keyspace = KeySpaceAccum::new();
for _ in 0..repeat {
for _ in 0..key_count {
test_key.field6 = blknum;
let writer = timeline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
keyspace.add_key(test_key);
lsn = Lsn(lsn.0 + 0x10);
blknum += 1;
}
let cutoff = timeline.get_last_record_lsn();
timeline
.update_gc_info(
Vec::new(),
cutoff,
Duration::ZERO,
&CancellationToken::new(),
ctx,
)
.await?;
timeline.freeze_and_flush().await?;
timeline
.compact(&CancellationToken::new(), EnumSet::empty(), ctx)
.await?;
timeline.gc().await?;
}
Ok(())
}
//
// Insert 1000 key-value pairs with increasing keys, flush, compact, GC.
// Repeat 50 times.
@@ -4839,49 +4879,126 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
let mut lsn = Lsn(0x10);
let lsn = Lsn(0x10);
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
let mut keyspace = KeySpaceAccum::new();
let guard = tline.layers.read().await;
guard.layer_map().dump(true, &ctx).await?;
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut blknum = 0;
for _ in 0..50 {
for _ in 0..10000 {
test_key.field6 = blknum;
let writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
Ok(())
}
keyspace.add_key(test_key);
// Test the vectored get real implementation against a simple sequential implementation.
//
// The test generates a keyspace by repeatedly flushing the in-memory layer and compacting.
// Projected to 2D the key space looks like below. Lsn grows upwards on the Y axis and keys
// grow to the right on the X axis.
// [Delta]
// [Delta]
// [Delta]
// [Delta]
// ------------ Image ---------------
//
// After layer generation we pick the ranges to query as follows:
// 1. The beginning of each delta layer
// 2. At the seam between two adjacent delta layers
//
// There's one major downside to this test: delta layers only contains images,
// so the search can stop at the first delta layer and doesn't traverse any deeper.
#[tokio::test]
async fn test_get_vectored() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_bulk_insert")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
lsn = Lsn(lsn.0 + 0x10);
blknum += 1;
let lsn = Lsn(0x10);
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
let guard = tline.layers.read().await;
guard.layer_map().dump(true, &ctx).await?;
let mut reads = Vec::new();
let mut prev = None;
guard.layer_map().iter_historic_layers().for_each(|desc| {
if !desc.is_delta() {
prev = Some(desc.clone());
return;
}
let cutoff = tline.get_last_record_lsn();
let start = desc.key_range.start;
let end = desc
.key_range
.start
.add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap());
reads.push(KeySpace {
ranges: vec![start..end],
});
tline
.update_gc_info(
Vec::new(),
cutoff,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.await?;
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.gc().await?;
if let Some(prev) = &prev {
if !prev.is_delta() {
return;
}
let first_range = Key {
field6: prev.key_range.end.field6 - 4,
..prev.key_range.end
}..prev.key_range.end;
let second_range = desc.key_range.start..Key {
field6: desc.key_range.start.field6 + 4,
..desc.key_range.start
};
reads.push(KeySpace {
ranges: vec![first_range, second_range],
});
};
prev = Some(desc.clone());
});
drop(guard);
// Pick a big LSN such that we query over all the changes.
// Technically, u64::MAX - 1 is the largest LSN supported by the read path,
// but there seems to be a bug on the non-vectored search path which surfaces
// in that case.
let reads_lsn = Lsn(u64::MAX - 1000);
for read in reads {
info!("Doing vectored read on {:?}", read);
tline.set_get_vectored_impl(GetVectoredImpl::Sequential);
let sequential_res = tline
.get_vectored(&read.ranges, reads_lsn, &ctx)
.await?
.into_iter();
tline.set_get_vectored_impl(GetVectoredImpl::Vectored);
let vectored_res = tline
.get_vectored(&read.ranges, reads_lsn, &ctx)
.await?
.into_iter();
sequential_res.zip(vectored_res).for_each(
|((seq_key, seq_res), (vec_key, vec_res))| {
assert_eq!(seq_key, vec_key);
match (&seq_res, &vec_res) {
(Ok(seq_blob), Ok(vec_blob)) => assert_eq!(seq_blob, vec_blob),
_ => {
error!(
"Unexpected error for key={}: seq_res={:?} vec_res={:?}",
seq_key,
seq_res.map(|_| "Ok(...)"),
vec_res.map(|_| "Ok(...)")
);
panic!()
}
}
},
);
}
Ok(())

View File

@@ -5,10 +5,10 @@
use super::ephemeral_file::EphemeralFile;
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::context::RequestContext;
use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
use crate::virtual_file::VirtualFile;
use bytes::Bytes;
use std::ops::Deref;
use std::ops::{Deref, DerefMut};
/// This is implemented by anything that can read 8 kB (PAGE_SZ)
/// blocks, using the page cache
@@ -39,8 +39,6 @@ pub enum BlockLease<'a> {
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
#[cfg(test)]
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
#[cfg(test)]
Vec(Vec<u8>),
}
impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -65,10 +63,6 @@ impl<'a> Deref for BlockLease<'a> {
BlockLease::EphemeralFileMutableTail(v) => v,
#[cfg(test)]
BlockLease::Arc(v) => v.deref(),
#[cfg(test)]
BlockLease::Vec(v) => {
TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ")
}
}
}
}
@@ -175,14 +169,10 @@ impl FileBlockReader {
}
/// Read a page from the underlying file into given buffer.
async fn fill_buffer(
&self,
buf: PageWriteGuard<'static>,
blkno: u32,
) -> Result<PageWriteGuard<'static>, std::io::Error> {
async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
assert!(buf.len() == PAGE_SZ);
self.file
.read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64)
.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
.await
}
/// Read a block.
@@ -206,9 +196,9 @@ impl FileBlockReader {
)
})? {
ReadBufResult::Found(guard) => Ok(guard.into()),
ReadBufResult::NotFound(write_guard) => {
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
let write_guard = self.fill_buffer(write_guard, blknum).await?;
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
Ok(write_guard.mark_valid().into())
}
}

View File

@@ -409,10 +409,7 @@ impl DeleteTenantFlow {
.await
.expect("cant be stopping or broken");
tenant
.attach(preload, super::SpawnMode::Normal, ctx)
.await
.context("attach")?;
tenant.attach(preload, ctx).await.context("attach")?;
Self::background(
guard,

View File

@@ -5,11 +5,11 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::{self, VirtualFile};
use crate::virtual_file::VirtualFile;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use std::cmp::min;
use std::fs::OpenOptions;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::sync::atomic::AtomicU64;
@@ -47,10 +47,7 @@ impl EphemeralFile {
let file = VirtualFile::open_with_options(
&filename,
virtual_file::OpenOptions::new()
.read(true)
.write(true)
.create(true),
OpenOptions::new().read(true).write(true).create(true),
)
.await?;
@@ -92,10 +89,11 @@ impl EphemeralFile {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = self
.file
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
page_cache::ReadBufResult::NotFound(mut write_guard) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));

View File

@@ -153,7 +153,8 @@ pub struct SearchResult {
pub lsn_floor: Lsn,
}
pub struct OrderedSearchResult(SearchResult);
#[derive(Debug)]
pub struct OrderedSearchResult(pub SearchResult);
impl Ord for OrderedSearchResult {
fn cmp(&self, other: &Self) -> Ordering {
@@ -175,6 +176,7 @@ impl PartialEq for OrderedSearchResult {
impl Eq for OrderedSearchResult {}
#[derive(Debug)]
pub struct RangeSearchResult {
pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
pub not_found: KeySpaceAccum,
@@ -237,7 +239,7 @@ where
Self {
delta_coverage: delta_coverage.peekable(),
image_coverage: image_coverage.peekable(),
key_range,
key_range: key_range.clone(),
end_lsn,
current_delta: None,
current_image: None,
@@ -247,7 +249,7 @@ where
/// Run the collector. Collection is implemented via a two pointer algorithm.
/// One pointer tracks the start of the current range and the other tracks
/// the beginning of the next range which will overlap with the next change
/// the beggining of the next range which will overlap with the next change
/// in coverage across both image and delta.
fn collect(mut self) -> RangeSearchResult {
let next_layer_type = self.choose_next_layer_type();
@@ -362,6 +364,21 @@ where
}
}
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub enum InMemoryLayerHandle {
Open { lsn_floor: Lsn },
Frozen { idx: usize, lsn_floor: Lsn },
}
impl InMemoryLayerHandle {
pub fn get_lsn_floor(&self) -> Lsn {
match self {
InMemoryLayerHandle::Open { lsn_floor } => *lsn_floor,
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
}
}
}
impl LayerMap {
///
/// Find the latest layer (by lsn.end) that covers the given
@@ -556,6 +573,47 @@ impl LayerMap {
self.historic.iter()
}
pub fn iter_in_memory_layers(&self) -> impl '_ + Iterator<Item = Arc<InMemoryLayer>> {
match &self.open_layer {
Some(layer) => itertools::Either::Left(
std::iter::once(layer.clone()).chain(self.frozen_layers.iter().cloned().rev()),
),
None => itertools::Either::Right(self.frozen_layers.iter().cloned().rev()),
}
}
// TODO:
// * define in memory handle
// * update layer fringe to use PersistentLayerDesc and InMemoryLayerHandle
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
where
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
{
if let Some(open) = &self.open_layer {
if pred(open) {
return Some(InMemoryLayerHandle::Open {
lsn_floor: open.get_lsn_range().start,
});
}
}
let pos = self.frozen_layers.iter().position(pred);
match pos {
Some(idx) => Some(InMemoryLayerHandle::Frozen {
idx,
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
}),
None => None,
}
}
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
match handle {
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
}
}
///
/// Divide the whole given range of keys into sub-ranges based on the latest
/// image layer that covers each range at the specified lsn (inclusive).
@@ -957,7 +1015,7 @@ mod tests {
},
LayerDesc {
key_range: Key::from_i128(10)..Key::from_i128(20),
lsn_range: Lsn(5)..Lsn(20),
lsn_range: Lsn(10)..Lsn(20),
is_delta: true,
},
LayerDesc {
@@ -988,4 +1046,41 @@ mod tests {
}
}
}
#[test]
fn ranged_search_compacted() {
let layers = vec![
LayerDesc {
key_range: Key::from_i128(0)..Key::from_i128(100),
lsn_range: Lsn(9)..Lsn(10),
is_delta: false,
},
LayerDesc {
key_range: Key::from_i128(10)..Key::from_i128(20),
lsn_range: Lsn(10)..Lsn(20),
is_delta: true,
},
LayerDesc {
key_range: Key::from_i128(20)..Key::from_i128(30),
lsn_range: Lsn(20)..Lsn(30),
is_delta: true,
},
LayerDesc {
key_range: Key::from_i128(30)..Key::from_i128(40),
lsn_range: Lsn(30)..Lsn(40),
is_delta: true,
},
];
let layer_map = create_layer_map(layers.clone());
for start in 0..105 {
for end in (start + 1)..105 {
let range = Key::from_i128(start)..Key::from_i128(end);
let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap();
let expected = brute_force_range_search(&layer_map, range, Lsn(100));
assert_range_search_result_eq(result, expected);
}
}
}
}

View File

@@ -150,7 +150,7 @@ impl<Value: Clone> LayerCoverage<Value> {
let mut range_coverage = self.range(range).peekable();
if range_coverage
.peek()
.is_some_and(|c| c.1.as_ref() == Some(&change))
.is_some_and(|c| c.1 == Some(change.clone()))
{
range_coverage.next();
}

View File

@@ -7,7 +7,6 @@ use pageserver_api::models::ShardParameters;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
use rand::{distributions::Alphanumeric, Rng};
use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap};
use std::ops::Deref;
use std::sync::Arc;
@@ -33,8 +32,7 @@ use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
TenantConfOpt,
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
@@ -468,26 +466,6 @@ pub async fn init_tenant_mgr(
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
if let Some(gen) = generations.get(&tenant_shard_id) {
if let LocationMode::Attached(attached) = &location_conf.mode {
if attached.generation > *gen {
tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
attached.generation
);
// We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
// local disk content: demote to secondary rather than detaching.
tenants.insert(
tenant_shard_id,
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
location_conf.shard,
location_conf.tenant_conf,
&SecondaryLocationConfig { warm: false },
)),
);
}
}
*gen
} else {
match &location_conf.mode {
@@ -743,7 +721,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
tokio::select! {
Some(joined) = join_set.join_next() => {
match joined {
Ok(()) => {},
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the tasks");
}
@@ -904,7 +882,7 @@ impl TenantManager {
tenant_shard_id: TenantShardId,
new_location_config: LocationConf,
flush: Option<Duration>,
mut spawn_mode: SpawnMode,
spawn_mode: SpawnMode,
ctx: &RequestContext,
) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
debug_assert_current_span_has_tenant_id();
@@ -924,29 +902,19 @@ impl TenantManager {
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
match (&new_location_config.mode, peek_slot) {
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
match attach_conf.generation.cmp(&tenant.generation) {
Ordering::Equal => {
// A transition from Attached to Attached in the same generation, we may
// take our fast path and just provide the updated configuration
// to the tenant.
tenant.set_new_location_config(
AttachedTenantConf::try_from(new_location_config.clone())
.map_err(UpsertLocationError::BadRequest)?,
);
if attach_conf.generation == tenant.generation {
// A transition from Attached to Attached in the same generation, we may
// take our fast path and just provide the updated configuration
// to the tenant.
tenant.set_new_location_config(
AttachedTenantConf::try_from(new_location_config.clone())
.map_err(UpsertLocationError::BadRequest)?,
);
Some(FastPathModified::Attached(tenant.clone()))
}
Ordering::Less => {
return Err(UpsertLocationError::BadRequest(anyhow::anyhow!(
"Generation {:?} is less than existing {:?}",
attach_conf.generation,
tenant.generation
)));
}
Ordering::Greater => {
// Generation advanced, fall through to general case of replacing `Tenant` object
None
}
Some(FastPathModified::Attached(tenant.clone()))
} else {
// Different generations, fall through to general case
None
}
}
(
@@ -1051,12 +1019,6 @@ impl TenantManager {
}
}
slot_guard.drop_old_value().expect("We just shut it down");
// Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
// the caller thinks they're creating but the tenant already existed. We must switch to
// Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
// rather than assuming it to be empty.
spawn_mode = SpawnMode::Normal;
}
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
@@ -1140,46 +1102,14 @@ impl TenantManager {
None
};
match slot_guard.upsert(new_slot) {
Err(TenantSlotUpsertError::InternalError(e)) => {
Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
slot_guard.upsert(new_slot).map_err(|e| match e {
TenantSlotUpsertError::InternalError(e) => {
UpsertLocationError::Other(anyhow::anyhow!(e))
}
Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
// If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then
// we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants
// are shutdown.
//
// We must shut it down inline here.
match new_slot {
TenantSlot::InProgress(_) => {
// Unreachable because we never insert an InProgress
unreachable!()
}
TenantSlot::Attached(tenant) => {
let (_guard, progress) = utils::completion::channel();
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
match tenant.shutdown(progress, false).await {
Ok(()) => {
info!("Finished shutting down just-spawned tenant");
}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
}
}
TenantSlot::Secondary(secondary_tenant) => {
secondary_tenant.shutdown().await;
}
}
TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
})?;
Err(UpsertLocationError::Unavailable(
TenantMapError::ShuttingDown,
))
}
Ok(()) => Ok(attached_tenant),
}
Ok(attached_tenant)
}
/// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
@@ -1798,31 +1728,14 @@ pub(crate) enum TenantSlotError {
/// Superset of TenantMapError: issues that can occur when using a SlotGuard
/// to insert a new value.
#[derive(thiserror::Error)]
pub(crate) enum TenantSlotUpsertError {
#[derive(Debug, thiserror::Error)]
pub enum TenantSlotUpsertError {
/// An error where the slot is in an unexpected state, indicating a code bug
#[error("Internal error updating Tenant")]
InternalError(Cow<'static, str>),
#[error(transparent)]
MapState(TenantMapError),
// If we encounter TenantManager shutdown during upsert, we must carry the Completion
// from the SlotGuard, so that the caller can hold it while they clean up: otherwise
// TenantManager shutdown might race ahead before we're done cleaning up any Tenant that
// was protected by the SlotGuard.
#[error("Shutting down")]
ShuttingDown((TenantSlot, utils::completion::Completion)),
}
impl std::fmt::Debug for TenantSlotUpsertError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Self::InternalError(reason) => write!(f, "Internal Error {reason}"),
Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"),
Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"),
}
}
MapState(#[from] TenantMapError),
}
#[derive(Debug, thiserror::Error)]
@@ -1871,7 +1784,7 @@ pub struct SlotGuard {
/// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will
/// release any waiters as soon as this SlotGuard is dropped.
completion: utils::completion::Completion,
_completion: utils::completion::Completion,
}
impl SlotGuard {
@@ -1884,7 +1797,7 @@ impl SlotGuard {
tenant_shard_id,
old_value,
upserted: false,
completion,
_completion: completion,
}
}
@@ -1917,16 +1830,9 @@ impl SlotGuard {
}
let m = match &mut *locked {
TenantsMap::Initializing => {
return Err(TenantSlotUpsertError::MapState(
TenantMapError::StillInitializing,
))
}
TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()),
TenantsMap::ShuttingDown(_) => {
return Err(TenantSlotUpsertError::ShuttingDown((
new_value,
self.completion.clone(),
)));
return Err(TenantMapError::ShuttingDown.into());
}
TenantsMap::Open(m) => m,
};
@@ -1974,9 +1880,7 @@ impl SlotGuard {
Err(TenantSlotUpsertError::InternalError(_)) => {
// We already logged the error, nothing else we can do.
}
Err(
TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_),
) => {
Err(TenantSlotUpsertError::MapState(_)) => {
// If the map is shutting down, we need not replace anything
}
Ok(()) => {}
@@ -2074,22 +1978,18 @@ fn tenant_map_peek_slot<'a>(
tenant_shard_id: &TenantShardId,
mode: TenantSlotPeekMode,
) -> Result<Option<&'a TenantSlot>, TenantMapError> {
match tenants.deref() {
TenantsMap::Initializing => Err(TenantMapError::StillInitializing),
let m = match tenants.deref() {
TenantsMap::Initializing => return Err(TenantMapError::StillInitializing),
TenantsMap::ShuttingDown(m) => match mode {
TenantSlotPeekMode::Read => Ok(Some(
// When reading in ShuttingDown state, we must translate None results
// into a ShuttingDown error, because absence of a tenant shard ID in the map
// isn't a reliable indicator of the tenant being gone: it might have been
// InProgress when shutdown started, and cleaned up from that state such
// that it's now no longer in the map. Callers will have to wait until
// we next start up to get a proper answer. This avoids incorrect 404 API responses.
m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?,
)),
TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown),
TenantSlotPeekMode::Read => m,
TenantSlotPeekMode::Write => {
return Err(TenantMapError::ShuttingDown);
}
},
TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)),
}
TenantsMap::Open(m) => m,
};
Ok(m.get(tenant_shard_id))
}
enum TenantSlotAcquireMode {

View File

@@ -8,17 +8,24 @@ pub(crate) mod layer;
mod layer_desc;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
use crate::task_mgr::TaskKind;
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
use enum_map::EnumMap;
use enumset::EnumSet;
use once_cell::sync::Lazy;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
use pageserver_api::models::{
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
};
use std::cell::{Ref, RefCell};
use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
use std::ops::Range;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::warn;
use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -34,6 +41,13 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
use self::layer::DownloadError;
use super::layer_map::{InMemoryLayerHandle, LayerMap};
use super::timeline::layer_manager::LayerManager;
use super::timeline::GetVectoredError;
use super::{PageReconstructError, Timeline};
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
T: PartialOrd<T>,
@@ -61,12 +75,219 @@ where
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
/// call, to collect more records.
///
#[derive(Debug)]
#[derive(Debug, Default)]
pub struct ValueReconstructState {
pub records: Vec<(Lsn, NeonWalRecord)>,
pub img: Option<(Lsn, Bytes)>,
}
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum ValureReconstructSituation {
Complete,
#[default]
Continue,
}
#[derive(Debug, Default, Clone)]
pub struct ValueReconstructStateTmp {
pub records: Vec<(Lsn, NeonWalRecord)>,
pub img: Option<(Lsn, Bytes)>,
cached_lsn: Option<Lsn>,
situation: ValureReconstructSituation,
}
impl From<ValueReconstructStateTmp> for ValueReconstructState {
fn from(mut state: ValueReconstructStateTmp) -> Self {
state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
ValueReconstructState {
records: state.records,
img: state.img
}
}
}
pub struct ValuesReconstructState {
pub keys: HashMap<Key, Result<ValueReconstructStateTmp, PageReconstructError>>,
pub total_keys_done: usize,
keys_done: KeySpaceAccum,
}
impl ValuesReconstructState {
pub fn new() -> Self {
Self {
keys: HashMap::new(),
total_keys_done: 0,
keys_done: KeySpaceAccum::new(),
}
}
pub fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
let previous = self.keys.insert(key, Err(err));
if let Some(Ok(state)) = previous {
if state.situation == ValureReconstructSituation::Continue {
self.total_keys_done += 1;
self.keys_done.add_key(key);
}
}
}
pub fn update_key(&mut self, key: &Key, lsn: Lsn, value: Value) -> bool {
let state = self
.keys
.entry(*key)
.or_insert(Ok(ValueReconstructStateTmp::default()));
if let Ok(state) = state {
let key_done = match state.situation {
ValureReconstructSituation::Complete => true,
ValureReconstructSituation::Continue => match value {
Value::Image(img) => {
state.img = Some((lsn, img));
true
}
Value::WalRecord(rec) => {
let reached_cache = state.cached_lsn.map(|clsn| clsn + 1) == Some(lsn);
let will_init = rec.will_init();
state.records.push((lsn, rec));
will_init || reached_cache
}
},
};
if key_done && state.situation == ValureReconstructSituation::Continue {
state.situation = ValureReconstructSituation::Complete;
self.total_keys_done += 1;
self.keys_done.add_key(*key);
}
key_done
} else {
true
}
}
pub fn get_cached_lsn(&self, key: &Key) -> Option<Lsn> {
self.keys
.get(key)
.and_then(|k| k.as_ref().ok())
.and_then(|state| state.cached_lsn)
}
pub fn consume_done_keys(&mut self) -> KeySpace {
self.keys_done.consume_keyspace()
}
}
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub(crate) enum ReadableLayerDesc {
Persistent {
desc: PersistentLayerDesc,
lsn_floor: Lsn,
},
InMemory(InMemoryLayerHandle),
}
#[derive(Debug)]
struct ReadableLayerDescOrdered(ReadableLayerDesc);
#[derive(Debug)]
pub struct LayerFringe {
layers_by_lsn: BinaryHeap<Reverse<ReadableLayerDescOrdered>>,
layers: HashMap<ReadableLayerDesc, KeySpace>,
}
impl LayerFringe {
pub fn new() -> Self {
LayerFringe {
layers_by_lsn: BinaryHeap::new(),
layers: HashMap::new(),
}
}
pub fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
let handle = match self.layers_by_lsn.pop() {
Some(h) => h,
None => return None,
};
let removed = self.layers.remove_entry(&handle.0 .0);
match removed {
Some((layer, keyspace)) => Some((layer, keyspace)),
None => panic!(),
}
}
pub fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
let entry = self.layers.entry(layer.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().merge(&keyspace);
},
Entry::Vacant(entry) => {
self.layers_by_lsn
.push(Reverse(ReadableLayerDescOrdered(entry.key().clone())));
entry.insert(keyspace);
}
}
}
}
impl Ord for ReadableLayerDescOrdered {
fn cmp(&self, other: &Self) -> Ordering {
self.0.get_lsn_floor().cmp(&other.0.get_lsn_floor())
}
}
impl PartialOrd for ReadableLayerDescOrdered {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for ReadableLayerDescOrdered {
fn eq(&self, other: &Self) -> bool {
self.0.get_lsn_floor() == other.0.get_lsn_floor()
}
}
impl Eq for ReadableLayerDescOrdered {}
impl ReadableLayerDesc {
pub(super) fn get_lsn_floor(&self) -> Lsn {
match self {
ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor,
ReadableLayerDesc::InMemory(l) => l.get_lsn_floor(),
}
}
pub async fn get_values_reconstruct_data(
&self,
layer_manager: &LayerManager,
keyspace: KeySpace,
end_lsn: Lsn,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
match self {
ReadableLayerDesc::Persistent { desc, .. } => {
let layer = layer_manager.get_from_desc(desc);
layer
.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_state, ctx)
.await
}
ReadableLayerDesc::InMemory(desc) => {
let layer = layer_manager.layer_map().get_in_memory_layer(desc).unwrap();
layer
.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_state, ctx)
.await
}
}
}
}
/// Return value from [`Layer::get_value_reconstruct_data`]
#[derive(Clone, Copy, Debug)]
pub enum ValueReconstructResult {

View File

@@ -35,16 +35,19 @@ use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::Timeline;
use crate::virtual_file::{self, VirtualFile};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::virtual_file::VirtualFile;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use anyhow::{anyhow, bail, ensure, Context, Result};
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, VecDeque};
use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
@@ -59,7 +62,9 @@ use utils::{
lsn::Lsn,
};
use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
use super::{
AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
};
///
/// Header stored in the beginning of the file
@@ -649,7 +654,7 @@ impl DeltaLayer {
{
let file = VirtualFile::open_with_options(
path,
virtual_file::OpenOptions::new().read(true).write(true),
&*std::fs::OpenOptions::new().read(true).write(true),
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
@@ -773,6 +778,7 @@ impl DeltaLayerInner {
let cursor = file.block_cursor();
let mut buf = Vec::new();
for (entry_lsn, pos) in offsets {
info!("Reading sequential from delta layer: key={} lsn={} block={}", key, entry_lsn, pos);
cursor
.read_blob_into_buf(pos, &mut buf, ctx)
.await
@@ -812,6 +818,121 @@ impl DeltaLayerInner {
}
}
pub(super) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
end_lsn: Lsn,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
// Scan the page versions backwards, starting from `lsn`.
let file = &self.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
file,
);
let mut offsets: BTreeMap<Key, VecDeque<(Lsn, u64)>> = BTreeMap::new();
for range in keyspace.ranges.iter() {
let mut ignore_key = None;
let last_key = Key::from_i128(range.end.to_i128() - 1);
let last_key = DeltaKey::from_key_lsn(&last_key, Lsn(end_lsn.0 - 1));
tree_reader
.visit(
&last_key.0,
VisitDirection::Backwards,
|raw_key, value| {
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key);
if key < range.start {
return false;
}
if Some(key) == ignore_key {
return true;
}
if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) {
if entry_lsn <= cached_lsn {
return key != range.start;
}
}
let blob_ref = BlobRef(value);
let lsns_at = offsets
.entry(key)
.or_default();
lsns_at.push_front((entry_lsn, blob_ref.pos()));
if blob_ref.will_init() {
if key == range.start {
return false;
} else {
ignore_key = Some(key);
return true;
}
}
true
},
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
.build(),
)
.await
.map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
}
let ctx = &RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerValue)
.build();
// TODO: coalesce reads
let cursor = file.block_cursor();
let mut buf = Vec::new();
for (key, lsns_at) in offsets {
for (lsn, block_offset) in lsns_at {
info!("Reading vectored from delta layer: key={} lsn={} block={}", key, lsn, block_offset);
let res = cursor
.read_blob_into_buf(block_offset, &mut buf, ctx)
.await;
if let Err(e) = res {
reconstruct_state.on_key_error(
key,
PageReconstructError::from(anyhow!(e).context(format!(
"Failed to read blob from virtual file {}",
file.file.path
))),
);
break;
}
let value = Value::des(&buf);
if let Err(e) = value {
reconstruct_state.on_key_error(
key,
PageReconstructError::from(anyhow!(e).context(format!(
"Failed to deserialize file blob from virtual file {}",
file.file.path
))),
);
break;
}
reconstruct_state.update_key(&key, lsn, value.unwrap());
}
}
Ok(())
}
pub(super) async fn load_keys<'a>(
&'a self,
ctx: &RequestContext,

View File

@@ -26,20 +26,22 @@
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, KEY_SIZE};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::Timeline;
use crate::virtual_file::{self, VirtualFile};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::virtual_file::VirtualFile;
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{bail, ensure, Context, Result};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use pageserver_api::keyspace::{key_range_size, KeySpace};
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
@@ -59,7 +61,7 @@ use utils::{
};
use super::filename::ImageFileName;
use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState};
///
/// Header stored in the beginning of the file
@@ -327,7 +329,7 @@ impl ImageLayer {
{
let file = VirtualFile::open_with_options(
path,
virtual_file::OpenOptions::new().read(true).write(true),
&*std::fs::OpenOptions::new().read(true).write(true),
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
@@ -444,6 +446,108 @@ impl ImageLayerInner {
Ok(ValueReconstructResult::Missing)
}
}
pub(super) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
// Same deal as for get_values_reconstruct_data in delta_layer.rs.
// It's simple here since we only have keys and no LSNs.
// For each range in the keyspace, visit the btree starting from the start key.
// Move forward while collecting block offsets until we hit the end key.
//
// Once done, issue the reads.
let file = &self.file;
let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
let mut blocks_for_ranges = Vec::new();
for range in keyspace.ranges.iter() {
let mut raw_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
range.start.write_to_byte_slice(&mut raw_key);
let maybe_start_offset = tree_reader
.get(
&raw_key,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
.build(),
)
.await
.map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
match maybe_start_offset {
Some(start_offset) => {
blocks_for_ranges.push((range, start_offset));
}
None => {
return Err(GetVectoredError::MissingKey(range.start));
}
}
// Assert on the fact that the on disk key range has no gaps
// in the interval specified by 'range'.
let range_size = key_range_size(range);
let last_key = range.start.add(range_size - 1);
last_key.write_to_byte_slice(&mut raw_key);
let maybe_end_offset = tree_reader
.get(
&raw_key,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
.build(),
)
.await
.map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
assert!(maybe_end_offset.is_some());
assert_eq!(
maybe_end_offset.unwrap() - maybe_start_offset.unwrap(),
(range_size - 1) as u64
);
}
let ctx = &RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerValue)
.build();
// TODO: coalesce reads
let cursor = file.block_cursor();
let mut buf = Vec::new();
for (range, start_offset) in blocks_for_ranges {
let mut range_offset = 0;
let mut key = range.start;
while key < range.end {
let res = cursor
.read_blob_into_buf(start_offset + range_offset, &mut buf, ctx)
.await;
if let Err(e) = res {
reconstruct_state.on_key_error(
key,
PageReconstructError::from(anyhow!(e).context(format!(
"Failed to read blob from virtual file {}",
file.file.path
))),
);
buf.clear();
break;
}
let blob = Bytes::copy_from_slice(buf.as_slice());
reconstruct_state.update_key(&key, self.lsn, Value::Image(blob));
buf.clear();
range_offset += 1;
key = key.next();
}
}
Ok(())
}
}
/// A builder object for constructing a new image layer.
@@ -492,15 +596,11 @@ impl ImageLayerWriterInner {
},
);
info!("new image layer {path}");
let mut file = {
VirtualFile::open_with_options(
&path,
virtual_file::OpenOptions::new()
.write(true)
.create_new(true),
)
.await?
};
let mut file = VirtualFile::open_with_options(
&path,
std::fs::OpenOptions::new().write(true).create_new(true),
)
.await?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);

View File

@@ -9,13 +9,16 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::repository::{Key, Value};
use crate::tenant::block_io::BlockReader;
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
use crate::tenant::Timeline;
use crate::tenant::storage_layer::ValueReconstructResult;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::walrecord;
use anyhow::{ensure, Result};
use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::HashMap;
use std::cmp::{Ordering, Reverse};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::sync::{Arc, OnceLock};
use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
@@ -25,7 +28,7 @@ use std::fmt::Write as _;
use std::ops::Range;
use tokio::sync::{RwLock, RwLockWriteGuard};
use super::{DeltaLayerWriter, ResidentLayer};
use super::{DeltaLayerWriter, ResidentLayer, ValueReconstructState, ValuesReconstructState};
pub struct InMemoryLayer {
conf: &'static PageServerConf,
@@ -59,6 +62,7 @@ pub struct InMemoryLayerInner {
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The value is an offset into the
/// ephemeral file where the page version is stored.
// IDEA: would be great if this was ordered
index: HashMap<Key, VecMap<Lsn, u64>>,
/// The values are stored in a serialized format in this file.
@@ -67,6 +71,13 @@ pub struct InMemoryLayerInner {
file: EphemeralFile,
}
#[derive(Eq, PartialEq, Ord, PartialOrd)]
struct BlockRead {
key: Key,
lsn: Lsn,
block_offset: u64,
}
impl std::fmt::Debug for InMemoryLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("InMemoryLayerInner").finish()
@@ -202,6 +213,85 @@ impl InMemoryLayer {
Ok(ValueReconstructResult::Complete)
}
}
pub(crate) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
end_lsn: Lsn,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
let inner = self.inner.read().await;
let reader = inner.file.block_cursor();
let mut completed_keys = HashSet::new();
let mut min_heap = BinaryHeap::new();
for range in keyspace.ranges.iter() {
let mut key = range.start;
while key < range.end {
if let Some(vec_map) = inner.index.get(&key) {
let range = match reconstruct_state.get_cached_lsn(&key) {
Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
None => self.start_lsn..end_lsn,
};
let slice = vec_map.slice_range(range);
for (entry_lsn, pos) in slice.iter().rev() {
min_heap.push(Reverse(BlockRead {
key,
lsn: *entry_lsn,
block_offset: *pos,
}));
}
}
key = key.next();
}
}
let keyspace_size = keyspace.total_size();
// IDEA: coalesce reads as long as they have sequential block numbers
while completed_keys.len() < keyspace_size && !min_heap.is_empty() {
let block_read = min_heap.pop().unwrap();
if completed_keys.contains(&block_read.0.key) {
continue;
}
let buf = reader.read_blob(block_read.0.block_offset, &ctx).await;
if let Err(e) = buf {
reconstruct_state
.on_key_error(block_read.0.key, PageReconstructError::from(anyhow!(e)));
completed_keys.insert(block_read.0.key);
continue;
}
let value = Value::des(&buf.unwrap());
if let Err(e) = value {
reconstruct_state
.on_key_error(block_read.0.key, PageReconstructError::from(anyhow!(e)));
completed_keys.insert(block_read.0.key);
continue;
}
let key_done = reconstruct_state.update_key(
dbg!(&block_read.0.key),
block_read.0.lsn,
value.unwrap(),
);
if key_done {
completed_keys.insert(block_read.0.key);
}
}
Ok(())
}
}
impl std::fmt::Display for InMemoryLayer {

View File

@@ -1,5 +1,6 @@
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
};
@@ -15,10 +16,11 @@ use utils::sync::heavier_once_cell;
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::repository::Key;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
use super::delta_layer::{self, DeltaEntry};
use super::image_layer;
use super::{image_layer, ValuesReconstructState};
use super::{
AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
ValueReconstructResult, ValueReconstructState,
@@ -261,6 +263,40 @@ impl Layer {
.with_context(|| format!("get_value_reconstruct_data for layer {self}"))
}
pub(crate) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
end_lsn: Lsn,
reconstruct_data: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
// use anyhow::ensure;
let layer = self
.0
.get_or_maybe_download(true, Some(ctx))
.await
.map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
self.0
.access_stats
.record_access(LayerAccessKind::GetValueReconstructData, ctx);
// if self.layer_desc().is_delta {
// ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
// ensure!(self.layer_desc().key_range.contains(&key));
// } else {
// ensure!(self.layer_desc().key_range.contains(&key));
// ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
// ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
// }
layer
.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, &self.0, ctx)
.instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
.await
}
/// Download the layer if evicted.
///
/// Will not error when the layer is already downloaded.
@@ -1174,7 +1210,7 @@ pub(crate) enum EvictionError {
/// Error internal to the [`LayerInner::get_or_maybe_download`]
#[derive(Debug, thiserror::Error)]
enum DownloadError {
pub(crate) enum DownloadError {
#[error("timeline has already shutdown")]
TimelineShutdown,
#[error("no remote storage configured")]
@@ -1334,6 +1370,28 @@ impl DownloadedLayer {
}
}
async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
end_lsn: Lsn,
reconstruct_data: &mut ValuesReconstructState,
owner: &Arc<LayerInner>,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
use LayerKind::*;
match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
Delta(d) => {
d.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, ctx)
.await
}
Image(i) => {
i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx)
.await
}
}
}
async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
use LayerKind::*;
match self.get(owner, ctx).await? {

View File

@@ -15,7 +15,7 @@ use utils::id::TenantId;
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
/// a unified way to generate layer information like file name.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
pub struct PersistentLayerDesc {
pub tenant_shard_id: TenantShardId,
pub timeline_id: TimelineId,

View File

@@ -9,7 +9,6 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::timeline::CompactionError;
use crate::tenant::{Tenant, TenantState};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -182,11 +181,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
log_compaction_error(
&e,
error_run_count,
&wait_duration,
cancel.is_cancelled(),
error!(
"Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
wait_duration
} else {
@@ -214,58 +210,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}
fn log_compaction_error(
e: &CompactionError,
error_run_count: u32,
sleep_duration: &std::time::Duration,
task_cancelled: bool,
) {
use crate::tenant::upload_queue::NotInitialized;
use crate::tenant::PageReconstructError;
use CompactionError::*;
enum LooksLike {
Info,
Error,
}
let decision = match e {
ShuttingDown => None,
_ if task_cancelled => Some(LooksLike::Info),
Other(e) => {
let root_cause = e.root_cause();
let is_stopping = {
let upload_queue = root_cause
.downcast_ref::<NotInitialized>()
.is_some_and(|e| e.is_stopping());
let timeline = root_cause
.downcast_ref::<PageReconstructError>()
.is_some_and(|e| e.is_stopping());
upload_queue || timeline
};
if is_stopping {
Some(LooksLike::Info)
} else {
Some(LooksLike::Error)
}
}
};
match decision {
Some(LooksLike::Info) => info!(
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
),
Some(LooksLike::Error) => error!(
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
),
None => {}
}
}
///
/// GC task's main loop
///

View File

@@ -60,7 +60,7 @@ use crate::{
tenant::storage_layer::{
AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
ValueReconstructState,
ValueReconstructState, ValuesReconstructState,
},
};
use crate::{
@@ -104,11 +104,14 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart};
use super::remote_timeline_client::RemoteTimelineClient;
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{
remote_timeline_client::index::{IndexLayerMetadata, IndexPart},
storage_layer::LayerFringe,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum FlushLoopState {
@@ -159,6 +162,13 @@ pub struct TimelineResources {
pub deletion_queue_client: DeletionQueueClient,
}
#[derive(Debug, Default, Copy, Clone)]
pub(crate) enum GetVectoredImpl {
Sequential,
#[default]
Vectored,
}
pub struct Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
@@ -346,6 +356,8 @@ pub struct Timeline {
///
/// Timeline deletion will acquire both compaction and gc locks in whatever order.
gc_lock: tokio::sync::Mutex<()>,
get_vectored_impl: RwLock<GetVectoredImpl>,
}
pub struct WalReceiverInfo {
@@ -392,7 +404,8 @@ pub(crate) enum PageReconstructError {
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
#[error("timeline shutting down")]
/// The operation was cancelled
#[error("Cancelled")]
Cancelled,
/// The ancestor of this is being stopped
@@ -404,19 +417,6 @@ pub(crate) enum PageReconstructError {
WalRedo(anyhow::Error),
}
impl PageReconstructError {
/// Returns true if this error indicates a tenant/timeline shutdown alike situation
pub(crate) fn is_stopping(&self) -> bool {
use PageReconstructError::*;
match self {
Other(_) => false,
AncestorLsnTimeout(_) => false,
Cancelled | AncestorStopping(_) => true,
WalRedo(_) => false,
}
}
}
#[derive(thiserror::Error, Debug)]
enum CreateImageLayersError {
#[error("timeline shutting down")]
@@ -455,6 +455,30 @@ pub(crate) enum GetVectoredError {
#[error("Requested at invalid LSN: {0}")]
InvalidLsn(Lsn),
#[error("Requested key {0} not found")]
MissingKey(Key),
#[error(transparent)]
GetReadyAncestorError(GetReadyAncestorError),
#[error(transparent)]
Other(#[from] anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetReadyAncestorError {
#[error("ancestor timeline {0} is being stopped")]
AncestorStopping(TimelineId),
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
#[error("Cancelled")]
Cancelled,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
#[derive(Clone, Copy)]
@@ -535,6 +559,21 @@ impl From<GetVectoredError> for CreateImageLayersError {
}
}
impl From<GetReadyAncestorError> for PageReconstructError {
fn from(e: GetReadyAncestorError) -> Self {
match e {
GetReadyAncestorError::AncestorStopping(tid) => {
PageReconstructError::AncestorStopping(tid)
}
GetReadyAncestorError::AncestorLsnTimeout(wait_err) => {
PageReconstructError::AncestorLsnTimeout(wait_err)
}
GetReadyAncestorError::Cancelled => PageReconstructError::Cancelled,
GetReadyAncestorError::Other(other) => PageReconstructError::Other(other),
}
}
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
@@ -624,6 +663,7 @@ impl Timeline {
.await?;
timer.stop_and_record();
// info!("Sequential reconstruct state for {key} {reconstruct_state:?}");
let start = Instant::now();
let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
let elapsed = start.elapsed();
@@ -631,29 +671,27 @@ impl Timeline {
.for_result(&res)
.observe(elapsed.as_secs_f64());
if cfg!(feature = "testing") && res.is_err() {
// it can only be walredo issue
use std::fmt::Write;
use std::fmt::Write;
let mut msg = String::new();
let mut msg = String::new();
path.into_iter().for_each(|(res, cont_lsn, layer)| {
writeln!(
msg,
"- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
layer(),
)
.expect("string grows")
});
path.into_iter().for_each(|(res, cont_lsn, layer)| {
writeln!(
msg,
"- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
layer(),
)
.expect("string grows")
});
// this is to rule out or provide evidence that we could in some cases read a duplicate
// walrecord
tracing::info!("walredo failed, path:\n{msg}");
}
info!("Path taken: {msg}");
res
}
pub(crate) fn set_get_vectored_impl(&self, impl_type: GetVectoredImpl) {
*self.get_vectored_impl.write().unwrap() = impl_type;
}
pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
/// Look up multiple page versions at a given LSN
@@ -678,12 +716,66 @@ impl Timeline {
return Err(GetVectoredError::Oversized(key_count));
}
let _timer = crate::metrics::GET_VECTORED_LATENCY
.for_task_kind(ctx.task_kind())
.map(|t| t.start_timer());
for range in key_ranges {
let mut key = range.start;
while key != range.end {
assert!(!self.shard_identity.is_key_disposable(&key));
key = key.next();
}
}
// TODO: update api above to take KeySpace
let keyspace = KeySpace {
ranges: Vec::from(key_ranges),
};
let which = *self.get_vectored_impl.read().unwrap();
match which {
GetVectoredImpl::Sequential => {
self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
}
GetVectoredImpl::Vectored => {
let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await?;
let sequential_res = self
.get_vectored_sequential_impl(keyspace, lsn, ctx)
.await
.unwrap();
vectored_res.iter().zip(sequential_res.iter()).for_each(
|((seq_key, seq_res), (vec_key, vec_res))| {
assert_eq!(seq_key, vec_key, "{} != {}", seq_key, vec_key);
match (seq_res, vec_res) {
(Ok(seq_blob), Ok(vec_blob)) => {
assert_eq!(seq_blob, vec_blob, "Wrong image for key {}", seq_key)
}
_ => {
error!(
"Unexpected error for key={}: seq_res={:?} vec_res={:?}",
seq_key,
seq_res.as_ref().map(|_| "Ok(...)"),
vec_res.as_ref().map(|_| "Ok(...)")
);
panic!()
}
}
},
);
Ok(vectored_res)
}
}
}
async fn get_vectored_sequential_impl(
&self,
keyspace: KeySpace,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
trace!("get_vectored_sequential_impl range={keyspace:?}");
let mut values = BTreeMap::new();
for range in key_ranges {
for range in keyspace.ranges {
let mut key = range.start;
while key != range.end {
assert!(!self.shard_identity.is_key_disposable(&key));
@@ -705,6 +797,46 @@ impl Timeline {
Ok(values)
}
async fn get_vectored_impl(
&self,
keyspace: KeySpace,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
trace!("get_vectored_impl range={keyspace:?}");
for range in &keyspace.ranges {
let mut key = range.start;
while key != range.end {
assert!(!self.shard_identity.is_key_disposable(&key));
key = key.next();
}
}
let mut reconstruct_state = ValuesReconstructState::new();
self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
.await?;
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
for (key, res) in reconstruct_state.keys {
match res {
Err(err) => {
results.insert(key, Err(err));
}
Ok(state) => {
let state = ValueReconstructState::from(state);
// info!("Vectored reconstruct state for {key} {state:?}");
let reconstruct_res = self.reconstruct_value(key, lsn, state).await;
results.insert(key, reconstruct_res);
}
}
}
Ok(results)
}
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
pub fn get_last_record_lsn(&self) -> Lsn {
self.last_record_lsn.load().last
@@ -1499,6 +1631,7 @@ impl Timeline {
compaction_lock: tokio::sync::Mutex::default(),
gc_lock: tokio::sync::Mutex::default(),
get_vectored_impl: RwLock::new(GetVectoredImpl::default()),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2345,7 +2478,14 @@ impl Timeline {
}
// The function should have updated 'state'
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
info!(
"CALLED for {} at {}: {:?} with {} records, cached {}",
key,
cont_lsn,
result,
reconstruct_state.records.len(),
cached_lsn
);
match result {
ValueReconstructResult::Complete => return Ok(traversal_path),
ValueReconstructResult::Continue => {
@@ -2392,60 +2532,8 @@ impl Timeline {
timeline.ancestor_lsn,
cont_lsn
);
let ancestor = match timeline.get_ancestor_timeline() {
Ok(timeline) => timeline,
Err(e) => return Err(PageReconstructError::from(e)),
};
// It's possible that the ancestor timeline isn't active yet, or
// is active but hasn't yet caught up to the branch point. Wait
// for it.
//
// This cannot happen while the pageserver is running normally,
// because you cannot create a branch from a point that isn't
// present in the pageserver yet. However, we don't wait for the
// branch point to be uploaded to cloud storage before creating
// a branch. I.e., the branch LSN need not be remote consistent
// for the branching operation to succeed.
//
// Hence, if we try to load a tenant in such a state where
// 1. the existence of the branch was persisted (in IndexPart and/or locally)
// 2. but the ancestor state is behind branch_lsn because it was not yet persisted
// then we will need to wait for the ancestor timeline to
// re-stream WAL up to branch_lsn before we access it.
//
// How can a tenant get in such a state?
// - ungraceful pageserver process exit
// - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
//
// NB: this could be avoided by requiring
// branch_lsn >= remote_consistent_lsn
// during branch creation.
match ancestor.wait_to_become_active(ctx).await {
Ok(()) => {}
Err(TimelineState::Stopping) => {
return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
}
Err(state) => {
return Err(PageReconstructError::Other(anyhow::anyhow!(
"Timeline {} will not become active. Current state: {:?}",
ancestor.timeline_id,
&state,
)));
}
}
ancestor
.wait_lsn(timeline.ancestor_lsn, ctx)
.await
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => PageReconstructError::Cancelled,
e @ WaitLsnError::BadState => {
PageReconstructError::Other(anyhow::anyhow!(e))
}
})?;
timeline_owned = ancestor;
timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
timeline = &*timeline_owned;
prev_lsn = Lsn(u64::MAX);
continue 'outer;
@@ -2459,7 +2547,7 @@ impl Timeline {
if let Some(open_layer) = &layers.open_layer {
let start_lsn = open_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
// info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, start_lsn);
@@ -2492,6 +2580,7 @@ impl Timeline {
let start_lsn = frozen_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
//how
let lsn_floor = max(cached_lsn + 1, start_lsn);
result = match frozen_layer
.get_value_reconstruct_data(
@@ -2555,6 +2644,135 @@ impl Timeline {
}
}
async fn get_vectored_reconstruct_data(
&self,
keyspace: KeySpace,
request_lsn: Lsn,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
// Start from the current timeline.
let mut timeline_owned: Arc<Timeline>;
let mut timeline = self;
let mut unmapped_keyspace = keyspace.clone();
let mut cont_lsn = Lsn(request_lsn.0 + 1);
let mut fringe = LayerFringe::new();
let initial_keyspace_size = keyspace.total_size();
while reconstruct_state.total_keys_done < initial_keyspace_size {
if self.cancel.is_cancelled() {
return Err(GetVectoredError::Cancelled);
}
// 1. Check if all keys are done and return if true
// 2. Split key ranges up to account for gaps left by finished keys
// 3. Switch timelines if needed
// 4. Find the next layer for each unmapped range. This can be open, frozen or disk
// 5. For the latest layer, inspect it and put all the required data in the bag - pop
// the layer from the heap.
// 6. Mark the ranges that previously mapped to this layer as unmapped
// 7. Go back to 1
let keys_done_last_step = reconstruct_state.consume_done_keys();
trace!("keys_done_last_step={keys_done_last_step:?}");
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
cont_lsn
);
timeline_owned = timeline
.get_ready_ancestor_timeline(ctx)
.await
.map_err(GetVectoredError::GetReadyAncestorError)?;
timeline = &*timeline_owned;
}
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
let in_memory_layer = layers.find_in_memory_layer(|l| {
let start_lsn = l.get_lsn_range().start;
cont_lsn > start_lsn
});
match in_memory_layer {
Some(l) => {
fringe.update(ReadableLayerDesc::InMemory(l), unmapped_keyspace.clone());
}
None => {
for range in unmapped_keyspace.ranges.iter() {
let results = layers
.range_search(range.clone(), cont_lsn)
.ok_or_else(|| GetVectoredError::InvalidLsn(cont_lsn))?;
trace!(
"Range search for {:?} at {} returned {:?}",
range,
cont_lsn,
results
);
results
.found
.into_iter()
.map(|(res, accum)| {
(
ReadableLayerDesc::Persistent {
desc: (*res.0.layer).clone(),
lsn_floor: res.0.lsn_floor,
},
accum.to_keyspace(),
)
})
.for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
let not_found = &results.not_found.to_keyspace();
if let Some(first_missing_key) = not_found.start() {
return Err(GetVectoredError::MissingKey(first_missing_key));
}
}
}
}
trace!("fringe={fringe:?}");
if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
trace!(
"Handling layer {:?} for keyspace {:?}",
layer_to_read,
keyspace_to_read
);
layer_to_read
.get_values_reconstruct_data(
&guard,
keyspace_to_read.clone(),
cont_lsn,
reconstruct_state,
ctx,
)
.await?;
unmapped_keyspace = keyspace_to_read;
cont_lsn = layer_to_read.get_lsn_floor();
trace!(
"Handled layer {:?}. cont_lsn={}, unmapped_keyspace={:?}",
layer_to_read,
cont_lsn,
unmapped_keyspace
);
}
}
Ok(())
}
/// # Cancel-safety
///
/// This method is cancellation-safe.
@@ -2575,6 +2793,66 @@ impl Timeline {
Some((lsn, img))
}
async fn get_ready_ancestor_timeline(
&self,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetReadyAncestorError> {
let ancestor = match self.get_ancestor_timeline() {
Ok(timeline) => timeline,
Err(e) => return Err(GetReadyAncestorError::from(e)),
};
// It's possible that the ancestor timeline isn't active yet, or
// is active but hasn't yet caught up to the branch point. Wait
// for it.
//
// This cannot happen while the pageserver is running normally,
// because you cannot create a branch from a point that isn't
// present in the pageserver yet. However, we don't wait for the
// branch point to be uploaded to cloud storage before creating
// a branch. I.e., the branch LSN need not be remote consistent
// for the branching operation to succeed.
//
// Hence, if we try to load a tenant in such a state where
// 1. the existence of the branch was persisted (in IndexPart and/or locally)
// 2. but the ancestor state is behind branch_lsn because it was not yet persisted
// then we will need to wait for the ancestor timeline to
// re-stream WAL up to branch_lsn before we access it.
//
// How can a tenant get in such a state?
// - ungraceful pageserver process exit
// - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
//
// NB: this could be avoided by requiring
// branch_lsn >= remote_consistent_lsn
// during branch creation.
match ancestor.wait_to_become_active(ctx).await {
Ok(()) => {}
Err(TimelineState::Stopping) => {
return Err(GetReadyAncestorError::AncestorStopping(
ancestor.timeline_id,
));
}
Err(state) => {
return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
"Timeline {} will not become active. Current state: {:?}",
ancestor.timeline_id,
&state,
)));
}
}
ancestor
.wait_lsn(self.ancestor_lsn, ctx)
.await
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
})?;
Ok(ancestor)
}
fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
format!(
@@ -4404,7 +4682,7 @@ impl Timeline {
.walredo_mgr
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
.await
.context("reconstruct a page image")
.context("Failed to reconstruct a page image:")
{
Ok(img) => img,
Err(e) => return Err(PageReconstructError::WalRedo(e)),

View File

@@ -126,27 +126,6 @@ pub(super) struct UploadQueueStopped {
pub(super) deleted_at: SetDeletedFlagProgress,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum NotInitialized {
#[error("queue is in state Uninitialized")]
Uninitialized,
#[error("queue is in state Stopping")]
Stopped,
#[error("queue is shutting down")]
ShuttingDown,
}
impl NotInitialized {
pub(crate) fn is_stopping(&self) -> bool {
use NotInitialized::*;
match self {
Uninitialized => false,
Stopped => true,
ShuttingDown => true,
}
}
}
impl UploadQueue {
pub(crate) fn initialize_empty_remote(
&mut self,
@@ -235,17 +214,17 @@ impl UploadQueue {
}
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
use UploadQueue::*;
match self {
Uninitialized => Err(NotInitialized::Uninitialized.into()),
Initialized(x) => {
if x.shutting_down {
Err(NotInitialized::ShuttingDown.into())
} else {
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
anyhow::bail!("queue is in state {}", self.as_str())
}
UploadQueue::Initialized(x) => {
if !x.shutting_down {
Ok(x)
} else {
anyhow::bail!("queue is shutting down")
}
}
Stopped(_) => Err(NotInitialized::Stopped.into()),
}
}

View File

@@ -11,28 +11,18 @@
//! src/backend/storage/file/fd.c
//!
use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
use crate::page_cache::PageWriteGuard;
use crate::tenant::TENANTS_SEGMENT_NAME;
use camino::{Utf8Path, Utf8PathBuf};
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use std::fs::{self, File};
use std::fs::{self, File, OpenOptions};
use std::io::{Error, ErrorKind, Seek, SeekFrom};
use tokio_epoll_uring::IoBufMut;
use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
use std::os::unix::fs::FileExt;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tokio::time::Instant;
use utils::fs_ext;
mod io_engine;
mod open_options;
pub use io_engine::IoEngineKind;
pub(crate) use open_options::*;
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
/// the underlying file is closed if the system is low on file descriptors,
@@ -116,38 +106,7 @@ struct SlotInner {
tag: u64,
/// the underlying file
file: Option<OwnedFd>,
}
/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`].
struct PageWriteGuardBuf {
page: PageWriteGuard<'static>,
init_up_to: usize,
}
// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot,
// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved.
unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf {
fn stable_ptr(&self) -> *const u8 {
self.page.as_ptr()
}
fn bytes_init(&self) -> usize {
self.init_up_to
}
fn bytes_total(&self) -> usize {
self.page.len()
}
}
// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access,
// hence it's safe to hand out the `stable_mut_ptr()`.
unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf {
fn stable_mut_ptr(&mut self) -> *mut u8 {
self.page.as_mut_ptr()
}
unsafe fn set_init(&mut self, pos: usize) {
assert!(pos <= self.page.len());
self.init_up_to = pos;
}
file: Option<File>,
}
impl OpenFiles {
@@ -315,10 +274,6 @@ macro_rules! with_file {
let $ident = $this.lock_file().await?;
observe_duration!($op, $($body)*)
}};
($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{
let mut $ident = $this.lock_file().await?;
observe_duration!($op, $($body)*)
}};
}
impl VirtualFile {
@@ -371,9 +326,7 @@ impl VirtualFile {
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
// where our caller doesn't get to use the returned VirtualFile before its
// slot gets re-used by someone else.
let file = observe_duration!(StorageIoOperation::Open, {
open_options.open(path.as_std_path()).await?
});
let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
// Strip all options other than read and write.
//
@@ -403,12 +356,7 @@ impl VirtualFile {
Ok(vfile)
}
/// Writes a file to the specified `final_path` in a crash safe fasion
///
/// The file is first written to the specified tmp_path, and in a second
/// step, the tmp path is renamed to the final path. As renames are
/// atomic, a crash during the write operation will never leave behind a
/// partially written file.
/// Async & [`VirtualFile`]-enabled version of [`::utils::crashsafe::overwrite`].
pub async fn crashsafe_overwrite(
final_path: &Utf8Path,
tmp_path: &Utf8Path,
@@ -447,13 +395,15 @@ impl VirtualFile {
/// Call File::sync_all() on the underlying File.
pub async fn sync_all(&self) -> Result<(), Error> {
with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard
.with_std_file(|std_file| std_file.sync_all()))
with_file!(self, StorageIoOperation::Fsync, |file| file
.as_ref()
.sync_all())
}
pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard
.with_std_file(|std_file| std_file.metadata()))
with_file!(self, StorageIoOperation::Metadata, |file| file
.as_ref()
.metadata())
}
/// Helper function internal to `VirtualFile` that looks up the underlying File,
@@ -462,7 +412,7 @@ impl VirtualFile {
///
/// We are doing it via a macro as Rust doesn't support async closures that
/// take on parameters with lifetimes.
async fn lock_file(&self) -> Result<FileGuard, Error> {
async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
let open_files = get_open_files();
let mut handle_guard = {
@@ -508,9 +458,10 @@ impl VirtualFile {
// NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this
// case from StorageIoOperation::Open. This helps with identifying thrashing
// of the virtual file descriptor cache.
let file = observe_duration!(StorageIoOperation::OpenAfterReplace, {
self.open_options.open(self.path.as_std_path()).await?
});
let file = observe_duration!(
StorageIoOperation::OpenAfterReplace,
self.open_options.open(&self.path)
)?;
// Store the File in the slot and update the handle in the VirtualFile
// to point to it.
@@ -535,8 +486,9 @@ impl VirtualFile {
self.pos = offset;
}
SeekFrom::End(offset) => {
self.pos = with_file!(self, StorageIoOperation::Seek, |mut file_guard| file_guard
.with_std_file_mut(|std_file| std_file.seek(SeekFrom::End(offset))))?
self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
.as_ref()
.seek(SeekFrom::End(offset)))?
}
SeekFrom::Current(offset) => {
let pos = self.pos as i128 + offset as i128;
@@ -555,28 +507,25 @@ impl VirtualFile {
Ok(self.pos)
}
pub async fn read_exact_at<B>(&self, buf: B, offset: u64) -> Result<B, Error>
where
B: IoBufMut + Send,
{
let (buf, res) =
read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await;
res.map(|()| buf)
}
/// Like [`Self::read_exact_at`] but for [`PageWriteGuard`].
pub async fn read_exact_at_page(
&self,
page: PageWriteGuard<'static>,
offset: u64,
) -> Result<PageWriteGuard<'static>, Error> {
let buf = PageWriteGuardBuf {
page,
init_up_to: 0,
};
let res = self.read_exact_at(buf, offset).await;
res.map(|PageWriteGuardBuf { page, .. }| page)
.map_err(|e| Error::new(ErrorKind::Other, e))
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> {
while !buf.is_empty() {
match self.read_at(buf, offset).await {
Ok(0) => {
return Err(Error::new(
std::io::ErrorKind::UnexpectedEof,
"failed to fill whole buffer",
))
}
Ok(n) => {
buf = &mut buf[n..];
offset += n as u64;
}
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
Err(e) => return Err(e),
}
}
Ok(())
}
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
@@ -626,35 +575,22 @@ impl VirtualFile {
Ok(n)
}
pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
where
B: tokio_epoll_uring::BoundedBufMut + Send,
{
let file_guard = match self.lock_file().await {
Ok(file_guard) => file_guard,
Err(e) => return (buf, Err(e)),
};
observe_duration!(StorageIoOperation::Read, {
let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
if let Ok(size) = res {
STORAGE_IO_SIZE
.with_label_values(&[
"read",
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
])
.add(size as i64);
}
(buf, res)
})
pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
let result = with_file!(self, StorageIoOperation::Read, |file| file
.as_ref()
.read_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id])
.add(size as i64);
}
result
}
async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
let result = with_file!(self, StorageIoOperation::Write, |file_guard| {
file_guard.with_std_file(|std_file| std_file.write_at(buf, offset))
});
let result = with_file!(self, StorageIoOperation::Write, |file| file
.as_ref()
.write_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
@@ -664,241 +600,18 @@ impl VirtualFile {
}
}
// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
pub async fn read_exact_at_impl<B, F, Fut>(
buf: B,
mut offset: u64,
mut read_at: F,
) -> (B, std::io::Result<()>)
where
B: IoBufMut + Send,
F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
{
use tokio_epoll_uring::BoundedBuf;
let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
while buf.bytes_total() != 0 {
let res;
(buf, res) = read_at(buf, offset).await;
match res {
Ok(0) => break,
Ok(n) => {
buf = buf.slice(n..);
offset += n as u64;
}
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
Err(e) => return (buf.into_inner(), Err(e)),
}
}
// NB: don't use `buf.is_empty()` here; it is from the
// `impl Deref for Slice { Target = [u8] }`; the the &[u8]
// returned by it only covers the initialized portion of `buf`.
// Whereas we're interested in ensuring that we filled the entire
// buffer that the user passed in.
if buf.bytes_total() != 0 {
(
buf.into_inner(),
Err(std::io::Error::new(
std::io::ErrorKind::UnexpectedEof,
"failed to fill whole buffer",
)),
)
} else {
assert_eq!(buf.len(), buf.bytes_total());
(buf.into_inner(), Ok(()))
}
struct FileGuard<'a> {
slot_guard: RwLockReadGuard<'a, SlotInner>,
}
#[cfg(test)]
mod test_read_exact_at_impl {
use std::{collections::VecDeque, sync::Arc};
use tokio_epoll_uring::{BoundedBuf, BoundedBufMut};
use super::read_exact_at_impl;
struct Expectation {
offset: u64,
bytes_total: usize,
result: std::io::Result<Vec<u8>>,
}
struct MockReadAt {
expectations: VecDeque<Expectation>,
}
impl MockReadAt {
async fn read_at(
&mut self,
mut buf: tokio_epoll_uring::Slice<Vec<u8>>,
offset: u64,
) -> (tokio_epoll_uring::Slice<Vec<u8>>, std::io::Result<usize>) {
let exp = self
.expectations
.pop_front()
.expect("read_at called but we have no expectations left");
assert_eq!(exp.offset, offset);
assert_eq!(exp.bytes_total, buf.bytes_total());
match exp.result {
Ok(bytes) => {
assert!(bytes.len() <= buf.bytes_total());
buf.put_slice(&bytes);
(buf, Ok(bytes.len()))
}
Err(e) => (buf, Err(e)),
}
}
}
impl Drop for MockReadAt {
fn drop(&mut self) {
assert_eq!(self.expectations.len(), 0);
}
}
#[tokio::test]
async fn test_basic() {
let buf = Vec::with_capacity(5);
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
expectations: VecDeque::from(vec![Expectation {
offset: 0,
bytes_total: 5,
result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
}]),
}));
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
.await;
assert!(res.is_ok());
assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
}
#[tokio::test]
async fn test_empty_buf_issues_no_syscall() {
let buf = Vec::new();
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
expectations: VecDeque::new(),
}));
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
.await;
assert!(res.is_ok());
}
#[tokio::test]
async fn test_two_read_at_calls_needed_until_buf_filled() {
let buf = Vec::with_capacity(4);
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
expectations: VecDeque::from(vec![
Expectation {
offset: 0,
bytes_total: 4,
result: Ok(vec![b'a', b'b']),
},
Expectation {
offset: 2,
bytes_total: 2,
result: Ok(vec![b'c', b'd']),
},
]),
}));
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
.await;
assert!(res.is_ok());
assert_eq!(buf, vec![b'a', b'b', b'c', b'd']);
}
#[tokio::test]
async fn test_eof_before_buffer_full() {
let buf = Vec::with_capacity(3);
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
expectations: VecDeque::from(vec![
Expectation {
offset: 0,
bytes_total: 3,
result: Ok(vec![b'a']),
},
Expectation {
offset: 1,
bytes_total: 2,
result: Ok(vec![b'b']),
},
Expectation {
offset: 2,
bytes_total: 1,
result: Ok(vec![]),
},
]),
}));
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
.await;
let Err(err) = res else {
panic!("should return an error");
};
assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof);
assert_eq!(format!("{err}"), "failed to fill whole buffer");
// buffer contents on error are unspecified
}
}
struct FileGuard {
slot_guard: RwLockReadGuard<'static, SlotInner>,
}
impl AsRef<OwnedFd> for FileGuard {
fn as_ref(&self) -> &OwnedFd {
impl<'a> AsRef<File> for FileGuard<'a> {
fn as_ref(&self) -> &File {
// This unwrap is safe because we only create `FileGuard`s
// if we know that the file is Some.
self.slot_guard.file.as_ref().unwrap()
}
}
impl FileGuard {
/// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
fn with_std_file<F, R>(&self, with: F) -> R
where
F: FnOnce(&File) -> R,
{
// SAFETY:
// - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
// - `&` usage below: `self` is `&`, hence Rust typesystem guarantees there are is no `&mut`
let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
let res = with(&file);
let _ = file.into_raw_fd();
res
}
/// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
fn with_std_file_mut<F, R>(&mut self, with: F) -> R
where
F: FnOnce(&mut File) -> R,
{
// SAFETY:
// - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
// - &mut usage below: `self` is `&mut`, hence this call is the only task/thread that has control over the underlying fd
let mut file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
let res = with(&mut file);
let _ = file.into_raw_fd();
res
}
}
impl tokio_epoll_uring::IoFd for FileGuard {
unsafe fn as_fd(&self) -> RawFd {
let owned_fd: &OwnedFd = self.as_ref();
owned_fd.as_raw_fd()
}
}
#[cfg(test)]
impl VirtualFile {
pub(crate) async fn read_blk(
@@ -906,19 +619,16 @@ impl VirtualFile {
blknum: u32,
) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
use crate::page_cache::PAGE_SZ;
let buf = vec![0; PAGE_SZ];
let buf = self
.read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64))
let mut buf = [0; PAGE_SZ];
self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64))
.await?;
Ok(crate::tenant::block_io::BlockLease::Vec(buf))
Ok(std::sync::Arc::new(buf).into())
}
async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
let mut tmp = vec![0; 128];
loop {
let res;
(tmp, res) = self.read_at(tmp, self.pos).await;
match res {
let mut tmp = [0; 128];
match self.read_at(&mut tmp, self.pos).await {
Ok(0) => return Ok(()),
Ok(n) => {
self.pos += n as u64;
@@ -994,12 +704,10 @@ impl OpenFiles {
/// Initialize the virtual file module. This must be called once at page
/// server startup.
///
#[cfg(not(test))]
pub fn init(num_slots: usize, engine: IoEngineKind) {
pub fn init(num_slots: usize) {
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
panic!("virtual_file::init called twice");
}
io_engine::init(engine);
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
}
@@ -1044,10 +752,10 @@ mod tests {
}
impl MaybeVirtualFile {
async fn read_exact_at(&self, mut buf: Vec<u8>, offset: u64) -> Result<Vec<u8>, Error> {
async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
match self {
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset),
}
}
async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
@@ -1089,14 +797,14 @@ mod tests {
// Helper function to slurp a portion of a file into a string
async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
let buf = vec![0; len];
let buf = self.read_exact_at(buf, pos).await?;
let mut buf = vec![0; len];
self.read_exact_at(&mut buf, pos).await?;
Ok(String::from_utf8(buf).unwrap())
}
}
#[tokio::test]
async fn test_virtual_files() -> anyhow::Result<()> {
async fn test_virtual_files() -> Result<(), Error> {
// The real work is done in the test_files() helper function. This
// allows us to run the same set of tests against a native File, and
// VirtualFile. We trust the native Files and wouldn't need to test them,
@@ -1112,17 +820,14 @@ mod tests {
}
#[tokio::test]
async fn test_physical_files() -> anyhow::Result<()> {
async fn test_physical_files() -> Result<(), Error> {
test_files("physical_files", |path, open_options| async move {
Ok(MaybeVirtualFile::File({
let owned_fd = open_options.open(path.as_std_path()).await?;
File::from(owned_fd)
}))
Ok(MaybeVirtualFile::File(open_options.open(path)?))
})
.await
}
async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> anyhow::Result<()>
async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> Result<(), Error>
where
OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
@@ -1266,11 +971,11 @@ mod tests {
for _threadno in 0..THREADS {
let files = files.clone();
let hdl = rt.spawn(async move {
let mut buf = vec![0u8; SIZE];
let mut buf = [0u8; SIZE];
let mut rng = rand::rngs::OsRng;
for _ in 1..1000 {
let f = &files[rng.gen_range(0..files.len())];
buf = f.read_exact_at(buf, 0).await.unwrap();
f.read_exact_at(&mut buf, 0).await.unwrap();
assert!(buf == SAMPLE);
}
});

View File

@@ -1,114 +0,0 @@
//! [`super::VirtualFile`] supports different IO engines.
//!
//! The [`IoEngineKind`] enum identifies them.
//!
//! The choice of IO engine is global.
//! Initialize using [`init`].
//!
//! Then use [`get`] and [`super::OpenOptions`].
#[derive(
Copy,
Clone,
PartialEq,
Eq,
Hash,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
Debug,
)]
#[strum(serialize_all = "kebab-case")]
pub enum IoEngineKind {
StdFs,
#[cfg(target_os = "linux")]
TokioEpollUring,
}
static IO_ENGINE: once_cell::sync::OnceCell<IoEngineKind> = once_cell::sync::OnceCell::new();
#[cfg(not(test))]
pub(super) fn init(engine: IoEngineKind) {
if IO_ENGINE.set(engine).is_err() {
panic!("called twice");
}
crate::metrics::virtual_file_io_engine::KIND
.with_label_values(&[&format!("{engine}")])
.set(1);
}
pub(super) fn get() -> &'static IoEngineKind {
#[cfg(test)]
{
let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) {
Ok(v) => match v.parse::<IoEngineKind>() {
Ok(engine_kind) => engine_kind,
Err(e) => {
panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
}
},
Err(std::env::VarError::NotPresent) => {
crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
.parse()
.unwrap()
}
Err(std::env::VarError::NotUnicode(_)) => {
panic!("env var {env_var_name} is not unicode");
}
})
}
#[cfg(not(test))]
IO_ENGINE.get().unwrap()
}
use std::os::unix::prelude::FileExt;
use super::FileGuard;
impl IoEngineKind {
pub(super) async fn read_at<B>(
&self,
file_guard: FileGuard,
offset: u64,
mut buf: B,
) -> ((FileGuard, B), std::io::Result<usize>)
where
B: tokio_epoll_uring::BoundedBufMut + Send,
{
match self {
IoEngineKind::StdFs => {
// SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
let dst = unsafe {
std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
};
let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset));
if let Ok(nbytes) = &res {
assert!(*nbytes <= buf.bytes_total());
// SAFETY: see above assertion
unsafe {
buf.set_init(*nbytes);
}
}
#[allow(dropping_references)]
drop(dst);
((file_guard, buf), res)
}
#[cfg(target_os = "linux")]
IoEngineKind::TokioEpollUring => {
let system = tokio_epoll_uring::thread_local_system().await;
let (resources, res) = system.read(file_guard, offset, buf).await;
(
resources,
res.map_err(|e| match e {
tokio_epoll_uring::Error::Op(e) => e,
tokio_epoll_uring::Error::System(system) => {
std::io::Error::new(std::io::ErrorKind::Other, system)
}
}),
)
}
}
}
}

View File

@@ -1,138 +0,0 @@
//! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`];
use super::IoEngineKind;
use std::{os::fd::OwnedFd, path::Path};
#[derive(Debug, Clone)]
pub enum OpenOptions {
StdFs(std::fs::OpenOptions),
#[cfg(target_os = "linux")]
TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions),
}
impl Default for OpenOptions {
fn default() -> Self {
match super::io_engine::get() {
IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
#[cfg(target_os = "linux")]
IoEngineKind::TokioEpollUring => {
Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new())
}
}
}
}
impl OpenOptions {
pub fn new() -> OpenOptions {
Self::default()
}
pub fn read(&mut self, read: bool) -> &mut OpenOptions {
match self {
OpenOptions::StdFs(x) => {
let _ = x.read(read);
}
#[cfg(target_os = "linux")]
OpenOptions::TokioEpollUring(x) => {
let _ = x.read(read);
}
}
self
}
pub fn write(&mut self, write: bool) -> &mut OpenOptions {
match self {
OpenOptions::StdFs(x) => {
let _ = x.write(write);
}
#[cfg(target_os = "linux")]
OpenOptions::TokioEpollUring(x) => {
let _ = x.write(write);
}
}
self
}
pub fn create(&mut self, create: bool) -> &mut OpenOptions {
match self {
OpenOptions::StdFs(x) => {
let _ = x.create(create);
}
#[cfg(target_os = "linux")]
OpenOptions::TokioEpollUring(x) => {
let _ = x.create(create);
}
}
self
}
pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions {
match self {
OpenOptions::StdFs(x) => {
let _ = x.create_new(create_new);
}
#[cfg(target_os = "linux")]
OpenOptions::TokioEpollUring(x) => {
let _ = x.create_new(create_new);
}
}
self
}
pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions {
match self {
OpenOptions::StdFs(x) => {
let _ = x.truncate(truncate);
}
#[cfg(target_os = "linux")]
OpenOptions::TokioEpollUring(x) => {
let _ = x.truncate(truncate);
}
}
self
}
pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result<OwnedFd> {
match self {
OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()),
#[cfg(target_os = "linux")]
OpenOptions::TokioEpollUring(x) => {
let system = tokio_epoll_uring::thread_local_system().await;
system.open(path, x).await.map_err(|e| match e {
tokio_epoll_uring::Error::Op(e) => e,
tokio_epoll_uring::Error::System(system) => {
std::io::Error::new(std::io::ErrorKind::Other, system)
}
})
}
}
}
}
impl std::os::unix::prelude::OpenOptionsExt for OpenOptions {
fn mode(&mut self, mode: u32) -> &mut OpenOptions {
match self {
OpenOptions::StdFs(x) => {
let _ = x.mode(mode);
}
#[cfg(target_os = "linux")]
OpenOptions::TokioEpollUring(x) => {
let _ = x.mode(mode);
}
}
self
}
fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions {
match self {
OpenOptions::StdFs(x) => {
let _ = x.custom_flags(flags);
}
#[cfg(target_os = "linux")]
OpenOptions::TokioEpollUring(x) => {
let _ = x.custom_flags(flags);
}
}
self
}
}

View File

@@ -26,11 +26,8 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
use std::str::FromStr;
use anyhow::{bail, Context, Result};
use bytes::{Buf, Bytes, BytesMut};
use hex::FromHex;
use tracing::*;
use utils::failpoint_support;
@@ -47,10 +44,9 @@ use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
use postgres_ffi::v14::xlog_utils::*;
use postgres_ffi::v14::{bindings::FullTransactionId, CheckPoint};
use postgres_ffi::v14::CheckPoint;
use postgres_ffi::TransactionId;
use postgres_ffi::BLCKSZ;
use utils::id::TenantId;
use utils::lsn::Lsn;
pub struct WalIngest {
@@ -113,43 +109,6 @@ impl WalIngest {
self.checkpoint_modified = true;
}
// BEGIN ONE-OFF HACK, version 3
//
// We had a bug where we incorrectly passed 0 to update_next_xid(). That was
// harmless as long as nextXid was < 2^31, because 0 looked like a very old
// XID. But once nextXid reaches 2^31, 0 starts to look like a very new XID, and
// we incorrectly bumped up nextXid to the next epoch, to value '1:1024'
//
// That bug was fixed in commits e4898a6e605e791a00ce21bf49d4cc0d9a10534a and
// c1148dc9acf938d912888ecb0a4e76ed40e21ef8, but we have one known timeline in
// production where that already happened. This is a one-off fix to fix that
// damage.
//
// So on that particular timeline, fix the incorrectly set nextXid to the XID from
// the next record we see, plus 10000 to give some safety margin.
//
// As a safety measure, disable this hack after they have reached LSN 380/00000000.
// As of this writing, they are around LSN 36D/00000000. They should not reach
// real XID wraparound until this LSN. We should remove this hack from production
// before that happens anyway, but better safe than sorry.
//
if self.checkpoint.nextXid.value == 4294968320 && // 1::1024, the incorrect value
// only apply this on the one broken tenant
modification.tline.tenant_shard_id.tenant_id == TenantId::from_hex("df254570a4f603805528b46b0d45a76c").unwrap() &&
lsn < Lsn::from_str("380/00000000").unwrap() &&
decoded.xl_xid != pg_constants::INVALID_TRANSACTION_ID
{
self.checkpoint.nextXid = FullTransactionId {
value: (decoded.xl_xid + 10000) as u64,
};
self.checkpoint_modified = true;
warn!(
"nextXid fixed by one-off hack at LSN {}, nextXid is now {}",
lsn, self.checkpoint.nextXid.value
);
}
// END ONE-OFF HACK
match decoded.xl_rmid {
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
// Heap AM records need some special handling, because they modify VM pages
@@ -1074,23 +1033,7 @@ impl WalIngest {
// Copy content
debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks);
for blknum in 0..nblocks {
// Sharding:
// - src and dst are always on the same shard, because they differ only by dbNode, and
// dbNode is not included in the hash inputs for sharding.
// - This WAL command is replayed on all shards, but each shard only copies the blocks
// that belong to it.
let src_key = rel_block_to_key(src_rel, blknum);
if !self.shard.is_key_local(&src_key) {
debug!(
"Skipping non-local key {} during XLOG_DBASE_CREATE",
src_key
);
continue;
}
debug!(
"copying block {} from {} ({}) to {}",
blknum, src_rel, src_key, dst_rel
);
debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
let content = modification
.tline
@@ -1404,22 +1347,16 @@ impl WalIngest {
self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
self.checkpoint_modified = true;
}
let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| {
if let Some(max_xid) = acc {
if mbr.xid.wrapping_sub(max_xid) as i32 > 0 {
Some(mbr.xid)
} else {
acc
}
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
mbr.xid
} else {
Some(mbr.xid)
acc
}
});
if let Some(max_xid) = max_mbr_xid {
if self.checkpoint.update_next_xid(max_xid) {
self.checkpoint_modified = true;
}
if self.checkpoint.update_next_xid(max_mbr_xid) {
self.checkpoint_modified = true;
}
Ok(())
}

View File

@@ -15,7 +15,6 @@
#include "postgres.h"
#include "access/xlog.h"
#include "common/hashfn.h"
#include "fmgr.h"
#include "libpq-fe.h"
#include "libpq/libpq.h"
@@ -39,6 +38,17 @@
#define MIN_RECONNECT_INTERVAL_USEC 1000
#define MAX_RECONNECT_INTERVAL_USEC 1000000
bool connected = false;
PGconn *pageserver_conn = NULL;
/*
* WaitEventSet containing:
* - WL_SOCKET_READABLE on pageserver_conn,
* - WL_LATCH_SET on MyLatch, and
* - WL_EXIT_ON_PM_DEATH.
*/
WaitEventSet *pageserver_conn_wes = NULL;
/* GUCs */
char *neon_timeline;
char *neon_tenant;
@@ -49,25 +59,17 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
static int stripe_size;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
typedef struct
{
char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
size_t num_shards;
} ShardMap;
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
/*
* PagestoreShmemState is kept in shared memory. It contains the connection
* strings for each shard.
*
* The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option,
* allowing it to be changed using pg_reload_conf(). The control plane can
* update the connection string if the pageserver crashes, is relocated, or
* new shards are added. A parsed copy of the current value of the GUC is kept
* in shared memory, updated by the postmaster, because regular backends don't
* new shards are added. A copy of the current value of the GUC is kept in
* shared memory, updated by the postmaster, because regular backends don't
* reload the config during query execution, but we might need to re-establish
* the pageserver connection with the new connection string even in the middle
* of a query.
@@ -82,7 +84,7 @@ typedef struct
{
pg_atomic_uint64 begin_update_counter;
pg_atomic_uint64 end_update_counter;
ShardMap shard_map;
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
} PagestoreShmemState;
#if PG_VERSION_NUM >= 150000
@@ -92,25 +94,10 @@ static void walproposer_shmem_request(void);
static shmem_startup_hook_type prev_shmem_startup_hook;
static PagestoreShmemState *pagestore_shared;
static uint64 pagestore_local_counter = 0;
static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
/* This backend's per-shard connections */
typedef struct
{
PGconn *conn;
/*---
* WaitEventSet containing:
* - WL_SOCKET_READABLE on 'conn'
* - WL_LATCH_SET on MyLatch, and
* - WL_EXIT_ON_PM_DEATH.
*/
WaitEventSet *wes;
} PageServer;
static PageServer page_servers[MAX_SHARDS];
static bool pageserver_flush(shardno_t shard_no);
static void pageserver_disconnect(shardno_t shard_no);
static bool pageserver_flush(void);
static void pageserver_disconnect(void);
static bool
PagestoreShmemIsValid(void)
@@ -118,213 +105,89 @@ PagestoreShmemIsValid(void)
return pagestore_shared && UsedShmemSegAddr;
}
/*
* Parse a comma-separated list of connection strings into a ShardMap.
*
* If 'result' is NULL, just checks that the input is valid. If the input is
* not valid, returns false. The contents of *result are undefined in
* that case, and must not be relied on.
*/
static bool
ParseShardMap(const char *connstr, ShardMap *result)
{
const char *p;
int nshards = 0;
if (result)
memset(result, 0, sizeof(ShardMap));
p = connstr;
nshards = 0;
for (;;)
{
const char *sep;
size_t connstr_len;
sep = strchr(p, ',');
connstr_len = sep != NULL ? sep - p : strlen(p);
if (connstr_len == 0 && sep == NULL)
break; /* ignore trailing comma */
if (nshards >= MAX_SHARDS)
{
neon_log(LOG, "Too many shards");
return false;
}
if (connstr_len >= MAX_PAGESERVER_CONNSTRING_SIZE)
{
neon_log(LOG, "Connection string too long");
return false;
}
if (result)
{
memcpy(result->connstring[nshards], p, connstr_len);
result->connstring[nshards][connstr_len] = '\0';
}
nshards++;
if (sep == NULL)
break;
p = sep + 1;
}
if (result)
result->num_shards = nshards;
return true;
}
static bool
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
{
char *p = *newval;
return ParseShardMap(p, NULL);
return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
}
static void
AssignPageserverConnstring(const char *newval, void *extra)
{
ShardMap shard_map;
/*
* Only postmaster updates the copy in shared memory.
*/
if (!PagestoreShmemIsValid() || IsUnderPostmaster)
return;
if (!ParseShardMap(newval, &shard_map))
{
/*
* shouldn't happen, because we already checked the value in
* CheckPageserverConnstring
*/
elog(ERROR, "could not parse shard map");
}
if (memcmp(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap)) != 0)
{
pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
pg_write_barrier();
memcpy(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap));
pg_write_barrier();
pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
}
else
{
/* no change */
}
pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
pg_write_barrier();
strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
pg_write_barrier();
pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
}
static bool
CheckConnstringUpdated(void)
{
if (!PagestoreShmemIsValid())
return false;
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
}
/*
* Get the current number of shards, and/or the connection string for a
* particular shard from the shard map in shared memory.
*
* If num_shards_p is not NULL, it is set to the current number of shards.
*
* If connstr_p is not NULL, the connection string for 'shard_no' is copied to
* it. It must point to a buffer at least MAX_PAGESERVER_CONNSTRING_SIZE bytes
* long.
*
* As a side-effect, if the shard map in shared memory had changed since the
* last call, terminates all existing connections to all pageservers.
*/
static void
load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
ReloadConnstring(void)
{
uint64 begin_update_counter;
uint64 end_update_counter;
ShardMap *shard_map = &pagestore_shared->shard_map;
shardno_t num_shards;
if (!PagestoreShmemIsValid())
return;
/*
* Postmaster can update the shared memory values concurrently, in which
* case we would copy a garbled mix of the old and new values. We will
* detect it because the counter's won't match, and retry. But it's
* important that we don't do anything within the retry-loop that would
* depend on the string having valid contents.
* Copy the current settnig from shared to local memory. Postmaster can
* update the value concurrently, in which case we would copy a garbled
* mix of the old and new values. We will detect it because the counter's
* won't match, and retry. But it's important that we don't do anything
* within the retry-loop that would depend on the string having valid
* contents.
*/
do
{
begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
pg_read_barrier();
num_shards = shard_map->num_shards;
if (connstr_p && shard_no < MAX_SHARDS)
strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE);
pg_memory_barrier();
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
pg_read_barrier();
}
while (begin_update_counter != end_update_counter
|| begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
|| end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
if (connstr_p && shard_no >= num_shards)
neon_log(ERROR, "Shard %d is greater or equal than number of shards %d",
shard_no, num_shards);
/*
* If any of the connection strings changed, reset all connections.
*/
if (pagestore_local_counter != end_update_counter)
{
for (shardno_t i = 0; i < MAX_SHARDS; i++)
{
if (page_servers[i].conn)
pageserver_disconnect(i);
}
pagestore_local_counter = end_update_counter;
}
if (num_shards_p)
*num_shards_p = num_shards;
}
#define MB (1024*1024)
shardno_t
get_shard_number(BufferTag *tag)
{
shardno_t n_shards;
uint32 hash;
load_shard_map(0, NULL, &n_shards);
#if PG_MAJORVERSION_NUM < 16
hash = murmurhash32(tag->rnode.relNode);
hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
#else
hash = murmurhash32(tag->relNumber);
hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
#endif
return hash % n_shards;
pagestore_local_counter = end_update_counter;
}
static bool
pageserver_connect(shardno_t shard_no, int elevel)
pageserver_connect(int elevel)
{
char *query;
int ret;
const char *keywords[3];
const char *values[3];
int n;
PGconn *conn;
WaitEventSet *wes;
char connstr[MAX_PAGESERVER_CONNSTRING_SIZE];
static TimestampTz last_connect_time = 0;
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
TimestampTz now;
uint64_t us_since_last_connect;
Assert(page_servers[shard_no].conn == NULL);
Assert(!connected);
/*
* Get the connection string for this shard. If the shard map has been
* updated since we last looked, this will also disconnect any existing
* pageserver connections as a side effect.
*/
load_shard_map(shard_no, connstr, NULL);
if (CheckConnstringUpdated())
{
ReloadConnstring();
}
now = GetCurrentTimestamp();
us_since_last_connect = now - last_connect_time;
@@ -360,105 +223,50 @@ pageserver_connect(shardno_t shard_no, int elevel)
n++;
}
keywords[n] = "dbname";
values[n] = connstr;
values[n] = local_pageserver_connstring;
n++;
keywords[n] = NULL;
values[n] = NULL;
n++;
conn = PQconnectdbParams(keywords, values, 1);
pageserver_conn = PQconnectdbParams(keywords, values, 1);
if (PQstatus(conn) == CONNECTION_BAD)
if (PQstatus(pageserver_conn) == CONNECTION_BAD)
{
char *msg = pchomp(PQerrorMessage(conn));
char *msg = pchomp(PQerrorMessage(pageserver_conn));
PQfinish(conn);
PQfinish(pageserver_conn);
pageserver_conn = NULL;
ereport(elevel,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
errmsg(NEON_TAG "could not establish connection to pageserver"),
errdetail_internal("%s", msg)));
pfree(msg);
return false;
}
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
ret = PQsendQuery(conn, query);
pfree(query);
ret = PQsendQuery(pageserver_conn, query);
if (ret != 1)
{
PQfinish(conn);
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
PQfinish(pageserver_conn);
pageserver_conn = NULL;
neon_log(elevel, "could not send pagestream command to pageserver");
return false;
}
wes = CreateWaitEventSet(TopMemoryContext, 3);
AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
MyLatch, NULL);
AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
NULL, NULL);
AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
PG_TRY();
{
while (PQisBusy(conn))
{
WaitEvent event;
/* Sleep until there's something to do */
(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
/* Data available in socket? */
if (event.events & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(conn))
{
char *msg = pchomp(PQerrorMessage(conn));
PQfinish(conn);
FreeWaitEventSet(wes);
neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
msg);
return false;
}
}
}
}
PG_CATCH();
{
PQfinish(conn);
FreeWaitEventSet(wes);
PG_RE_THROW();
}
PG_END_TRY();
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
page_servers[shard_no].conn = conn;
page_servers[shard_no].wes = wes;
return true;
}
/*
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
*/
static int
call_PQgetCopyData(shardno_t shard_no, char **buffer)
{
int ret;
PGconn *pageserver_conn = page_servers[shard_no].conn;
retry:
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
if (ret == 0)
while (PQisBusy(pageserver_conn))
{
WaitEvent event;
/* Sleep until there's something to do */
(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
@@ -470,7 +278,53 @@ retry:
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg);
PQfinish(pageserver_conn);
pageserver_conn = NULL;
FreeWaitEventSet(pageserver_conn_wes);
pageserver_conn_wes = NULL;
neon_log(elevel, "could not complete handshake with pageserver: %s",
msg);
return false;
}
}
}
neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
connected = true;
return true;
}
/*
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
*/
static int
call_PQgetCopyData(char **buffer)
{
int ret;
retry:
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
if (ret == 0)
{
WaitEvent event;
/* Sleep until there's something to do */
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
/* Data available in socket? */
if (event.events & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(pageserver_conn))
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
neon_log(LOG, "could not get response from pageserver: %s", msg);
pfree(msg);
return -1;
}
@@ -484,7 +338,7 @@ retry:
static void
pageserver_disconnect(shardno_t shard_no)
pageserver_disconnect(void)
{
/*
* If anything goes wrong while we were sending a request, it's not clear
@@ -493,38 +347,38 @@ pageserver_disconnect(shardno_t shard_no)
* time later after we have already sent a new unrelated request. Close
* the connection to avoid getting confused.
*/
if (page_servers[shard_no].conn)
if (connected)
{
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
PQfinish(page_servers[shard_no].conn);
page_servers[shard_no].conn = NULL;
neon_log(LOG, "dropping connection to page server due to error");
PQfinish(pageserver_conn);
pageserver_conn = NULL;
connected = false;
/*
* If the connection to any pageserver is lost, we throw away the
* whole prefetch queue, even for other pageservers. It should not
* cause big problems, because connection loss is supposed to be a
* rare event.
*/
prefetch_on_ps_disconnect();
}
if (page_servers[shard_no].wes != NULL)
if (pageserver_conn_wes != NULL)
{
FreeWaitEventSet(page_servers[shard_no].wes);
page_servers[shard_no].wes = NULL;
FreeWaitEventSet(pageserver_conn_wes);
pageserver_conn_wes = NULL;
}
}
static bool
pageserver_send(shardno_t shard_no, NeonRequest *request)
pageserver_send(NeonRequest *request)
{
StringInfoData req_buff;
PGconn *pageserver_conn = page_servers[shard_no].conn;
if (CheckConnstringUpdated())
{
pageserver_disconnect();
ReloadConnstring();
}
/* If the connection was lost for some reason, reconnect */
if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
{
neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
pageserver_disconnect(shard_no);
neon_log(LOG, "pageserver_send disconnect bad connection");
pageserver_disconnect();
}
req_buff = nm_pack_request(request);
@@ -538,9 +392,9 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
* https://github.com/neondatabase/neon/issues/1138 So try to reestablish
* connection in case of failure.
*/
if (!page_servers[shard_no].conn)
if (!connected)
{
while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
{
HandleMainLoopInterrupts();
n_reconnect_attempts += 1;
@@ -548,8 +402,6 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
n_reconnect_attempts = 0;
}
pageserver_conn = page_servers[shard_no].conn;
/*
* Send request.
*
@@ -562,8 +414,8 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
pageserver_disconnect();
neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
pfree(msg);
pfree(req_buff.data);
return false;
@@ -575,20 +427,19 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
{
char *msg = nm_to_string((NeonMessage *) request);
neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
neon_log(PageStoreTrace, "sent request: %s", msg);
pfree(msg);
}
return true;
}
static NeonResponse *
pageserver_receive(shardno_t shard_no)
pageserver_receive(void)
{
StringInfoData resp_buff;
NeonResponse *resp;
PGconn *pageserver_conn = page_servers[shard_no].conn;
if (!pageserver_conn)
if (!connected)
return NULL;
PG_TRY();
@@ -596,7 +447,7 @@ pageserver_receive(shardno_t shard_no)
/* read response */
int rc;
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
rc = call_PQgetCopyData(&resp_buff.data);
if (rc >= 0)
{
resp_buff.len = rc;
@@ -608,33 +459,33 @@ pageserver_receive(shardno_t shard_no)
{
char *msg = nm_to_string((NeonMessage *) resp);
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
neon_log(PageStoreTrace, "got response: %s", msg);
pfree(msg);
}
}
else if (rc == -1)
{
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
pageserver_disconnect(shard_no);
neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
pageserver_disconnect();
resp = NULL;
}
else if (rc == -2)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
pageserver_disconnect();
neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
}
else
{
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
pageserver_disconnect();
neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
}
}
PG_CATCH();
{
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
pageserver_disconnect(shard_no);
neon_log(LOG, "pageserver_receive disconnect due to caught exception");
pageserver_disconnect();
PG_RE_THROW();
}
PG_END_TRY();
@@ -644,13 +495,11 @@ pageserver_receive(shardno_t shard_no)
static bool
pageserver_flush(shardno_t shard_no)
pageserver_flush(void)
{
PGconn *pageserver_conn = page_servers[shard_no].conn;
if (!pageserver_conn)
if (!connected)
{
neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
neon_log(WARNING, "Tried to flush while disconnected");
}
else
{
@@ -658,8 +507,8 @@ pageserver_flush(shardno_t shard_no)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
pageserver_disconnect();
neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
pfree(msg);
return false;
}
@@ -701,7 +550,6 @@ PagestoreShmemInit(void)
{
pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
AssignPageserverConnstring(page_server_connstring, NULL);
}
LWLockRelease(AddinShmemInitLock);
@@ -776,15 +624,6 @@ pg_init_libpagestore(void)
0, /* no flags required */
check_neon_id, NULL, NULL);
DefineCustomIntVariable("neon.stripe_size",
"sharding stripe size",
NULL,
&stripe_size,
32768, 1, INT_MAX,
PGC_SIGHUP,
GUC_UNIT_BLOCKS,
NULL, NULL, NULL);
DefineCustomIntVariable("neon.max_cluster_size",
"cluster size limit",
NULL,

View File

@@ -20,13 +20,9 @@
#include "lib/stringinfo.h"
#include "libpq/pqformat.h"
#include "storage/block.h"
#include "storage/buf_internals.h"
#include "storage/smgr.h"
#include "utils/memutils.h"
#define MAX_SHARDS 128
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
typedef enum
{
/* pagestore_client -> pagestore */
@@ -55,9 +51,6 @@ typedef struct
#define neon_log(tag, fmt, ...) ereport(tag, \
(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag, \
(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
/*
* supertype of all the Neon*Request structs below
@@ -148,13 +141,11 @@ extern char *nm_to_string(NeonMessage *msg);
* API
*/
typedef unsigned shardno_t;
typedef struct
{
bool (*send) (shardno_t shard_no, NeonRequest * request);
NeonResponse *(*receive) (shardno_t shard_no);
bool (*flush) (shardno_t shard_no);
bool (*send) (NeonRequest *request);
NeonResponse *(*receive) (void);
bool (*flush) (void);
} page_server_api;
extern void prefetch_on_ps_disconnect(void);
@@ -168,8 +159,6 @@ extern char *neon_timeline;
extern char *neon_tenant;
extern int32 max_cluster_size;
extern shardno_t get_shard_number(BufferTag* tag);
extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
extern void smgr_init_neon(void);
extern void readahead_buffer_resize(int newsize, void *extra);

View File

@@ -172,7 +172,6 @@ typedef struct PrefetchRequest
XLogRecPtr actual_request_lsn;
NeonResponse *response; /* may be null */
PrefetchStatus status;
shardno_t shard_no;
uint64 my_ring_index;
} PrefetchRequest;
@@ -240,17 +239,10 @@ typedef struct PrefetchState
* also unused */
/* the buffers */
prfh_hash *prf_hash;
int max_shard_no;
/* Mark shards involved in prefetch */
uint8 shard_bitmap[(MAX_SHARDS + 7)/8];
prfh_hash *prf_hash;
PrefetchRequest prf_buffer[]; /* prefetch buffers */
} PrefetchState;
#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))
#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7))
#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7))
static PrefetchState *MyPState;
#define GetPrfSlot(ring_index) ( \
@@ -335,7 +327,6 @@ compact_prefetch_buffers(void)
Assert(target_slot->status == PRFS_UNUSED);
target_slot->buftag = source_slot->buftag;
target_slot->shard_no = source_slot->shard_no;
target_slot->status = source_slot->status;
target_slot->response = source_slot->response;
target_slot->effective_request_lsn = source_slot->effective_request_lsn;
@@ -503,23 +494,6 @@ prefetch_cleanup_trailing_unused(void)
}
}
static bool
prefetch_flush_requests(void)
{
for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
{
if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no))
{
if (!page_server->flush(shard_no))
return false;
BITMAP_CLR(MyPState->shard_bitmap, shard_no);
}
}
MyPState->max_shard_no = 0;
return true;
}
/*
* Wait for slot of ring_index to have received its response.
* The caller is responsible for making sure the request buffer is flushed.
@@ -535,7 +509,7 @@ prefetch_wait_for(uint64 ring_index)
if (MyPState->ring_flush <= ring_index &&
MyPState->ring_unused > MyPState->ring_flush)
{
if (!prefetch_flush_requests())
if (!page_server->flush())
return false;
MyPState->ring_flush = MyPState->ring_unused;
}
@@ -573,7 +547,7 @@ prefetch_read(PrefetchRequest *slot)
Assert(slot->my_ring_index == MyPState->ring_receive);
old = MemoryContextSwitchTo(MyPState->errctx);
response = (NeonResponse *) page_server->receive(slot->shard_no);
response = (NeonResponse *) page_server->receive();
MemoryContextSwitchTo(old);
if (response)
{
@@ -730,14 +704,12 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_unused);
while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
while (!page_server->send((NeonRequest *) &request));
/* update prefetch state */
MyPState->n_requests_inflight += 1;
MyPState->n_unused -= 1;
MyPState->ring_unused += 1;
BITMAP_SET(MyPState->shard_bitmap, slot->shard_no);
MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);
/* update slot state */
slot->status = PRFS_REQUESTED;
@@ -908,7 +880,6 @@ Retry:
* function reads the buffer tag from the slot.
*/
slot->buftag = tag;
slot->shard_no = get_shard_number(&tag);
slot->my_ring_index = ring_index;
prefetch_do_request(slot, force_latest, force_lsn);
@@ -919,7 +890,7 @@ Retry:
if (flush_every_n_requests > 0 &&
MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
{
if (!prefetch_flush_requests())
if (!page_server->flush())
{
/*
* Prefetch set is reset in case of error, so we should try to
@@ -937,44 +908,13 @@ static NeonResponse *
page_server_request(void const *req)
{
NeonResponse *resp;
BufferTag tag = {0};
shardno_t shard_no;
switch (((NeonRequest *) req)->tag)
{
case T_NeonExistsRequest:
CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
break;
case T_NeonNblocksRequest:
CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
break;
case T_NeonDbSizeRequest:
NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
break;
case T_NeonGetPageRequest:
CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
break;
default:
neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
}
shard_no = get_shard_number(&tag);
/*
* Current sharding model assumes that all metadata is present only at shard 0.
* We still need to call get_shard_no() to check if shard map is up-to-date.
*/
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
{
shard_no = 0;
}
do
{
while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
while (!page_server->send((NeonRequest *) req) || !page_server->flush());
MyPState->ring_flush = MyPState->ring_unused;
consume_prefetch_responses();
resp = page_server->receive(shard_no);
resp = page_server->receive();
} while (resp == NULL);
return resp;
@@ -2158,8 +2098,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
slot->shard_no, blkno,
errmsg(NEON_TAG "could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
blkno,
RelFileInfoFmt(rinfo),
forkNum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),

View File

@@ -3,6 +3,7 @@ mod hacks;
mod link;
pub use link::LinkAuthError;
use smol_str::SmolStr;
use tokio_postgres::config::AuthKeys;
use crate::auth::credentials::check_peer_addr_is_in_list;
@@ -15,6 +16,7 @@ use crate::context::RequestMonitoring;
use crate::proxy::connect_compute::handle_try_wake;
use crate::proxy::retry::retry_after;
use crate::proxy::NeonOptions;
use crate::scram;
use crate::stream::Stream;
use crate::{
auth::{self, ComputeUserInfoMaybeEndpoint},
@@ -26,7 +28,6 @@ use crate::{
},
stream, url,
};
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
use futures::TryFutureExt;
use std::borrow::Cow;
use std::ops::ControlFlow;
@@ -129,19 +130,19 @@ pub struct ComputeCredentials<T> {
#[derive(Debug, Clone)]
pub struct ComputeUserInfoNoEndpoint {
pub user: RoleName,
pub user: SmolStr,
pub options: NeonOptions,
}
#[derive(Debug, Clone)]
pub struct ComputeUserInfo {
pub endpoint: EndpointId,
pub user: RoleName,
pub endpoint: SmolStr,
pub user: SmolStr,
pub options: NeonOptions,
}
impl ComputeUserInfo {
pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
pub fn endpoint_cache_key(&self) -> SmolStr {
self.options.get_cache_key(&self.endpoint)
}
}
@@ -157,7 +158,7 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
type Error = ComputeUserInfoNoEndpoint;
fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result<Self, Self::Error> {
match user_info.endpoint_id {
match user_info.project {
None => Err(ComputeUserInfoNoEndpoint {
user: user_info.user,
options: user_info.options,
@@ -190,10 +191,7 @@ async fn auth_quirks(
Err(info) => {
let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
.await?;
ctx.set_endpoint_id(res.info.endpoint.clone());
tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
ctx.set_endpoint_id(Some(res.info.endpoint.clone()));
(res.info, Some(res.keys))
}
Ok(info) => (info, None),
@@ -274,12 +272,19 @@ async fn authenticate_with_secret(
classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
}
/// wake a compute (or retrieve an existing compute session from cache)
async fn wake_compute(
/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
/// only if authentication was successfuly.
async fn auth_and_wake_compute(
ctx: &mut RequestMonitoring,
api: &impl console::Api,
compute_credentials: ComputeCredentials<ComputeCredentialKeys>,
user_info: ComputeUserInfoMaybeEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
let compute_credentials =
auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?;
let mut num_retries = 0;
let mut node = loop {
let wake_res = api.wake_compute(ctx, &compute_credentials.info).await;
@@ -312,11 +317,11 @@ async fn wake_compute(
impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
/// Get compute endpoint name from the credentials.
pub fn get_endpoint(&self) -> Option<EndpointId> {
pub fn get_endpoint(&self) -> Option<SmolStr> {
use BackendType::*;
match self {
Console(_, user_info) => user_info.endpoint_id.clone(),
Console(_, user_info) => user_info.project.clone(),
Link(_) => Some("link".into()),
#[cfg(test)]
Test(_) => Some("test".into()),
@@ -350,20 +355,20 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
Console(api, user_info) => {
info!(
user = &*user_info.user,
project = user_info.endpoint(),
project = user_info.project(),
"performing authentication using the console"
);
let compute_credentials =
auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
let (cache_info, user_info) = wake_compute(ctx, &*api, compute_credentials).await?;
let (cache_info, user_info) =
auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
.await?;
(cache_info, BackendType::Console(api, user_info))
}
// NOTE: this auth backend doesn't use client credentials.
Link(url) => {
info!("performing link authentication");
let node_info = link::authenticate(ctx, &url, client).await?;
let node_info = link::authenticate(&url, client).await?;
(
CachedNodeInfo::new_uncached(node_info),

View File

@@ -1,7 +1,6 @@
use crate::{
auth, compute,
console::{self, provider::NodeInfo},
context::RequestMonitoring,
error::UserFacingError,
stream::PqStream,
waiters,
@@ -55,7 +54,6 @@ pub fn new_psql_session_id() -> String {
}
pub(super) async fn authenticate(
ctx: &mut RequestMonitoring,
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> {
@@ -96,10 +94,6 @@ pub(super) async fn authenticate(
.dbname(&db_info.dbname)
.user(&db_info.user);
ctx.set_user(db_info.user.into());
ctx.set_project(db_info.aux.clone());
tracing::Span::current().record("ep", &tracing::field::display(&db_info.aux.endpoint_id));
// Backwards compatibility. pg_sni_proxy uses "--" in domain names
// while direct connections do not. Once we migrate to pg_sni_proxy
// everywhere, we can remove this.

View File

@@ -2,8 +2,7 @@
use crate::{
auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI,
EndpointId, RoleName,
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions,
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
@@ -22,10 +21,7 @@ pub enum ComputeUserInfoParseError {
SNI ('{}') and project option ('{}').",
.domain, .option,
)]
InconsistentProjectNames {
domain: EndpointId,
option: EndpointId,
},
InconsistentProjectNames { domain: SmolStr, option: SmolStr },
#[error(
"Common name inferred from SNI ('{}') is not known",
@@ -34,7 +30,7 @@ pub enum ComputeUserInfoParseError {
UnknownCommonName { cn: String },
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
MalformedProjectName(EndpointId),
MalformedProjectName(SmolStr),
}
impl UserFacingError for ComputeUserInfoParseError {}
@@ -43,22 +39,24 @@ impl UserFacingError for ComputeUserInfoParseError {}
/// Note that we don't store any kind of client key or password here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ComputeUserInfoMaybeEndpoint {
pub user: RoleName,
pub endpoint_id: Option<EndpointId>,
pub user: SmolStr,
// TODO: this is a severe misnomer! We should think of a new name ASAP.
pub project: Option<SmolStr>,
pub options: NeonOptions,
}
impl ComputeUserInfoMaybeEndpoint {
#[inline]
pub fn endpoint(&self) -> Option<&str> {
self.endpoint_id.as_deref()
pub fn project(&self) -> Option<&str> {
self.project.as_deref()
}
}
pub fn endpoint_sni(
sni: &str,
pub fn endpoint_sni<'a>(
sni: &'a str,
common_names: &HashSet<String>,
) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
) -> Result<&'a str, ComputeUserInfoParseError> {
let Some((subdomain, common_name)) = sni.split_once('.') else {
return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
};
@@ -67,10 +65,7 @@ pub fn endpoint_sni(
cn: common_name.into(),
});
}
if subdomain == SERVERLESS_DRIVER_SNI {
return Ok(None);
}
Ok(Some(EndpointId::from(subdomain)))
Ok(subdomain)
}
impl ComputeUserInfoMaybeEndpoint {
@@ -84,14 +79,15 @@ impl ComputeUserInfoMaybeEndpoint {
// Some parameters are stored in the startup message.
let get_param = |key| params.get(key).ok_or(MissingKey(key));
let user: RoleName = get_param("user")?.into();
let user: SmolStr = get_param("user")?.into();
// record the values if we have them
ctx.set_application(params.get("application_name").map(SmolStr::from));
ctx.set_user(user.clone());
ctx.set_endpoint_id(sni.map(SmolStr::from));
// Project name might be passed via PG's command-line options.
let endpoint_option = params
let project_option = params
.options_raw()
.and_then(|options| {
// We support both `project` (deprecated) and `endpoint` options for backward compatibility.
@@ -104,9 +100,9 @@ impl ComputeUserInfoMaybeEndpoint {
})
.map(|name| name.into());
let endpoint_from_domain = if let Some(sni_str) = sni {
let project_from_domain = if let Some(sni_str) = sni {
if let Some(cn) = common_names {
endpoint_sni(sni_str, cn)?
Some(SmolStr::from(endpoint_sni(sni_str, cn)?))
} else {
None
}
@@ -114,31 +110,26 @@ impl ComputeUserInfoMaybeEndpoint {
None
};
let endpoint = match (endpoint_option, endpoint_from_domain) {
let project = match (project_option, project_from_domain) {
// Invariant: if we have both project name variants, they should match.
(Some(option), Some(domain)) if option != domain => {
Some(Err(InconsistentProjectNames { domain, option }))
}
// Invariant: project name may not contain certain characters.
(a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
(a, b) => a.or(b).map(|name| match project_name_valid(&name) {
false => Err(MalformedProjectName(name)),
true => Ok(name),
}),
}
.transpose()?;
if let Some(ep) = &endpoint {
ctx.set_endpoint_id(ep.clone());
tracing::Span::current().record("ep", &tracing::field::display(ep));
}
info!(%user, project = endpoint.as_deref(), "credentials");
info!(%user, project = project.as_deref(), "credentials");
if sni.is_some() {
info!("Connection with sni");
NUM_CONNECTION_ACCEPTED_BY_SNI
.with_label_values(&["sni"])
.inc();
} else if endpoint.is_some() {
} else if project.is_some() {
NUM_CONNECTION_ACCEPTED_BY_SNI
.with_label_values(&["no_sni"])
.inc();
@@ -154,7 +145,7 @@ impl ComputeUserInfoMaybeEndpoint {
Ok(Self {
user,
endpoint_id: endpoint,
project,
options,
})
}
@@ -247,7 +238,7 @@ mod tests {
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id, None);
assert_eq!(user_info.project, None);
Ok(())
}
@@ -262,7 +253,7 @@ mod tests {
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id, None);
assert_eq!(user_info.project, None);
Ok(())
}
@@ -278,7 +269,7 @@ mod tests {
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
assert_eq!(user_info.project.as_deref(), Some("foo"));
assert_eq!(user_info.options.get_cache_key("foo"), "foo");
Ok(())
@@ -294,7 +285,7 @@ mod tests {
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
assert_eq!(user_info.project.as_deref(), Some("bar"));
Ok(())
}
@@ -309,7 +300,7 @@ mod tests {
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
assert_eq!(user_info.project.as_deref(), Some("bar"));
Ok(())
}
@@ -327,7 +318,7 @@ mod tests {
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert!(user_info.endpoint_id.is_none());
assert!(user_info.project.is_none());
Ok(())
}
@@ -342,7 +333,7 @@ mod tests {
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert!(user_info.endpoint_id.is_none());
assert!(user_info.project.is_none());
Ok(())
}
@@ -358,7 +349,7 @@ mod tests {
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));
assert_eq!(user_info.project.as_deref(), Some("baz"));
Ok(())
}
@@ -372,14 +363,14 @@ mod tests {
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
assert_eq!(user_info.project.as_deref(), Some("p1"));
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.b.com");
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
assert_eq!(user_info.project.as_deref(), Some("p1"));
Ok(())
}
@@ -436,7 +427,7 @@ mod tests {
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
assert_eq!(user_info.project.as_deref(), Some("project"));
assert_eq!(
user_info.options.get_cache_key("project"),
"project endpoint_type:read_write lsn:0/2"

View File

@@ -4,11 +4,10 @@
//! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified.
use bstr::ByteSlice;
use crate::EndpointId;
use smol_str::SmolStr;
pub struct PasswordHackPayload {
pub endpoint: EndpointId,
pub endpoint: SmolStr,
pub password: Vec<u8>,
}

View File

@@ -272,5 +272,5 @@ async fn handle_client(
let client = tokio::net::TcpStream::connect(destination).await?;
let metrics_aux: MetricsAuxInfo = Default::default();
proxy::proxy::passthrough::proxy_pass(ctx, tls_stream, client, metrics_aux).await
proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await
}

View File

@@ -11,16 +11,13 @@ use smol_str::SmolStr;
use tokio::time::Instant;
use tracing::{debug, info};
use crate::{
auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId,
RoleName,
};
use crate::{auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret};
use super::{Cache, Cached};
pub trait ProjectInfoCache {
fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId);
fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName);
fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr);
fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr);
fn enable_ttl(&self);
fn disable_ttl(&self);
}
@@ -47,7 +44,7 @@ impl<T> From<T> for Entry<T> {
#[derive(Default)]
struct EndpointInfo {
secret: std::collections::HashMap<RoleName, Entry<Option<AuthSecret>>>,
secret: std::collections::HashMap<SmolStr, Entry<Option<AuthSecret>>>,
allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
}
@@ -60,7 +57,7 @@ impl EndpointInfo {
}
pub fn get_role_secret(
&self,
role_name: &RoleName,
role_name: &SmolStr,
valid_since: Instant,
ignore_cache_since: Option<Instant>,
) -> Option<(Option<AuthSecret>, bool)> {
@@ -93,7 +90,7 @@ impl EndpointInfo {
pub fn invalidate_allowed_ips(&mut self) {
self.allowed_ips = None;
}
pub fn invalidate_role_secret(&mut self, role_name: &RoleName) {
pub fn invalidate_role_secret(&mut self, role_name: &SmolStr) {
self.secret.remove(role_name);
}
}
@@ -106,9 +103,9 @@ impl EndpointInfo {
/// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
/// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
pub struct ProjectInfoCacheImpl {
cache: DashMap<EndpointId, EndpointInfo>,
cache: DashMap<SmolStr, EndpointInfo>,
project2ep: DashMap<ProjectId, HashSet<EndpointId>>,
project2ep: DashMap<SmolStr, HashSet<SmolStr>>,
config: ProjectInfoCacheOptions,
start_time: Instant,
@@ -116,7 +113,7 @@ pub struct ProjectInfoCacheImpl {
}
impl ProjectInfoCache for ProjectInfoCacheImpl {
fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) {
fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr) {
info!("invalidating allowed ips for project `{}`", project_id);
let endpoints = self
.project2ep
@@ -129,7 +126,7 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
}
}
}
fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) {
fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr) {
info!(
"invalidating role secret for project_id `{}` and role_name `{}`",
project_id, role_name
@@ -170,8 +167,8 @@ impl ProjectInfoCacheImpl {
pub fn get_role_secret(
&self,
endpoint_id: &EndpointId,
role_name: &RoleName,
endpoint_id: &SmolStr,
role_name: &SmolStr,
) -> Option<Cached<&Self, Option<AuthSecret>>> {
let (valid_since, ignore_cache_since) = self.get_cache_times();
let endpoint_info = self.cache.get(endpoint_id)?;
@@ -191,7 +188,7 @@ impl ProjectInfoCacheImpl {
}
pub fn get_allowed_ips(
&self,
endpoint_id: &EndpointId,
endpoint_id: &SmolStr,
) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
let (valid_since, ignore_cache_since) = self.get_cache_times();
let endpoint_info = self.cache.get(endpoint_id)?;
@@ -208,9 +205,9 @@ impl ProjectInfoCacheImpl {
}
pub fn insert_role_secret(
&self,
project_id: &ProjectId,
endpoint_id: &EndpointId,
role_name: &RoleName,
project_id: &SmolStr,
endpoint_id: &SmolStr,
role_name: &SmolStr,
secret: Option<AuthSecret>,
) {
if self.cache.len() >= self.config.size {
@@ -225,8 +222,8 @@ impl ProjectInfoCacheImpl {
}
pub fn insert_allowed_ips(
&self,
project_id: &ProjectId,
endpoint_id: &EndpointId,
project_id: &SmolStr,
endpoint_id: &SmolStr,
allowed_ips: Arc<Vec<IpPattern>>,
) {
if self.cache.len() >= self.config.size {
@@ -239,7 +236,7 @@ impl ProjectInfoCacheImpl {
.or_default()
.allowed_ips = Some(allowed_ips.into());
}
fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) {
fn inser_project2endpoint(&self, project_id: &SmolStr, endpoint_id: &SmolStr) {
if let Some(mut endpoints) = self.project2ep.get_mut(project_id) {
endpoints.insert(endpoint_id.clone());
} else {
@@ -300,18 +297,18 @@ impl ProjectInfoCacheImpl {
/// This is used to invalidate cache entries.
pub struct CachedLookupInfo {
/// Search by this key.
endpoint_id: EndpointId,
endpoint_id: SmolStr,
lookup_type: LookupType,
}
impl CachedLookupInfo {
pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self {
pub(self) fn new_role_secret(endpoint_id: SmolStr, role_name: SmolStr) -> Self {
Self {
endpoint_id,
lookup_type: LookupType::RoleSecret(role_name),
}
}
pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self {
pub(self) fn new_allowed_ips(endpoint_id: SmolStr) -> Self {
Self {
endpoint_id,
lookup_type: LookupType::AllowedIps,
@@ -320,7 +317,7 @@ impl CachedLookupInfo {
}
enum LookupType {
RoleSecret(RoleName),
RoleSecret(SmolStr),
AllowedIps,
}
@@ -351,6 +348,7 @@ impl Cache for ProjectInfoCacheImpl {
mod tests {
use super::*;
use crate::{console::AuthSecret, scram::ServerSecret};
use smol_str::SmolStr;
use std::{sync::Arc, time::Duration};
#[tokio::test]
@@ -364,8 +362,8 @@ mod tests {
});
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into();
let user1: SmolStr = "user1".into();
let user2: SmolStr = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
user1.as_str(),
[1; 32],
@@ -387,7 +385,7 @@ mod tests {
assert_eq!(cached.value, secret2);
// Shouldn't add more than 2 roles.
let user3: RoleName = "user3".into();
let user3: SmolStr = "user3".into();
let secret3 = Some(AuthSecret::Scram(ServerSecret::mock(
user3.as_str(),
[3; 32],
@@ -422,8 +420,8 @@ mod tests {
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into();
let user1: SmolStr = "user1".into();
let user2: SmolStr = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
user1.as_str(),
[1; 32],
@@ -477,8 +475,8 @@ mod tests {
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into();
let user1: SmolStr = "user1".into();
let user2: SmolStr = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
user1.as_str(),
[1; 32],

View File

@@ -1,7 +1,7 @@
use anyhow::Context;
use anyhow::{bail, Context};
use dashmap::DashMap;
use pq_proto::CancelKeyData;
use std::{net::SocketAddr, sync::Arc};
use std::net::SocketAddr;
use tokio::net::TcpStream;
use tokio_postgres::{CancelToken, NoTls};
use tracing::info;
@@ -25,31 +25,39 @@ impl CancelMap {
}
/// Run async action within an ephemeral session identified by [`CancelKeyData`].
pub fn get_session(self: Arc<Self>) -> Session {
pub async fn with_session<'a, F, R, V>(&'a self, f: F) -> anyhow::Result<V>
where
F: FnOnce(Session<'a>) -> R,
R: std::future::Future<Output = anyhow::Result<V>>,
{
// HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
// expose it and we don't want to do another roundtrip to query
// for it. The client will be able to notice that this is not the
// actual backend_pid, but backend_pid is not used for anything
// so it doesn't matter.
let key = loop {
let key = rand::random();
let key = rand::random();
// Random key collisions are unlikely to happen here, but they're still possible,
// which is why we have to take care not to rewrite an existing key.
match self.0.entry(key) {
dashmap::mapref::entry::Entry::Occupied(_) => continue,
dashmap::mapref::entry::Entry::Vacant(e) => {
e.insert(None);
}
// Random key collisions are unlikely to happen here, but they're still possible,
// which is why we have to take care not to rewrite an existing key.
match self.0.entry(key) {
dashmap::mapref::entry::Entry::Occupied(_) => {
bail!("query cancellation key already exists: {key}")
}
break key;
};
dashmap::mapref::entry::Entry::Vacant(e) => {
e.insert(None);
}
}
// This will guarantee that the session gets dropped
// as soon as the future is finished.
scopeguard::defer! {
self.0.remove(&key);
info!("dropped query cancellation key {key}");
}
info!("registered new query cancellation key {key}");
Session {
key,
cancel_map: self,
}
let session = Session::new(key, self);
f(session).await
}
#[cfg(test)]
@@ -90,17 +98,23 @@ impl CancelClosure {
}
/// Helper for registering query cancellation tokens.
pub struct Session {
pub struct Session<'a> {
/// The user-facing key identifying this session.
key: CancelKeyData,
/// The [`CancelMap`] this session belongs to.
cancel_map: Arc<CancelMap>,
cancel_map: &'a CancelMap,
}
impl Session {
impl<'a> Session<'a> {
fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self {
Self { key, cancel_map }
}
}
impl Session<'_> {
/// Store the cancel token for the given session.
/// This enables query cancellation in `crate::proxy::prepare_client_connection`.
pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
info!("enabling query cancellation for this session");
self.cancel_map.0.insert(self.key, Some(cancel_closure));
@@ -108,26 +122,37 @@ impl Session {
}
}
impl Drop for Session {
fn drop(&mut self) {
self.cancel_map.0.remove(&self.key);
info!("dropped query cancellation key {}", &self.key);
}
}
#[cfg(test)]
mod tests {
use super::*;
use once_cell::sync::Lazy;
#[tokio::test]
async fn check_session_drop() -> anyhow::Result<()> {
let cancel_map: Arc<CancelMap> = Default::default();
static CANCEL_MAP: Lazy<CancelMap> = Lazy::new(Default::default);
let (tx, rx) = tokio::sync::oneshot::channel();
let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move {
assert!(CANCEL_MAP.contains(&session));
tx.send(()).expect("failed to send");
futures::future::pending::<()>().await; // sleep forever
Ok(())
}));
// Wait until the task has been spawned.
rx.await.context("failed to hear from the task")?;
// Drop the session's entry by cancelling the task.
task.abort();
let error = task.await.expect_err("task should have failed");
if !error.is_cancelled() {
anyhow::bail!(error);
}
let session = cancel_map.clone().get_session();
assert!(cancel_map.contains(&session));
drop(session);
// Check that the session has been dropped.
assert!(cancel_map.is_empty());
assert!(CANCEL_MAP.is_empty());
Ok(())
}

View File

@@ -1,10 +1,9 @@
use serde::Deserialize;
use smol_str::SmolStr;
use std::fmt;
use crate::auth::IpPattern;
use crate::{BranchId, EndpointId, ProjectId};
/// Generic error response with human-readable description.
/// Note that we can't always present it to user as is.
#[derive(Debug, Deserialize)]
@@ -18,7 +17,7 @@ pub struct ConsoleError {
pub struct GetRoleSecret {
pub role_secret: Box<str>,
pub allowed_ips: Option<Vec<IpPattern>>,
pub project_id: Option<ProjectId>,
pub project_id: Option<Box<str>>,
}
// Manually implement debug to omit sensitive info.
@@ -95,9 +94,9 @@ impl fmt::Debug for DatabaseInfo {
/// Also known as `ProxyMetricsAuxInfo` in the console.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct MetricsAuxInfo {
pub endpoint_id: EndpointId,
pub project_id: ProjectId,
pub branch_id: BranchId,
pub endpoint_id: SmolStr,
pub project_id: SmolStr,
pub branch_id: SmolStr,
}
impl MetricsAuxInfo {

View File

@@ -9,10 +9,11 @@ use crate::{
compute,
config::{CacheOptions, ProjectInfoCacheOptions},
context::RequestMonitoring,
scram, EndpointCacheKey, ProjectId,
scram,
};
use async_trait::async_trait;
use dashmap::DashMap;
use smol_str::SmolStr;
use std::{sync::Arc, time::Duration};
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio::time::Instant;
@@ -213,7 +214,7 @@ pub struct AuthInfo {
/// List of IP addresses allowed for the autorization.
pub allowed_ips: Vec<IpPattern>,
/// Project ID. This is used for cache invalidation.
pub project_id: Option<ProjectId>,
pub project_id: Option<SmolStr>,
}
/// Info for establishing a connection to a compute node.
@@ -232,7 +233,7 @@ pub struct NodeInfo {
pub allow_self_signed_compute: bool,
}
pub type NodeInfoCache = TimedLru<EndpointCacheKey, NodeInfo>;
pub type NodeInfoCache = TimedLru<SmolStr, NodeInfo>;
pub type CachedNodeInfo = Cached<&'static NodeInfoCache>;
pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
@@ -344,7 +345,7 @@ impl ApiCaches {
/// Various caches for [`console`](super).
pub struct ApiLocks {
name: &'static str,
node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>,
node_locks: DashMap<SmolStr, Arc<Semaphore>>,
permits: usize,
timeout: Duration,
registered: prometheus::IntCounter,
@@ -412,7 +413,7 @@ impl ApiLocks {
pub async fn get_wake_compute_permit(
&self,
key: &EndpointCacheKey,
key: &SmolStr,
) -> Result<WakeComputePermit, errors::WakeComputeError> {
if self.permits == 0 {
return Ok(WakeComputePermit { permit: None });

View File

@@ -14,6 +14,7 @@ use crate::{
};
use async_trait::async_trait;
use futures::TryFutureExt;
use smol_str::SmolStr;
use std::sync::Arc;
use tokio::time::Instant;
use tokio_postgres::config::SslMode;
@@ -97,7 +98,7 @@ impl Api {
Ok(AuthInfo {
secret,
allowed_ips,
project_id: body.project_id,
project_id: body.project_id.map(SmolStr::from),
})
}
.map_err(crate::error::log_error)
@@ -238,7 +239,7 @@ impl super::Api for Api {
// for some time (highly depends on the console's scale-to-zero policy);
// The connection info remains the same during that period of time,
// which means that we might cache it to reduce the load and latency.
if let Some(cached) = self.caches.node_info.get(&key) {
if let Some(cached) = self.caches.node_info.get(&*key) {
info!(key = &*key, "found cached compute node info");
return Ok(cached);
}

View File

@@ -7,10 +7,7 @@ use std::net::IpAddr;
use tokio::sync::mpsc;
use uuid::Uuid;
use crate::{
console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId,
EndpointId, ProjectId, RoleName,
};
use crate::{console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer};
pub mod parquet;
@@ -29,10 +26,10 @@ pub struct RequestMonitoring {
region: &'static str,
// filled in as they are discovered
project: Option<ProjectId>,
branch: Option<BranchId>,
endpoint_id: Option<EndpointId>,
user: Option<RoleName>,
project: Option<SmolStr>,
branch: Option<SmolStr>,
endpoint_id: Option<SmolStr>,
user: Option<SmolStr>,
application: Option<SmolStr>,
error_kind: Option<ErrorKind>,
success: bool,
@@ -89,18 +86,15 @@ impl RequestMonitoring {
self.project = Some(x.project_id);
}
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
crate::metrics::CONNECTING_ENDPOINTS
.with_label_values(&[self.protocol])
.measure(&endpoint_id);
self.endpoint_id = Some(endpoint_id);
pub fn set_endpoint_id(&mut self, endpoint_id: Option<SmolStr>) {
self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
}
pub fn set_application(&mut self, app: Option<SmolStr>) {
self.application = app.or_else(|| self.application.clone());
}
pub fn set_user(&mut self, user: RoleName) {
pub fn set_user(&mut self, user: SmolStr) {
self.user = Some(user);
}

View File

@@ -62,79 +62,3 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallib
pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
r.context("join error").and_then(|x| x)
}
macro_rules! smol_str_wrapper {
($name:ident) => {
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
pub struct $name(smol_str::SmolStr);
impl $name {
pub fn as_str(&self) -> &str {
self.0.as_str()
}
}
impl std::fmt::Display for $name {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
impl<T> std::cmp::PartialEq<T> for $name
where
smol_str::SmolStr: std::cmp::PartialEq<T>,
{
fn eq(&self, other: &T) -> bool {
self.0.eq(other)
}
}
impl<T> From<T> for $name
where
smol_str::SmolStr: From<T>,
{
fn from(x: T) -> Self {
Self(x.into())
}
}
impl AsRef<str> for $name {
fn as_ref(&self) -> &str {
self.0.as_ref()
}
}
impl std::ops::Deref for $name {
type Target = str;
fn deref(&self) -> &str {
&*self.0
}
}
impl<'de> serde::de::Deserialize<'de> for $name {
fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
<smol_str::SmolStr as serde::de::Deserialize<'de>>::deserialize(d).map(Self)
}
}
impl serde::Serialize for $name {
fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
self.0.serialize(s)
}
}
};
}
// 90% of role name strings are 20 characters or less.
smol_str_wrapper!(RoleName);
// 50% of endpoint strings are 23 characters or less.
smol_str_wrapper!(EndpointId);
// 50% of branch strings are 23 characters or less.
smol_str_wrapper!(BranchId);
// 90% of project strings are 23 characters or less.
smol_str_wrapper!(ProjectId);
// will usually equal endpoint ID
smol_str_wrapper!(EndpointCacheKey);
smol_str_wrapper!(DbName);

View File

@@ -1,7 +1,10 @@
use ::metrics::{
exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge_vec, Histogram,
HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGaugeVec,
exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
IntCounterPairVec, IntCounterVec,
};
use prometheus::{
register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
IntGaugeVec,
};
use once_cell::sync::Lazy;
@@ -233,13 +236,3 @@ pub const fn bool_to_str(x: bool) -> &'static str {
"false"
}
}
pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
register_hll_vec!(
32,
"proxy_connecting_endpoints",
"HLL approximate cardinality of endpoints that are connecting",
&["protocol"],
)
.unwrap()
});

View File

@@ -2,34 +2,36 @@
mod tests;
pub mod connect_compute;
pub mod handshake;
pub mod passthrough;
pub mod retry;
use crate::{
auth,
cancellation::{self, CancelMap},
compute,
config::{ProxyConfig, TlsConfig},
config::{AuthenticationConfig, ProxyConfig, TlsConfig},
console::messages::MetricsAuxInfo,
context::RequestMonitoring,
metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE},
metrics::{
NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
},
protocol2::WithClientIp,
proxy::{handshake::handshake, passthrough::proxy_pass},
rate_limiter::EndpointRateLimiter,
stream::{PqStream, Stream},
EndpointCacheKey,
usage_metrics::{Ids, USAGE_METRICS},
};
use anyhow::{bail, Context};
use futures::TryFutureExt;
use itertools::Itertools;
use once_cell::sync::OnceCell;
use pq_proto::{BeMessage as Be, StartupMessageParams};
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use regex::Regex;
use smol_str::{format_smolstr, SmolStr};
use smol_str::SmolStr;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, Instrument};
use utils::measured_stream::MeasuredStream;
use self::connect_compute::{connect_to_compute, TcpMechanism};
@@ -77,13 +79,6 @@ pub async fn task_main(
let cancel_map = Arc::clone(&cancel_map);
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
let session_span = info_span!(
"handle_client",
?session_id,
peer_addr = tracing::field::Empty,
ep = tracing::field::Empty,
);
connections.spawn(
async move {
info!("accepted postgres client connection");
@@ -107,18 +102,22 @@ pub async fn task_main(
handle_client(
config,
&mut ctx,
cancel_map,
&cancel_map,
socket,
ClientMode::Tcp,
endpoint_rate_limiter,
)
.await
}
.instrument(info_span!(
"handle_client",
?session_id,
peer_addr = tracing::field::Empty
))
.unwrap_or_else(move |e| {
// Acknowledge that the task has finished with an error.
error!("per-client task finished with an error: {e:#}");
})
.instrument(session_span),
error!(?session_id, "per-client task finished with an error: {e:#}");
}),
);
}
@@ -171,7 +170,7 @@ impl ClientMode {
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
config: &'static ProxyConfig,
ctx: &mut RequestMonitoring,
cancel_map: Arc<CancelMap>,
cancel_map: &CancelMap,
stream: S,
mode: ClientMode,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -192,88 +191,138 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let tls = config.tls_config.as_ref();
let pause = ctx.latency_timer.pause();
let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map);
let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map);
let (mut stream, params) = match do_handshake.await? {
Some(x) => x,
None => return Ok(()), // it's a cancellation request
};
drop(pause);
let hostname = mode.hostname(stream.get_ref());
let common_names = tls.map(|tls| &tls.common_names);
// Extract credentials which we're going to use for auth.
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names))
.transpose();
let user_info = {
let hostname = mode.hostname(stream.get_ref());
let user_info = match result {
Ok(user_info) => user_info,
Err(e) => stream.throw_error(e).await?,
let common_names = tls.map(|tls| &tls.common_names);
let result = config
.auth_backend
.as_ref()
.map(|_| {
auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names)
})
.transpose();
match result {
Ok(user_info) => user_info,
Err(e) => stream.throw_error(e).await?,
}
};
// check rate limit
if let Some(ep) = user_info.get_endpoint() {
if !endpoint_rate_limiter.check(ep) {
return stream
.throw_error(auth::AuthError::too_many_connections())
.await;
ctx.set_endpoint_id(user_info.get_endpoint());
let client = Client::new(
stream,
user_info,
&params,
mode.allow_self_signed_compute(config),
endpoint_rate_limiter,
);
cancel_map
.with_session(|session| {
client.connect_to_db(ctx, session, mode, &config.authentication_config)
})
.await
}
/// Establish a (most probably, secure) connection with the client.
/// For better testing experience, `stream` can be any object satisfying the traits.
/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
/// we also take an extra care of propagating only the select handshake errors to client.
#[tracing::instrument(skip_all)]
async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
stream: S,
mut tls: Option<&TlsConfig>,
cancel_map: &CancelMap,
) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
// Client may try upgrading to each protocol only once
let (mut tried_ssl, mut tried_gss) = (false, false);
let mut stream = PqStream::new(Stream::from_raw(stream));
loop {
let msg = stream.read_startup_packet().await?;
info!("received {msg:?}");
use FeStartupPacket::*;
match msg {
SslRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_ssl => {
tried_ssl = true;
// We can't perform TLS handshake without a config
let enc = tls.is_some();
stream.write_message(&Be::EncryptionResponse(enc)).await?;
if let Some(tls) = tls.take() {
// Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later.
let (raw, read_buf) = stream.into_inner();
// TODO: Normally, client doesn't send any data before
// server says TLS handshake is ok and read_buf is empy.
// However, you could imagine pipelining of postgres
// SSLRequest + TLS ClientHello in one hunk similar to
// pipelining in our node js driver. We should probably
// support that by chaining read_buf with the stream.
if !read_buf.is_empty() {
bail!("data is sent before server replied with EncryptionResponse");
}
let tls_stream = raw.upgrade(tls.to_server_config()).await?;
let (_, tls_server_end_point) = tls
.cert_resolver
.resolve(tls_stream.get_ref().1.server_name())
.context("missing certificate")?;
stream = PqStream::new(Stream::Tls {
tls: Box::new(tls_stream),
tls_server_end_point,
});
}
}
_ => bail!(ERR_PROTO_VIOLATION),
},
GssEncRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_gss => {
tried_gss = true;
// Currently, we don't support GSSAPI
stream.write_message(&Be::EncryptionResponse(false)).await?;
}
_ => bail!(ERR_PROTO_VIOLATION),
},
StartupMessage { params, .. } => {
// Check that the config has been consumed during upgrade
// OR we didn't provide it at all (for dev purposes).
if tls.is_some() {
stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
}
info!(session_type = "normal", "successful handshake");
break Ok(Some((stream, params)));
}
CancelRequest(cancel_key_data) => {
cancel_map.cancel_session(cancel_key_data).await?;
info!(session_type = "cancellation", "successful handshake");
break Ok(None);
}
}
}
let user = user_info.get_user().to_owned();
let (mut node_info, user_info) = match user_info
.authenticate(
ctx,
&mut stream,
mode.allow_cleartext(),
&config.authentication_config,
)
.await
{
Ok(auth_result) => auth_result,
Err(e) => {
let db = params.get("database");
let app = params.get("application_name");
let params_span = tracing::info_span!("", ?user, ?db, ?app);
return stream.throw_error(e).instrument(params_span).await;
}
};
node_info.allow_self_signed_compute = mode.allow_self_signed_compute(config);
let aux = node_info.aux.clone();
let mut node = connect_to_compute(
ctx,
&TcpMechanism { params: &params },
node_info,
&user_info,
)
.or_else(|e| stream.throw_error(e))
.await?;
let session = cancel_map.get_session();
prepare_client_connection(&node, &session, &mut stream).await?;
// Before proxy passing, forward to compute whatever data is left in the
// PqStream input buffer. Normally there is none, but our serverless npm
// driver in pipeline mode sends startup, password and first query
// immediately after opening the connection.
let (stream, read_buf) = stream.into_inner();
node.stream.write_all(&read_buf).await?;
proxy_pass(ctx, stream, node.stream, aux).await
}
/// Finish client connection initialization: confirm auth success, send params, etc.
#[tracing::instrument(skip_all)]
async fn prepare_client_connection(
node: &compute::PostgresConnection,
session: &cancellation::Session,
session: cancellation::Session<'_>,
stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> anyhow::Result<()> {
// Register compute's query cancellation token and produce a new, unique one.
@@ -299,6 +348,151 @@ async fn prepare_client_connection(
Ok(())
}
/// Forward bytes in both directions (client <-> compute).
#[tracing::instrument(skip_all)]
pub async fn proxy_pass(
ctx: &mut RequestMonitoring,
client: impl AsyncRead + AsyncWrite + Unpin,
compute: impl AsyncRead + AsyncWrite + Unpin,
aux: MetricsAuxInfo,
) -> anyhow::Result<()> {
ctx.set_success();
ctx.log();
let usage = USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(),
branch_id: aux.branch_id.clone(),
});
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
let mut client = MeasuredStream::new(
client,
|_| {},
|cnt| {
// Number of bytes we sent to the client (outbound).
m_sent.inc_by(cnt as u64);
m_sent2.inc_by(cnt as u64);
usage.record_egress(cnt as u64);
},
);
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
let mut compute = MeasuredStream::new(
compute,
|_| {},
|cnt| {
// Number of bytes the client sent to the compute node (inbound).
m_recv.inc_by(cnt as u64);
m_recv2.inc_by(cnt as u64);
},
);
// Starting from here we only proxy the client's traffic.
info!("performing the proxy pass...");
let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
Ok(())
}
/// Thin connection context.
struct Client<'a, S> {
/// The underlying libpq protocol stream.
stream: PqStream<Stream<S>>,
/// Client credentials that we care about.
user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
/// KV-dictionary with PostgreSQL connection params.
params: &'a StartupMessageParams,
/// Allow self-signed certificates (for testing).
allow_self_signed_compute: bool,
/// Rate limiter for endpoints
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
}
impl<'a, S> Client<'a, S> {
/// Construct a new connection context.
fn new(
stream: PqStream<Stream<S>>,
user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
params: &'a StartupMessageParams,
allow_self_signed_compute: bool,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Self {
Self {
stream,
user_info,
params,
allow_self_signed_compute,
endpoint_rate_limiter,
}
}
}
impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
/// Let the client authenticate and connect to the designated compute node.
// Instrumentation logs endpoint name everywhere. Doesn't work for link
// auth; strictly speaking we don't know endpoint name in its case.
#[tracing::instrument(name = "", fields(ep = %self.user_info.get_endpoint().unwrap_or_default()), skip_all)]
async fn connect_to_db(
self,
ctx: &mut RequestMonitoring,
session: cancellation::Session<'_>,
mode: ClientMode,
config: &'static AuthenticationConfig,
) -> anyhow::Result<()> {
let Self {
mut stream,
user_info,
params,
allow_self_signed_compute,
endpoint_rate_limiter,
} = self;
// check rate limit
if let Some(ep) = user_info.get_endpoint() {
if !endpoint_rate_limiter.check(ep) {
return stream
.throw_error(auth::AuthError::too_many_connections())
.await;
}
}
let user = user_info.get_user().to_owned();
let auth_result = match user_info
.authenticate(ctx, &mut stream, mode.allow_cleartext(), config)
.await
{
Ok(auth_result) => auth_result,
Err(e) => {
let db = params.get("database");
let app = params.get("application_name");
let params_span = tracing::info_span!("", ?user, ?db, ?app);
return stream.throw_error(e).instrument(params_span).await;
}
};
let (mut node_info, user_info) = auth_result;
node_info.allow_self_signed_compute = allow_self_signed_compute;
let aux = node_info.aux.clone();
let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &user_info)
.or_else(|e| stream.throw_error(e))
.await?;
prepare_client_connection(&node, session, &mut stream).await?;
// Before proxy passing, forward to compute whatever data is left in the
// PqStream input buffer. Normally there is none, but our serverless npm
// driver in pipeline mode sends startup, password and first query
// immediately after opening the connection.
let (stream, read_buf) = stream.into_inner();
node.stream.write_all(&read_buf).await?;
proxy_pass(ctx, stream, node.stream, aux).await
}
}
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct NeonOptions(Vec<(SmolStr, SmolStr)>);
@@ -322,21 +516,20 @@ impl NeonOptions {
Self(options)
}
pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
pub fn get_cache_key(&self, prefix: &str) -> SmolStr {
// prefix + format!(" {k}:{v}")
// kinda jank because SmolStr is immutable
std::iter::once(prefix)
.chain(self.0.iter().flat_map(|(k, v)| [" ", &**k, ":", &**v]))
.collect::<SmolStr>()
.into()
.collect()
}
/// <https://swagger.io/docs/specification/serialization/> DeepObject format
/// `paramName[prop1]=value1&paramName[prop2]=value2&...`
pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> {
pub fn to_deep_object(&self) -> Vec<(String, SmolStr)> {
self.0
.iter()
.map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone()))
.map(|(k, v)| (format!("options[{}]", k), v.clone()))
.collect()
}
}

View File

@@ -1,96 +0,0 @@
use anyhow::{bail, Context};
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info;
use crate::{
cancellation::CancelMap,
config::TlsConfig,
proxy::{ERR_INSECURE_CONNECTION, ERR_PROTO_VIOLATION},
stream::{PqStream, Stream},
};
/// Establish a (most probably, secure) connection with the client.
/// For better testing experience, `stream` can be any object satisfying the traits.
/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
/// we also take an extra care of propagating only the select handshake errors to client.
#[tracing::instrument(skip_all)]
pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
stream: S,
mut tls: Option<&TlsConfig>,
cancel_map: &CancelMap,
) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
// Client may try upgrading to each protocol only once
let (mut tried_ssl, mut tried_gss) = (false, false);
let mut stream = PqStream::new(Stream::from_raw(stream));
loop {
let msg = stream.read_startup_packet().await?;
info!("received {msg:?}");
use FeStartupPacket::*;
match msg {
SslRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_ssl => {
tried_ssl = true;
// We can't perform TLS handshake without a config
let enc = tls.is_some();
stream.write_message(&Be::EncryptionResponse(enc)).await?;
if let Some(tls) = tls.take() {
// Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later.
let (raw, read_buf) = stream.into_inner();
// TODO: Normally, client doesn't send any data before
// server says TLS handshake is ok and read_buf is empy.
// However, you could imagine pipelining of postgres
// SSLRequest + TLS ClientHello in one hunk similar to
// pipelining in our node js driver. We should probably
// support that by chaining read_buf with the stream.
if !read_buf.is_empty() {
bail!("data is sent before server replied with EncryptionResponse");
}
let tls_stream = raw.upgrade(tls.to_server_config()).await?;
let (_, tls_server_end_point) = tls
.cert_resolver
.resolve(tls_stream.get_ref().1.server_name())
.context("missing certificate")?;
stream = PqStream::new(Stream::Tls {
tls: Box::new(tls_stream),
tls_server_end_point,
});
}
}
_ => bail!(ERR_PROTO_VIOLATION),
},
GssEncRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_gss => {
tried_gss = true;
// Currently, we don't support GSSAPI
stream.write_message(&Be::EncryptionResponse(false)).await?;
}
_ => bail!(ERR_PROTO_VIOLATION),
},
StartupMessage { params, .. } => {
// Check that the config has been consumed during upgrade
// OR we didn't provide it at all (for dev purposes).
if tls.is_some() {
stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
}
info!(session_type = "normal", "successful handshake");
break Ok(Some((stream, params)));
}
CancelRequest(cancel_key_data) => {
cancel_map.cancel_session(cancel_key_data).await?;
info!(session_type = "cancellation", "successful handshake");
break Ok(None);
}
}
}
}

View File

@@ -1,57 +0,0 @@
use crate::{
console::messages::MetricsAuxInfo,
context::RequestMonitoring,
metrics::{NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER},
usage_metrics::{Ids, USAGE_METRICS},
};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info;
use utils::measured_stream::MeasuredStream;
/// Forward bytes in both directions (client <-> compute).
#[tracing::instrument(skip_all)]
pub async fn proxy_pass(
ctx: &mut RequestMonitoring,
client: impl AsyncRead + AsyncWrite + Unpin,
compute: impl AsyncRead + AsyncWrite + Unpin,
aux: MetricsAuxInfo,
) -> anyhow::Result<()> {
ctx.set_success();
ctx.log();
let usage = USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(),
branch_id: aux.branch_id.clone(),
});
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
let mut client = MeasuredStream::new(
client,
|_| {},
|cnt| {
// Number of bytes we sent to the client (outbound).
m_sent.inc_by(cnt as u64);
m_sent2.inc_by(cnt as u64);
usage.record_egress(cnt as u64);
},
);
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
let mut compute = MeasuredStream::new(
compute,
|_| {},
|cnt| {
// Number of bytes the client sent to the compute node (inbound).
m_recv.inc_by(cnt as u64);
m_recv2.inc_by(cnt as u64);
},
);
// Starting from here we only proxy the client's traffic.
info!("performing the proxy pass...");
let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
Ok(())
}

View File

@@ -11,12 +11,11 @@ use anyhow::bail;
use dashmap::DashMap;
use itertools::Itertools;
use rand::{rngs::StdRng, Rng, SeedableRng};
use smol_str::SmolStr;
use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
use tokio::time::{timeout, Duration, Instant};
use tracing::info;
use crate::EndpointId;
use super::{
limit_algorithm::{LimitAlgorithm, Sample},
RateLimiterConfig,
@@ -34,7 +33,7 @@ use super::{
// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
// I went with a more expensive way that yields user-friendlier error messages.
pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
map: DashMap<EndpointId, Vec<RateBucket>, Hasher>,
map: DashMap<SmolStr, Vec<RateBucket>, Hasher>,
info: &'static [RateBucketInfo],
access_count: AtomicUsize,
rand: Mutex<Rand>,
@@ -147,7 +146,7 @@ impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
}
/// Check that number of connections to the endpoint is below `max_rps` rps.
pub fn check(&self, endpoint: EndpointId) -> bool {
pub fn check(&self, endpoint: SmolStr) -> bool {
// do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
// worst case memory usage is about:
// = 2 * 2048 * 64 * (48B + 72B)
@@ -494,13 +493,11 @@ mod tests {
use futures::{task::noop_waker_ref, Future};
use rand::SeedableRng;
use rustc_hash::FxHasher;
use smol_str::SmolStr;
use tokio::time;
use super::{EndpointRateLimiter, Limiter, Outcome};
use crate::{
rate_limiter::{RateBucketInfo, RateLimitAlgorithm},
EndpointId,
};
use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm};
#[tokio::test]
async fn it_works() {
@@ -657,7 +654,7 @@ mod tests {
RateBucketInfo::validate(&mut rates).unwrap();
let limiter = EndpointRateLimiter::new(Vec::leak(rates));
let endpoint = EndpointId::from("ep-my-endpoint-1234");
let endpoint = SmolStr::from("ep-my-endpoint-1234");
time::pause();

View File

@@ -3,8 +3,9 @@ use std::{convert::Infallible, sync::Arc};
use futures::StreamExt;
use redis::aio::PubSub;
use serde::Deserialize;
use smol_str::SmolStr;
use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName};
use crate::cache::project_info::ProjectInfoCache;
const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
@@ -45,12 +46,12 @@ enum Notification {
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct AllowedIpsUpdate {
project_id: ProjectId,
project_id: SmolStr,
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct PasswordUpdate {
project_id: ProjectId,
role_name: RoleName,
project_id: SmolStr,
role_name: SmolStr,
}
fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
where

View File

@@ -41,8 +41,6 @@ use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, warn, Instrument};
use utils::http::{error::ApiError, json::json_response};
pub const SERVERLESS_DRIVER_SNI: &str = "api";
pub async fn task_main(
config: &'static ProxyConfig,
ws_listener: TcpListener,
@@ -230,7 +228,7 @@ async fn request_handler(
config,
&mut ctx,
websocket,
cancel_map,
&cancel_map,
host,
endpoint_rate_limiter,
)

Some files were not shown because too many files have changed in this diff Show More