mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-29 00:00:38 +00:00
Compare commits
194 Commits
release-pr
...
release-61
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
28ee7cdede | ||
|
|
7b63092958 | ||
|
|
31bfeaf934 | ||
|
|
21b3a191bf | ||
|
|
f7f9b4aaec | ||
|
|
bba062e262 | ||
|
|
067363fe95 | ||
|
|
affe408433 | ||
|
|
9b883e4651 | ||
|
|
b98b301d56 | ||
|
|
ed7ee73cba | ||
|
|
fceace835b | ||
|
|
1b508a6082 | ||
|
|
f87b031876 | ||
|
|
9f1ba2c4bf | ||
|
|
9868bb3346 | ||
|
|
27da0e9cf5 | ||
|
|
de9bf2af6c | ||
|
|
3d2c2ce139 | ||
|
|
82a2081d61 | ||
|
|
ff174a88c0 | ||
|
|
ef3ebfaf67 | ||
|
|
ae1af558b4 | ||
|
|
c150ad4ee2 | ||
|
|
a98ccd185b | ||
|
|
9f796ebba9 | ||
|
|
d51ca338c4 | ||
|
|
07e78102bf | ||
|
|
b21e131d11 | ||
|
|
abe3b4e005 | ||
|
|
18e7c2b7a1 | ||
|
|
ad5d784fb7 | ||
|
|
85d47637ee | ||
|
|
7e818ee390 | ||
|
|
bff505426e | ||
|
|
bf7de92dc2 | ||
|
|
9dc71f5a88 | ||
|
|
2ede9d7a25 | ||
|
|
ea5460843c | ||
|
|
5b16624bcc | ||
|
|
349373cb11 | ||
|
|
957f99cad5 | ||
|
|
2a3a136474 | ||
|
|
cfaf30f5e8 | ||
|
|
72c2d0812e | ||
|
|
537ecf45f8 | ||
|
|
1637a6ee05 | ||
|
|
d74fb7b879 | ||
|
|
7973c3e941 | ||
|
|
085bbaf5f8 | ||
|
|
85b5219861 | ||
|
|
7472c69954 | ||
|
|
3f8819827c | ||
|
|
c440756410 | ||
|
|
0e600eb921 | ||
|
|
a1df835e28 | ||
|
|
119ddf6ccf | ||
|
|
90f447b79d | ||
|
|
7dd71f4126 | ||
|
|
8532d72276 | ||
|
|
d3ff47f572 | ||
|
|
8cc768254f | ||
|
|
5c80743c9c | ||
|
|
5bba3e3c75 | ||
|
|
6caf702417 | ||
|
|
32f668f5e7 | ||
|
|
a91f9d5832 | ||
|
|
547acde6cd | ||
|
|
bea6532881 | ||
|
|
8e2fe6b22e | ||
|
|
4d75e1ef81 | ||
|
|
4c7c00268c | ||
|
|
f28abb953d | ||
|
|
4df39d7304 | ||
|
|
bfc7338246 | ||
|
|
35dac6e6c8 | ||
|
|
e619e8703e | ||
|
|
6fd35bfe32 | ||
|
|
547a431b0d | ||
|
|
f8c01c6341 | ||
|
|
1145700f87 | ||
|
|
44339f5b70 | ||
|
|
7b4a9c1d82 | ||
|
|
3b2fc27de4 | ||
|
|
0b6492e7d3 | ||
|
|
7cfaecbeb6 | ||
|
|
472acae615 | ||
|
|
108bf56e44 | ||
|
|
e83a499ab4 | ||
|
|
ebf3bfadde | ||
|
|
ab06240fae | ||
|
|
cec216c5c0 | ||
|
|
930201e033 | ||
|
|
8328580dc2 | ||
|
|
8d9b632f2a | ||
|
|
55d37c77b9 | ||
|
|
0948fb6bf1 | ||
|
|
285c6d2974 | ||
|
|
a5491463e1 | ||
|
|
a58827f952 | ||
|
|
36b790f282 | ||
|
|
3ef7748e6b | ||
|
|
f3310143e4 | ||
|
|
05b4169644 | ||
|
|
d1495755e7 | ||
|
|
c8dd78c6c8 | ||
|
|
b44ee3950a | ||
|
|
64334f497d | ||
|
|
5ffcb688cc | ||
|
|
32fc2dd683 | ||
|
|
d35ddfbab7 | ||
|
|
3ee82a9895 | ||
|
|
e770aeee92 | ||
|
|
32828cddd6 | ||
|
|
bd2046e1ab | ||
|
|
7e2a3d2728 | ||
|
|
0e4832308d | ||
|
|
0a63bc4818 | ||
|
|
2897dcc9aa | ||
|
|
1d0ec50ddb | ||
|
|
a86b43fcd7 | ||
|
|
b917868ada | ||
|
|
7b7d16f52e | ||
|
|
fee4169b6b | ||
|
|
47e06a2cc6 | ||
|
|
c4423c0623 | ||
|
|
a11cf03123 | ||
|
|
08b33adfee | ||
|
|
4fb50144dd | ||
|
|
c500137ca9 | ||
|
|
252c4acec9 | ||
|
|
db70c175e6 | ||
|
|
ed3b4a58b4 | ||
|
|
2863d1df63 | ||
|
|
320b24eab3 | ||
|
|
13a8a5b09b | ||
|
|
64ccdf65e0 | ||
|
|
1ae6aa09dd | ||
|
|
aeb68e51df | ||
|
|
c3e5223a5d | ||
|
|
daaa3211a4 | ||
|
|
7ff9989dd5 | ||
|
|
ed3b97604c | ||
|
|
47c50ec460 | ||
|
|
8c0ec2f681 | ||
|
|
588bda98e7 | ||
|
|
504ca7720f | ||
|
|
cf4ea92aad | ||
|
|
325294bced | ||
|
|
86c8ba2563 | ||
|
|
feeb2dc6fa | ||
|
|
57f476ff5a | ||
|
|
7ee2bebdb7 | ||
|
|
be598f1bf4 | ||
|
|
939b5954a5 | ||
|
|
371020fe6a | ||
|
|
f45818abed | ||
|
|
0384267d58 | ||
|
|
62b3bd968a | ||
|
|
e3e3bc3542 | ||
|
|
be014a2222 | ||
|
|
2e1fe71cc0 | ||
|
|
068c158ca5 | ||
|
|
b16e4f689f | ||
|
|
dbff725a0c | ||
|
|
7fa4628434 | ||
|
|
fc538a38b9 | ||
|
|
c2e7cb324f | ||
|
|
101043122e | ||
|
|
c4d7d59825 | ||
|
|
0de1e1d664 | ||
|
|
271598b77f | ||
|
|
459bc479dc | ||
|
|
c213373a59 | ||
|
|
e0addc100d | ||
|
|
0519138b04 | ||
|
|
5da39b469c | ||
|
|
82027e22dd | ||
|
|
c431e2f1c5 | ||
|
|
4e5724d9c3 | ||
|
|
0d3e499059 | ||
|
|
7b860b837c | ||
|
|
41fc96e20f | ||
|
|
fb2b1ce57b | ||
|
|
464717451b | ||
|
|
c6ed86d3d0 | ||
|
|
f0a9017008 | ||
|
|
bb7949ba00 | ||
|
|
1df0f69664 | ||
|
|
970066a914 | ||
|
|
1ebd3897c0 | ||
|
|
6460beffcd | ||
|
|
6f7f8958db | ||
|
|
936a00e077 |
2
.gitattributes
vendored
2
.gitattributes
vendored
@@ -1,2 +0,0 @@
|
||||
# allows for nicer hunk headers with git show
|
||||
*.rs diff=rust
|
||||
291
.github/workflows/_build-and-test-locally.yml
vendored
291
.github/workflows/_build-and-test-locally.yml
vendored
@@ -1,291 +0,0 @@
|
||||
name: Build and Test Locally
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
arch:
|
||||
description: 'x64 or arm64'
|
||||
required: true
|
||||
type: string
|
||||
build-tag:
|
||||
description: 'build tag'
|
||||
required: true
|
||||
type: string
|
||||
build-tools-image:
|
||||
description: 'build-tools image'
|
||||
required: true
|
||||
type: string
|
||||
build-type:
|
||||
description: 'debug or release'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
env:
|
||||
RUST_BACKTRACE: 1
|
||||
COPT: '-Werror'
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
jobs:
|
||||
build-neon:
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
container:
|
||||
image: ${{ inputs.build-tools-image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# Raise locked memory limit for tokio-epoll-uring.
|
||||
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
|
||||
# io_uring will account the memory of the CQ and SQ as locked.
|
||||
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
env:
|
||||
BUILD_TYPE: ${{ inputs.build-type }}
|
||||
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG: ${{ inputs.build-tag }}
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
for r in 14 15 16; do
|
||||
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 16 revision for caching
|
||||
id: pg_v16_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
|
||||
|
||||
# Set some environment variables used by all the steps.
|
||||
#
|
||||
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
|
||||
# It also includes --features, if any
|
||||
#
|
||||
# CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
|
||||
# because "cargo metadata" doesn't accept --release or --debug options
|
||||
#
|
||||
# We run tests with addtional features, that are turned off by default (e.g. in release builds), see
|
||||
# corresponding Cargo.toml files for their descriptions.
|
||||
- name: Set env variables
|
||||
run: |
|
||||
CARGO_FEATURES="--features testing"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked --release"
|
||||
fi
|
||||
{
|
||||
echo "cov_prefix=${cov_prefix}"
|
||||
echo "CARGO_FEATURES=${CARGO_FEATURES}"
|
||||
echo "CARGO_FLAGS=${CARGO_FLAGS}"
|
||||
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
|
||||
} >> $GITHUB_ENV
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v14 -j$(nproc)
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v15 -j$(nproc)
|
||||
|
||||
- name: Build postgres v16
|
||||
if: steps.cache_pg_16.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v16 -j$(nproc)
|
||||
|
||||
- name: Build neon extensions
|
||||
run: mold -run make neon-pg-ext -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: mold -run make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
# Do install *before* running rust tests because they might recompile the
|
||||
# binaries with different features/flags.
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
# Install target binaries
|
||||
mkdir -p /tmp/neon/bin/
|
||||
binaries=$(
|
||||
${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
|
||||
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
|
||||
)
|
||||
for bin in $binaries; do
|
||||
SRC=target/$BUILD_TYPE/$bin
|
||||
DST=/tmp/neon/bin/$bin
|
||||
cp "$SRC" "$DST"
|
||||
done
|
||||
|
||||
# Install test executables and write list of all binaries (for code coverage)
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
# Keep bloated coverage data files away from the rest of the artifact
|
||||
mkdir -p /tmp/coverage/
|
||||
|
||||
mkdir -p /tmp/neon/test_bin/
|
||||
|
||||
test_exe_paths=$(
|
||||
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
|
||||
jq -r '.executable | select(. != null)'
|
||||
)
|
||||
for bin in $test_exe_paths; do
|
||||
SRC=$bin
|
||||
DST=/tmp/neon/test_bin/$(basename $bin)
|
||||
|
||||
# We don't need debug symbols for code coverage, so strip them out to make
|
||||
# the artifact smaller.
|
||||
strip "$SRC" -o "$DST"
|
||||
echo "$DST" >> /tmp/coverage/binaries.list
|
||||
done
|
||||
|
||||
for bin in $binaries; do
|
||||
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
|
||||
done
|
||||
fi
|
||||
|
||||
- name: Run rust tests
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
|
||||
export LD_LIBRARY_PATH
|
||||
|
||||
#nextest does not yet support running doctests
|
||||
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
done
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
|
||||
|
||||
- name: Install postgres binaries
|
||||
run: cp -a pg_install /tmp/neon/pg_install
|
||||
|
||||
- name: Upload Neon artifact
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
|
||||
- name: Merge and upload coverage data
|
||||
if: inputs.build-type == 'debug'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
# Run test on x64 only
|
||||
if: inputs.arch == 'x64'
|
||||
needs: [ build-neon ]
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
container:
|
||||
image: ${{ inputs.build-tools-image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
pg_version: [ v14, v15, v16 ]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Pytest regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
timeout-minutes: 60
|
||||
with:
|
||||
build_type: ${{ inputs.build-type }}
|
||||
test_selection: regress
|
||||
needs_postgres_source: true
|
||||
run_with_real_s3: true
|
||||
real_s3_bucket: neon-github-ci-tests
|
||||
real_s3_region: eu-central-1
|
||||
rerun_flaky: true
|
||||
pg_version: ${{ matrix.pg_version }}
|
||||
env:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ inputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
PAGESERVER_GET_IMPL: vectored
|
||||
PAGESERVER_VALIDATE_VEC_GET: true
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
- name: Merge and upload coverage data
|
||||
if: |
|
||||
false &&
|
||||
inputs.build-type == 'debug' && matrix.pg_version == 'v14'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
328
.github/workflows/build_and_test.yml
vendored
328
.github/workflows/build_and_test.yml
vendored
@@ -125,11 +125,7 @@ jobs:
|
||||
|
||||
check-codestyle-rust:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
@@ -197,27 +193,291 @@ jobs:
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
|
||||
build-and-test-locally:
|
||||
needs: [ tag, build-build-tools-image ]
|
||||
build-neon:
|
||||
needs: [ check-permissions, tag, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# Raise locked memory limit for tokio-epoll-uring.
|
||||
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
|
||||
# io_uring will account the memory of the CQ and SQ as locked.
|
||||
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64 ]
|
||||
build-type: [ debug, release ]
|
||||
include:
|
||||
- build-type: release
|
||||
arch: arm64
|
||||
uses: ./.github/workflows/_build-and-test-locally.yml
|
||||
with:
|
||||
arch: ${{ matrix.arch }}
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
build-tag: ${{ needs.tag.outputs.build-tag }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
secrets: inherit
|
||||
build_type: [ debug, release ]
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build_type }}
|
||||
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
for r in 14 15 16; do
|
||||
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 16 revision for caching
|
||||
id: pg_v16_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
|
||||
|
||||
# Set some environment variables used by all the steps.
|
||||
#
|
||||
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
|
||||
# It also includes --features, if any
|
||||
#
|
||||
# CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
|
||||
# because "cargo metadata" doesn't accept --release or --debug options
|
||||
#
|
||||
# We run tests with addtional features, that are turned off by default (e.g. in release builds), see
|
||||
# corresponding Cargo.toml files for their descriptions.
|
||||
- name: Set env variables
|
||||
run: |
|
||||
CARGO_FEATURES="--features testing"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked --release"
|
||||
fi
|
||||
{
|
||||
echo "cov_prefix=${cov_prefix}"
|
||||
echo "CARGO_FEATURES=${CARGO_FEATURES}"
|
||||
echo "CARGO_FLAGS=${CARGO_FLAGS}"
|
||||
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
|
||||
} >> $GITHUB_ENV
|
||||
|
||||
# Disabled for now
|
||||
# Don't include the ~/.cargo/registry/src directory. It contains just
|
||||
# uncompressed versions of the crates in ~/.cargo/registry/cache
|
||||
# directory, and it's faster to let 'cargo' to rebuild it from the
|
||||
# compressed crates.
|
||||
# - name: Cache cargo deps
|
||||
# id: cache_cargo
|
||||
# uses: actions/cache@v4
|
||||
# with:
|
||||
# path: |
|
||||
# ~/.cargo/registry/
|
||||
# !~/.cargo/registry/src
|
||||
# ~/.cargo/git/
|
||||
# target/
|
||||
# # Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
||||
# key: |
|
||||
# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v14 -j$(nproc)
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v15 -j$(nproc)
|
||||
|
||||
- name: Build postgres v16
|
||||
if: steps.cache_pg_16.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v16 -j$(nproc)
|
||||
|
||||
- name: Build neon extensions
|
||||
run: mold -run make neon-pg-ext -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: mold -run make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
# Do install *before* running rust tests because they might recompile the
|
||||
# binaries with different features/flags.
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
# Install target binaries
|
||||
mkdir -p /tmp/neon/bin/
|
||||
binaries=$(
|
||||
${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
|
||||
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
|
||||
)
|
||||
for bin in $binaries; do
|
||||
SRC=target/$BUILD_TYPE/$bin
|
||||
DST=/tmp/neon/bin/$bin
|
||||
cp "$SRC" "$DST"
|
||||
done
|
||||
|
||||
# Install test executables and write list of all binaries (for code coverage)
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
# Keep bloated coverage data files away from the rest of the artifact
|
||||
mkdir -p /tmp/coverage/
|
||||
|
||||
mkdir -p /tmp/neon/test_bin/
|
||||
|
||||
test_exe_paths=$(
|
||||
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
|
||||
jq -r '.executable | select(. != null)'
|
||||
)
|
||||
for bin in $test_exe_paths; do
|
||||
SRC=$bin
|
||||
DST=/tmp/neon/test_bin/$(basename $bin)
|
||||
|
||||
# We don't need debug symbols for code coverage, so strip them out to make
|
||||
# the artifact smaller.
|
||||
strip "$SRC" -o "$DST"
|
||||
echo "$DST" >> /tmp/coverage/binaries.list
|
||||
done
|
||||
|
||||
for bin in $binaries; do
|
||||
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
|
||||
done
|
||||
fi
|
||||
|
||||
- name: Run rust tests
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
|
||||
export LD_LIBRARY_PATH
|
||||
|
||||
#nextest does not yet support running doctests
|
||||
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
done
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
|
||||
|
||||
- name: Install postgres binaries
|
||||
run: cp -a pg_install /tmp/neon/pg_install
|
||||
|
||||
- name: Upload Neon artifact
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
pg_version: [ v14, v15, v16 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Pytest regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
timeout-minutes: 60
|
||||
with:
|
||||
build_type: ${{ matrix.build_type }}
|
||||
test_selection: regress
|
||||
needs_postgres_source: true
|
||||
run_with_real_s3: true
|
||||
real_s3_bucket: neon-github-ci-tests
|
||||
real_s3_region: eu-central-1
|
||||
rerun_flaky: true
|
||||
pg_version: ${{ matrix.pg_version }}
|
||||
env:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
PAGESERVER_GET_IMPL: vectored
|
||||
PAGESERVER_VALIDATE_VEC_GET: true
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
- name: Merge and upload coverage data
|
||||
if: |
|
||||
false &&
|
||||
matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
|
||||
get-benchmarks-durations:
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
outputs:
|
||||
json: ${{ steps.get-benchmark-durations.outputs.json }}
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
@@ -228,6 +488,7 @@ jobs:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
@@ -252,8 +513,7 @@ jobs:
|
||||
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
|
||||
|
||||
benchmarks:
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
|
||||
needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
@@ -262,6 +522,7 @@ jobs:
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -309,7 +570,7 @@ jobs:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
|
||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||
outputs:
|
||||
report-url: ${{ steps.create-allure-report.outputs.report-url }}
|
||||
@@ -360,7 +621,7 @@ jobs:
|
||||
})
|
||||
|
||||
coverage-report:
|
||||
needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
|
||||
needs: [ check-permissions, regress-tests, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
@@ -511,8 +772,7 @@ jobs:
|
||||
pull: true
|
||||
file: Dockerfile
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
|
||||
# 23.07.2024 temporarily disable cache saving in the registry as it is very slow
|
||||
# cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
|
||||
tags: |
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
@@ -605,8 +865,7 @@ jobs:
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
# 23.07.2024 temporarily disable cache saving in the registry as it is very slow
|
||||
# cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
tags: |
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
@@ -626,8 +885,7 @@ jobs:
|
||||
file: Dockerfile.compute-node
|
||||
target: neon-pg-ext-test
|
||||
cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
# 23.07.2024 temporarily disable cache saving in the registry as it is very slow
|
||||
# cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
tags: |
|
||||
neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
|
||||
|
||||
@@ -965,7 +1223,7 @@ jobs:
|
||||
exit 1
|
||||
|
||||
deploy:
|
||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
|
||||
needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
@@ -1066,7 +1324,7 @@ jobs:
|
||||
})
|
||||
|
||||
promote-compatibility-data:
|
||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
|
||||
needs: [ check-permissions, promote-images, tag, regress-tests ]
|
||||
if: github.ref_name == 'release'
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
@@ -1105,7 +1363,7 @@ jobs:
|
||||
done
|
||||
|
||||
pin-build-tools-image:
|
||||
needs: [ build-build-tools-image, promote-images, build-and-test-locally ]
|
||||
needs: [ build-build-tools-image, promote-images, regress-tests ]
|
||||
if: github.ref_name == 'main'
|
||||
uses: ./.github/workflows/pin-build-tools-image.yml
|
||||
with:
|
||||
@@ -1127,7 +1385,7 @@ jobs:
|
||||
needs:
|
||||
- check-codestyle-python
|
||||
- check-codestyle-rust
|
||||
- build-and-test-locally
|
||||
- regress-tests
|
||||
- test-images
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
|
||||
215
.github/workflows/neon_extra_builds.yml
vendored
215
.github/workflows/neon_extra_builds.yml
vendored
@@ -133,6 +133,221 @@ jobs:
|
||||
- name: Check that no warnings are produced
|
||||
run: ./run_clippy.sh
|
||||
|
||||
check-linux-arm-build:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
timeout-minutes: 90
|
||||
runs-on: [ self-hosted, small-arm64 ]
|
||||
|
||||
env:
|
||||
# Use release build only, to have less debug info around
|
||||
# Hence keeping target/ (and general cache size) smaller
|
||||
BUILD_TYPE: release
|
||||
CARGO_FEATURES: --features testing
|
||||
CARGO_FLAGS: --release
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
for r in 14 15 16; do
|
||||
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 16 revision for caching
|
||||
id: pg_v16_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set env variables
|
||||
run: |
|
||||
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v14 -j$(nproc)
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v15 -j$(nproc)
|
||||
|
||||
- name: Build postgres v16
|
||||
if: steps.cache_pg_16.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v16 -j$(nproc)
|
||||
|
||||
- name: Build neon extensions
|
||||
run: mold -run make neon-pg-ext -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: mold -run make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
|
||||
|
||||
- name: Run cargo test
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
|
||||
export LD_LIBRARY_PATH
|
||||
|
||||
cargo nextest run $CARGO_FEATURES -j$(nproc)
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
|
||||
|
||||
check-codestyle-rust-arm:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
timeout-minutes: 90
|
||||
runs-on: [ self-hosted, small-arm64 ]
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
for r in 14 15 16; do
|
||||
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
|
||||
# This will catch compiler & clippy warnings in all feature combinations.
|
||||
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
|
||||
# NB: keep clippy args in sync with ./run_clippy.sh
|
||||
- run: |
|
||||
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
|
||||
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
|
||||
echo "No clippy args found in .neon_clippy_args"
|
||||
exit 1
|
||||
fi
|
||||
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
|
||||
|
||||
- name: Run cargo clippy (debug)
|
||||
if: matrix.build_type == 'debug'
|
||||
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
|
||||
- name: Run cargo clippy (release)
|
||||
if: matrix.build_type == 'release'
|
||||
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
||||
|
||||
- name: Check documentation generation
|
||||
if: matrix.build_type == 'release'
|
||||
run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
|
||||
env:
|
||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||
|
||||
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
||||
- name: Check formatting
|
||||
if: ${{ !cancelled() && matrix.build_type == 'release' }}
|
||||
run: cargo fmt --all -- --check
|
||||
|
||||
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
||||
- name: Check rust dependencies
|
||||
if: ${{ !cancelled() && matrix.build_type == 'release' }}
|
||||
run: |
|
||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
||||
|
||||
# https://github.com/EmbarkStudios/cargo-deny
|
||||
- name: Check rust licenses/bans/advisories/sources
|
||||
if: ${{ !cancelled() && matrix.build_type == 'release' }}
|
||||
run: cargo deny check
|
||||
|
||||
gather-rust-build-stats:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
if: |
|
||||
|
||||
66
Cargo.lock
generated
66
Cargo.lock
generated
@@ -261,6 +261,15 @@ version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
|
||||
|
||||
[[package]]
|
||||
name = "atomic-polyfill"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289"
|
||||
dependencies = [
|
||||
"critical-section",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atomic-take"
|
||||
version = "1.1.0"
|
||||
@@ -1442,6 +1451,12 @@ dependencies = [
|
||||
"itertools",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "critical-section"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.8"
|
||||
@@ -2267,6 +2282,15 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hash32"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
@@ -2315,6 +2339,18 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heapless"
|
||||
version = "0.8.0"
|
||||
source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963be2de24804acf5e5001#644653bf3b831c6bb4963be2de24804acf5e5001"
|
||||
dependencies = [
|
||||
"atomic-polyfill",
|
||||
"hash32",
|
||||
"rustc_version",
|
||||
"spin 0.9.8",
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.1"
|
||||
@@ -2348,6 +2384,16 @@ version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"
|
||||
|
||||
[[package]]
|
||||
name = "histogram"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hmac"
|
||||
version = "0.12.1"
|
||||
@@ -3967,7 +4013,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3980,7 +4026,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -3999,7 +4045,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4612,7 +4658,6 @@ name = "remote_storage"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"aws-config",
|
||||
"aws-credential-types",
|
||||
@@ -5655,6 +5700,9 @@ name = "spin"
|
||||
version = "0.9.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spki"
|
||||
@@ -5676,6 +5724,12 @@ dependencies = [
|
||||
"der 0.7.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
|
||||
|
||||
[[package]]
|
||||
name = "static_assertions"
|
||||
version = "1.1.0"
|
||||
@@ -5793,6 +5847,7 @@ dependencies = [
|
||||
"futures",
|
||||
"futures-util",
|
||||
"hex",
|
||||
"histogram",
|
||||
"humantime",
|
||||
"itertools",
|
||||
"once_cell",
|
||||
@@ -6218,7 +6273,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -6772,6 +6827,7 @@ dependencies = [
|
||||
"criterion",
|
||||
"fail",
|
||||
"futures",
|
||||
"heapless",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
|
||||
@@ -204,6 +204,9 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git",
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
|
||||
## Local libraries
|
||||
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
|
||||
18
Dockerfile
18
Dockerfile
@@ -93,14 +93,13 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
|
||||
|
||||
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
|
||||
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
|
||||
RUN mkdir -p /data/.neon/ && \
|
||||
echo "id=1234" > "/data/.neon/identity.toml" && \
|
||||
echo "broker_endpoint='http://storage_broker:50051'\n" \
|
||||
"pg_distrib_dir='/usr/local/'\n" \
|
||||
"listen_pg_addr='0.0.0.0:6400'\n" \
|
||||
"listen_http_addr='0.0.0.0:9898'\n" \
|
||||
> /data/.neon/pageserver.toml && \
|
||||
chown -R neon:neon /data/.neon
|
||||
RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
|
||||
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
|
||||
-c "id=1234" \
|
||||
-c "broker_endpoint='http://storage_broker:50051'" \
|
||||
-c "pg_distrib_dir='/usr/local/'" \
|
||||
-c "listen_pg_addr='0.0.0.0:6400'" \
|
||||
-c "listen_http_addr='0.0.0.0:9898'"
|
||||
|
||||
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
|
||||
# that want a particular postgres version will select it explicitly: this is just a default.
|
||||
@@ -111,6 +110,3 @@ VOLUME ["/data"]
|
||||
USER neon
|
||||
EXPOSE 6400
|
||||
EXPOSE 9898
|
||||
|
||||
CMD /usr/local/bin/pageserver -D /data/.neon
|
||||
|
||||
|
||||
13
Makefile
13
Makefile
@@ -69,8 +69,6 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
||||
# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
|
||||
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
|
||||
|
||||
CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
|
||||
|
||||
#
|
||||
# Top level Makefile to build Neon and PostgreSQL
|
||||
#
|
||||
@@ -81,24 +79,15 @@ all: neon postgres neon-pg-ext
|
||||
#
|
||||
# The 'postgres_ffi' depends on the Postgres headers.
|
||||
.PHONY: neon
|
||||
neon: postgres-headers walproposer-lib cargo-target-dir
|
||||
neon: postgres-headers walproposer-lib
|
||||
+@echo "Compiling Neon"
|
||||
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
|
||||
.PHONY: cargo-target-dir
|
||||
cargo-target-dir:
|
||||
# https://github.com/rust-lang/cargo/issues/14281
|
||||
mkdir -p target
|
||||
test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG
|
||||
|
||||
### PostgreSQL parts
|
||||
# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
|
||||
# to avoid the duplication in the future, but it's tolerable for now.
|
||||
#
|
||||
$(POSTGRES_INSTALL_DIR)/build/%/config.status:
|
||||
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)
|
||||
test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG
|
||||
|
||||
+@echo "Configuring Postgres $* build"
|
||||
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
|
||||
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
|
||||
|
||||
@@ -25,7 +25,6 @@ use pageserver_client::mgmt_api;
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::id::NodeId;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -75,10 +74,6 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
|
||||
fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
|
||||
toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
|
||||
}
|
||||
|
||||
fn pageserver_init_make_toml(
|
||||
&self,
|
||||
conf: NeonLocalInitPageserverConf,
|
||||
@@ -191,19 +186,6 @@ impl PageServerNode {
|
||||
.write_all(config.to_string().as_bytes())
|
||||
.context("write pageserver toml")?;
|
||||
drop(config_file);
|
||||
|
||||
let identity_file_path = datadir.join("identity.toml");
|
||||
let mut identity_file = std::fs::OpenOptions::new()
|
||||
.create_new(true)
|
||||
.write(true)
|
||||
.open(identity_file_path)
|
||||
.with_context(|| format!("open identity toml for write: {config_file_path:?}"))?;
|
||||
let identity_toml = self.pageserver_make_identity_toml(node_id);
|
||||
identity_file
|
||||
.write_all(identity_toml.to_string().as_bytes())
|
||||
.context("write identity toml")?;
|
||||
drop(identity_toml);
|
||||
|
||||
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
|
||||
|
||||
// Write metadata file, used by pageserver on startup to register itself with
|
||||
|
||||
@@ -33,7 +33,7 @@ echo $result | jq .
|
||||
|
||||
generate_id timeline_id
|
||||
PARAMS=(
|
||||
-sbf
|
||||
-sb
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
|
||||
|
||||
@@ -31,14 +31,25 @@ services:
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
environment:
|
||||
- BROKER_ENDPOINT='http://storage_broker:50051'
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
- AWS_SECRET_ACCESS_KEY=password
|
||||
#- RUST_BACKTRACE=1
|
||||
ports:
|
||||
#- 6400:6400 # pg protocol handler
|
||||
- 9898:9898 # http endpoints
|
||||
volumes:
|
||||
- ./pageserver_config:/data/.neon/
|
||||
entrypoint:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
command:
|
||||
- "/usr/local/bin/pageserver -D /data/.neon/
|
||||
-c \"broker_endpoint=$$BROKER_ENDPOINT\"
|
||||
-c \"listen_pg_addr='0.0.0.0:6400'\"
|
||||
-c \"listen_http_addr='0.0.0.0:9898'\"
|
||||
-c \"remote_storage={endpoint='http://minio:9000',
|
||||
bucket_name='neon',
|
||||
bucket_region='eu-north-1',
|
||||
prefix_in_bucket='/pageserver/'}\""
|
||||
depends_on:
|
||||
- storage_broker
|
||||
- minio_create_buckets
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
id=1234
|
||||
@@ -1,5 +0,0 @@
|
||||
broker_endpoint='http://storage_broker:50051'
|
||||
pg_distrib_dir='/usr/local/'
|
||||
listen_pg_addr='0.0.0.0:6400'
|
||||
listen_http_addr='0.0.0.0:9898'
|
||||
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
|
||||
@@ -5,6 +5,7 @@ pub mod utilization;
|
||||
pub use utilization::PageserverUtilization;
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
@@ -19,6 +20,7 @@ use serde::{Deserialize, Serialize};
|
||||
use serde_with::serde_as;
|
||||
use utils::{
|
||||
completion,
|
||||
history_buffer::HistoryBufferWithDropCounter,
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
serde_system_time,
|
||||
@@ -724,7 +726,58 @@ pub struct LayerMapInfo {
|
||||
pub historic_layers: Vec<HistoricLayerInfo>,
|
||||
}
|
||||
|
||||
/// The residence status of a layer
|
||||
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)]
|
||||
#[repr(usize)]
|
||||
pub enum LayerAccessKind {
|
||||
GetValueReconstructData,
|
||||
Iter,
|
||||
KeyIter,
|
||||
Dump,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LayerAccessStatFullDetails {
|
||||
pub when_millis_since_epoch: u64,
|
||||
pub task_kind: Cow<'static, str>,
|
||||
pub access_kind: LayerAccessKind,
|
||||
}
|
||||
|
||||
/// An event that impacts the layer's residence status.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LayerResidenceEvent {
|
||||
/// The time when the event occurred.
|
||||
/// NB: this timestamp is captured while the residence status changes.
|
||||
/// So, it might be behind/ahead of the actual residence change by a short amount of time.
|
||||
///
|
||||
#[serde(rename = "timestamp_millis_since_epoch")]
|
||||
#[serde_as(as = "serde_with::TimestampMilliSeconds")]
|
||||
pub timestamp: SystemTime,
|
||||
/// The new residence status of the layer.
|
||||
pub status: LayerResidenceStatus,
|
||||
/// The reason why we had to record this event.
|
||||
pub reason: LayerResidenceEventReason,
|
||||
}
|
||||
|
||||
/// The reason for recording a given [`LayerResidenceEvent`].
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub enum LayerResidenceEventReason {
|
||||
/// The layer map is being populated, e.g. during timeline load or attach.
|
||||
/// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
|
||||
/// We need to record such events because there is no persistent storage for the events.
|
||||
///
|
||||
// https://github.com/rust-lang/rust/issues/74481
|
||||
/// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
|
||||
/// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
|
||||
LayerLoad,
|
||||
/// We just created the layer (e.g., freeze_and_flush or compaction).
|
||||
/// Such layers are always [`LayerResidenceStatus::Resident`].
|
||||
LayerCreate,
|
||||
/// We on-demand downloaded or evicted the given layer.
|
||||
ResidenceChange,
|
||||
}
|
||||
|
||||
/// The residence status of the layer, after the given [`LayerResidenceEvent`].
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub enum LayerResidenceStatus {
|
||||
/// Residence status for a layer file that exists locally.
|
||||
@@ -734,16 +787,23 @@ pub enum LayerResidenceStatus {
|
||||
Evicted,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
impl LayerResidenceEvent {
|
||||
pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self {
|
||||
Self {
|
||||
status,
|
||||
reason,
|
||||
timestamp: SystemTime::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LayerAccessStats {
|
||||
#[serde_as(as = "serde_with::TimestampMilliSeconds")]
|
||||
pub access_time: SystemTime,
|
||||
|
||||
#[serde_as(as = "serde_with::TimestampMilliSeconds")]
|
||||
pub residence_time: SystemTime,
|
||||
|
||||
pub visible: bool,
|
||||
pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
|
||||
pub task_kind_access_flag: Vec<Cow<'static, str>>,
|
||||
pub first: Option<LayerAccessStatFullDetails>,
|
||||
pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
|
||||
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
||||
@@ -144,20 +144,7 @@ impl PgConnectionConfig {
|
||||
// implement and this function is hardly a bottleneck. The function is only called around
|
||||
// establishing a new connection.
|
||||
#[allow(unstable_name_collisions)]
|
||||
config.options(
|
||||
&self
|
||||
.options
|
||||
.iter()
|
||||
.map(|s| {
|
||||
if s.contains(['\\', ' ']) {
|
||||
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
|
||||
} else {
|
||||
Cow::Borrowed(s.as_str())
|
||||
}
|
||||
})
|
||||
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
|
||||
.collect::<String>(),
|
||||
);
|
||||
config.options(&encode_options(&self.options));
|
||||
}
|
||||
config
|
||||
}
|
||||
@@ -178,6 +165,21 @@ impl PgConnectionConfig {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unstable_name_collisions)]
|
||||
fn encode_options(options: &[String]) -> String {
|
||||
options
|
||||
.iter()
|
||||
.map(|s| {
|
||||
if s.contains(['\\', ' ']) {
|
||||
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
|
||||
} else {
|
||||
Cow::Borrowed(s.as_str())
|
||||
}
|
||||
})
|
||||
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
|
||||
.collect::<String>()
|
||||
}
|
||||
|
||||
impl fmt::Display for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// The password is intentionally hidden and not part of this display string.
|
||||
@@ -206,7 +208,7 @@ impl fmt::Debug for PgConnectionConfig {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_pg_connection_config {
|
||||
use crate::PgConnectionConfig;
|
||||
use crate::{encode_options, PgConnectionConfig};
|
||||
use once_cell::sync::Lazy;
|
||||
use url::Host;
|
||||
|
||||
@@ -255,18 +257,12 @@ mod tests_pg_connection_config {
|
||||
|
||||
#[test]
|
||||
fn test_with_options() {
|
||||
let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([
|
||||
"hello",
|
||||
"world",
|
||||
"with space",
|
||||
"and \\ backslashes",
|
||||
let options = encode_options(&[
|
||||
"hello".to_owned(),
|
||||
"world".to_owned(),
|
||||
"with space".to_owned(),
|
||||
"and \\ backslashes".to_owned(),
|
||||
]);
|
||||
assert_eq!(cfg.host(), &*STUB_HOST);
|
||||
assert_eq!(cfg.port(), 123);
|
||||
assert_eq!(cfg.raw_address(), "stub.host.example:123");
|
||||
assert_eq!(
|
||||
cfg.to_tokio_postgres_config().get_options(),
|
||||
Some("hello world with\\ space and\\ \\\\\\ backslashes")
|
||||
);
|
||||
assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@ license.workspace = true
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
async-stream.workspace = true
|
||||
once_cell.workspace = true
|
||||
aws-smithy-async.workspace = true
|
||||
aws-smithy-types.workspace = true
|
||||
|
||||
@@ -15,7 +15,7 @@ use std::time::SystemTime;
|
||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||
use anyhow::Result;
|
||||
use azure_core::request_options::{MaxResults, Metadata, Range};
|
||||
use azure_core::{Continuable, RetryOptions};
|
||||
use azure_core::RetryOptions;
|
||||
use azure_identity::DefaultAzureCredential;
|
||||
use azure_storage::StorageCredentials;
|
||||
use azure_storage_blobs::blob::CopyStatus;
|
||||
@@ -40,7 +40,6 @@ use crate::{
|
||||
|
||||
pub struct AzureBlobStorage {
|
||||
client: ContainerClient,
|
||||
container_name: String,
|
||||
prefix_in_container: Option<String>,
|
||||
max_keys_per_list_response: Option<NonZeroU32>,
|
||||
concurrency_limiter: ConcurrencyLimiter,
|
||||
@@ -86,7 +85,6 @@ impl AzureBlobStorage {
|
||||
|
||||
Ok(AzureBlobStorage {
|
||||
client,
|
||||
container_name: azure_config.container_name.to_owned(),
|
||||
prefix_in_container: azure_config.prefix_in_container.to_owned(),
|
||||
max_keys_per_list_response,
|
||||
concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
|
||||
@@ -240,10 +238,6 @@ impl AzureBlobStorage {
|
||||
_ = cancel.cancelled() => Err(Cancelled),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn container_name(&self) -> &str {
|
||||
&self.container_name
|
||||
}
|
||||
}
|
||||
|
||||
fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
|
||||
@@ -267,30 +261,30 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
|
||||
}
|
||||
|
||||
impl RemoteStorage for AzureBlobStorage {
|
||||
fn list_streaming(
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_name(p))
|
||||
.or_else(|| self.prefix_in_container.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
let _permit = self.permit(RequestKind::List, cancel).await?;
|
||||
|
||||
async_stream::stream! {
|
||||
let _permit = self.permit(RequestKind::List, cancel).await?;
|
||||
let op = async {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_name(p))
|
||||
.or_else(|| self.prefix_in_container.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
|
||||
let mut builder = self.client.list_blobs();
|
||||
|
||||
@@ -306,43 +300,21 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
builder = builder.max_results(MaxResults::new(limit));
|
||||
}
|
||||
|
||||
let mut next_marker = None;
|
||||
let response = builder.into_stream();
|
||||
let response = response.into_stream().map_err(to_download_error);
|
||||
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
|
||||
let response = response.map(|res| match res {
|
||||
Ok(res) => res,
|
||||
Err(_elapsed) => Err(DownloadError::Timeout),
|
||||
});
|
||||
|
||||
'outer: loop {
|
||||
let mut builder = builder.clone();
|
||||
if let Some(marker) = next_marker.clone() {
|
||||
builder = builder.marker(marker);
|
||||
}
|
||||
let response = builder.into_stream();
|
||||
let response = response.into_stream().map_err(to_download_error);
|
||||
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
|
||||
let response = response.map(|res| match res {
|
||||
Ok(res) => res,
|
||||
Err(_elapsed) => Err(DownloadError::Timeout),
|
||||
});
|
||||
let mut response = std::pin::pin!(response);
|
||||
|
||||
let mut response = std::pin::pin!(response);
|
||||
let mut res = Listing::default();
|
||||
|
||||
let mut max_keys = max_keys.map(|mk| mk.get());
|
||||
let next_item = tokio::select! {
|
||||
op = response.next() => Ok(op),
|
||||
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
|
||||
}?;
|
||||
let Some(entry) = next_item else {
|
||||
// The list is complete, so yield it.
|
||||
break;
|
||||
};
|
||||
|
||||
let mut res = Listing::default();
|
||||
let entry = match entry {
|
||||
Ok(entry) => entry,
|
||||
Err(e) => {
|
||||
// The error is potentially retryable, so we must rewind the loop after yielding.
|
||||
yield Err(e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
next_marker = entry.continuation();
|
||||
let mut max_keys = max_keys.map(|mk| mk.get());
|
||||
while let Some(entry) = response.next().await {
|
||||
let entry = entry?;
|
||||
let prefix_iter = entry
|
||||
.blobs
|
||||
.prefixes()
|
||||
@@ -361,19 +333,19 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
assert!(mk > 0);
|
||||
mk -= 1;
|
||||
if mk == 0 {
|
||||
yield Ok(res); // limit reached
|
||||
break 'outer;
|
||||
return Ok(res); // limit reached
|
||||
}
|
||||
max_keys = Some(mk);
|
||||
}
|
||||
}
|
||||
yield Ok(res);
|
||||
|
||||
// We are done here
|
||||
if next_marker.is_none() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(res)
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::{stream::Stream, StreamExt};
|
||||
use futures::stream::Stream;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -160,18 +160,13 @@ pub struct Listing {
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[allow(async_fn_in_trait)]
|
||||
pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// List objects in remote storage, with semantics matching AWS S3's [`ListObjectsV2`].
|
||||
///
|
||||
/// The stream is guaranteed to return at least one element, even in the case of errors
|
||||
/// (in that case it's an `Err()`), or an empty `Listing`.
|
||||
///
|
||||
/// The stream is not ending if it returns an error, as long as [`is_permanent`] returns false on the error.
|
||||
/// The `next` function can be retried, and maybe in a future retry, there will be success.
|
||||
/// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
|
||||
/// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
|
||||
///
|
||||
/// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
|
||||
/// from the absolute root of the bucket.
|
||||
///
|
||||
/// `mode` configures whether to use a delimiter. Without a delimiter, all keys
|
||||
/// `mode` configures whether to use a delimiter. Without a delimiter all keys
|
||||
/// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
|
||||
/// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
|
||||
/// returned in `keys` ().
|
||||
@@ -180,32 +175,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on
|
||||
/// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure.
|
||||
///
|
||||
/// [`ListObjectsV2`]: <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>
|
||||
/// [`is_permanent`]: DownloadError::is_permanent
|
||||
fn list_streaming(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>>;
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
_mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel));
|
||||
let mut combined = stream.next().await.expect("At least one item required")?;
|
||||
while let Some(list) = stream.next().await {
|
||||
let list = list?;
|
||||
combined.keys.extend_from_slice(&list.keys);
|
||||
combined.prefixes.extend_from_slice(&list.prefixes);
|
||||
}
|
||||
Ok(combined)
|
||||
}
|
||||
) -> Result<Listing, DownloadError>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
///
|
||||
@@ -312,8 +288,8 @@ impl Debug for Download {
|
||||
|
||||
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
|
||||
#[derive(Clone)]
|
||||
// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
|
||||
pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
|
||||
LocalFs(LocalFs),
|
||||
AwsS3(Arc<S3Bucket>),
|
||||
@@ -322,14 +298,13 @@ pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
|
||||
}
|
||||
|
||||
impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
// See [`RemoteStorage::list`].
|
||||
pub async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await,
|
||||
Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await,
|
||||
@@ -338,23 +313,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
|
||||
// See [`RemoteStorage::list_streaming`].
|
||||
pub fn list_streaming<'a>(
|
||||
&'a self,
|
||||
prefix: Option<&'a RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &'a CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
|
||||
match self {
|
||||
Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
|
||||
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
|
||||
Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
|
||||
Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
|
||||
Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::upload`]
|
||||
pub async fn upload(
|
||||
&self,
|
||||
@@ -546,16 +504,6 @@ impl GenericRemoteStorage {
|
||||
None => self.download(from, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// The name of the bucket/container/etc.
|
||||
pub fn bucket_name(&self) -> Option<&str> {
|
||||
match self {
|
||||
Self::LocalFs(_s) => None,
|
||||
Self::AwsS3(s) => Some(s.bucket_name()),
|
||||
Self::AzureBlob(s) => Some(s.container_name()),
|
||||
Self::Unreliable(_s) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
|
||||
|
||||
@@ -331,17 +331,6 @@ impl LocalFs {
|
||||
}
|
||||
|
||||
impl RemoteStorage for LocalFs {
|
||||
fn list_streaming(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> {
|
||||
let listing = self.list(prefix, mode, max_keys, cancel);
|
||||
futures::stream::once(listing)
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
|
||||
@@ -386,10 +386,6 @@ impl S3Bucket {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn bucket_name(&self) -> &str {
|
||||
&self.bucket_name
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
@@ -467,16 +463,17 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
|
||||
}
|
||||
|
||||
impl RemoteStorage for S3Bucket {
|
||||
fn list_streaming(
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> {
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let kind = RequestKind::List;
|
||||
// s3 sdk wants i32
|
||||
let mut max_keys = max_keys.map(|mk| mk.get() as i32);
|
||||
let mut result = Listing::default();
|
||||
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
@@ -488,99 +485,89 @@ impl RemoteStorage for S3Bucket {
|
||||
})
|
||||
});
|
||||
|
||||
async_stream::stream! {
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let mut continuation_token = None;
|
||||
'outer: loop {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
let mut continuation_token = None;
|
||||
|
||||
// min of two Options, returning Some if one is value and another is
|
||||
// None (None is smaller than anything, so plain min doesn't work).
|
||||
let request_max_keys = self
|
||||
.max_keys_per_list_response
|
||||
.into_iter()
|
||||
.chain(max_keys.into_iter())
|
||||
.min();
|
||||
let mut request = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(list_prefix.clone())
|
||||
.set_continuation_token(continuation_token.clone())
|
||||
.set_max_keys(request_max_keys);
|
||||
loop {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
// min of two Options, returning Some if one is value and another is
|
||||
// None (None is smaller than anything, so plain min doesn't work).
|
||||
let request_max_keys = self
|
||||
.max_keys_per_list_response
|
||||
.into_iter()
|
||||
.chain(max_keys.into_iter())
|
||||
.min();
|
||||
let mut request = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(list_prefix.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.set_max_keys(request_max_keys);
|
||||
|
||||
let request = request.send();
|
||||
|
||||
let response = tokio::select! {
|
||||
res = request => Ok(res),
|
||||
_ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout),
|
||||
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
|
||||
}?;
|
||||
|
||||
let response = response
|
||||
.context("Failed to list S3 prefixes")
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
|
||||
let response = match response {
|
||||
Ok(response) => response,
|
||||
Err(e) => {
|
||||
// The error is potentially retryable, so we must rewind the loop after yielding.
|
||||
yield Err(e);
|
||||
continue 'outer;
|
||||
},
|
||||
};
|
||||
|
||||
let keys = response.contents();
|
||||
let prefixes = response.common_prefixes.as_deref().unwrap_or_default();
|
||||
|
||||
tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
|
||||
let mut result = Listing::default();
|
||||
|
||||
for object in keys {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
let remote_path = self.s3_object_to_relative_path(object_path);
|
||||
result.keys.push(remote_path);
|
||||
if let Some(mut mk) = max_keys {
|
||||
assert!(mk > 0);
|
||||
mk -= 1;
|
||||
if mk == 0 {
|
||||
// limit reached
|
||||
yield Ok(result);
|
||||
break 'outer;
|
||||
}
|
||||
max_keys = Some(mk);
|
||||
}
|
||||
}
|
||||
|
||||
// S3 gives us prefixes like "foo/", we return them like "foo"
|
||||
result.prefixes.extend(prefixes.iter().filter_map(|o| {
|
||||
Some(
|
||||
self.s3_object_to_relative_path(
|
||||
o.prefix()?
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
|
||||
),
|
||||
)
|
||||
}));
|
||||
|
||||
yield Ok(result);
|
||||
|
||||
continuation_token = match response.next_continuation_token {
|
||||
Some(new_token) => Some(new_token),
|
||||
None => break,
|
||||
};
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
|
||||
let request = request.send();
|
||||
|
||||
let response = tokio::select! {
|
||||
res = request => res,
|
||||
_ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
|
||||
_ = cancel.cancelled() => return Err(DownloadError::Cancelled),
|
||||
};
|
||||
|
||||
let response = response
|
||||
.context("Failed to list S3 prefixes")
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
|
||||
let response = response?;
|
||||
|
||||
let keys = response.contents();
|
||||
let empty = Vec::new();
|
||||
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
|
||||
|
||||
tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
|
||||
|
||||
for object in keys {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
let remote_path = self.s3_object_to_relative_path(object_path);
|
||||
result.keys.push(remote_path);
|
||||
if let Some(mut mk) = max_keys {
|
||||
assert!(mk > 0);
|
||||
mk -= 1;
|
||||
if mk == 0 {
|
||||
return Ok(result); // limit reached
|
||||
}
|
||||
max_keys = Some(mk);
|
||||
}
|
||||
}
|
||||
|
||||
// S3 gives us prefixes like "foo/", we return them like "foo"
|
||||
result.prefixes.extend(prefixes.iter().filter_map(|o| {
|
||||
Some(
|
||||
self.s3_object_to_relative_path(
|
||||
o.prefix()?
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
|
||||
),
|
||||
)
|
||||
}));
|
||||
|
||||
continuation_token = match response.next_continuation_token {
|
||||
Some(new_token) => Some(new_token),
|
||||
None => break,
|
||||
};
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//! testing purposes.
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use futures::StreamExt;
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroU32;
|
||||
use std::sync::Mutex;
|
||||
@@ -108,23 +107,6 @@ impl UnreliableWrapper {
|
||||
type VoidStorage = crate::LocalFs;
|
||||
|
||||
impl RemoteStorage for UnreliableWrapper {
|
||||
fn list_streaming(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> impl Stream<Item = Result<Listing, DownloadError>> {
|
||||
async_stream::stream! {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
let mut stream = self.inner
|
||||
.list_streaming(prefix, mode, max_keys, cancel);
|
||||
while let Some(item) = stream.next().await {
|
||||
yield item;
|
||||
}
|
||||
}
|
||||
}
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
|
||||
@@ -152,7 +152,7 @@ pub(crate) async fn upload_remote_data(
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
for i in 1..=upload_tasks_count {
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
let cancel = cancel.clone();
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use futures::StreamExt;
|
||||
use remote_storage::ListingMode;
|
||||
use remote_storage::RemotePath;
|
||||
use std::sync::Arc;
|
||||
@@ -30,10 +29,10 @@ use super::{
|
||||
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
|
||||
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
|
||||
///
|
||||
/// In the `MaybeEnabledStorageWithTestBlobs::setup`, we set the `max_keys_in_list_response` param to limit the keys in a single response.
|
||||
/// This way, we are able to test the pagination, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
|
||||
/// as the current default AWS S3 pagination limit is 1000.
|
||||
/// (see <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>).
|
||||
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
|
||||
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
|
||||
/// since current default AWS S3 pagination limit is 1000.
|
||||
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
|
||||
///
|
||||
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
|
||||
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
|
||||
@@ -88,41 +87,6 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
|
||||
);
|
||||
|
||||
// list_streaming
|
||||
|
||||
let prefix_with_slash = base_prefix.add_trailing_slash();
|
||||
let mut nested_remote_prefixes_st = test_client.list_streaming(
|
||||
Some(&prefix_with_slash),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
);
|
||||
let mut nested_remote_prefixes_combined = HashSet::new();
|
||||
let mut segments = 0;
|
||||
let mut segment_max_size = 0;
|
||||
while let Some(st) = nested_remote_prefixes_st.next().await {
|
||||
let st = st?;
|
||||
segment_max_size = segment_max_size.max(st.prefixes.len());
|
||||
nested_remote_prefixes_combined.extend(st.prefixes.into_iter());
|
||||
segments += 1;
|
||||
}
|
||||
assert!(segments > 1, "less than 2 segments: {segments}");
|
||||
assert!(
|
||||
segment_max_size * 2 <= nested_remote_prefixes_combined.len(),
|
||||
"double of segment_max_size={segment_max_size} larger number of remote prefixes of {}",
|
||||
nested_remote_prefixes_combined.len()
|
||||
);
|
||||
let remote_only_prefixes = nested_remote_prefixes_combined
|
||||
.difference(&expected_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
let missing_uploaded_prefixes = expected_remote_prefixes
|
||||
.difference(&nested_remote_prefixes_combined)
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
|
||||
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ bincode.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
chrono.workspace = true
|
||||
heapless.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
humantime.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
|
||||
196
libs/utils/src/history_buffer.rs
Normal file
196
libs/utils/src/history_buffer.rs
Normal file
@@ -0,0 +1,196 @@
|
||||
//! A heapless buffer for events of sorts.
|
||||
|
||||
use std::ops;
|
||||
|
||||
use heapless::HistoryBuffer;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct HistoryBufferWithDropCounter<T, const L: usize> {
|
||||
buffer: HistoryBuffer<T, L>,
|
||||
drop_count: u64,
|
||||
}
|
||||
|
||||
impl<T, const L: usize> HistoryBufferWithDropCounter<T, L> {
|
||||
pub fn write(&mut self, data: T) {
|
||||
let len_before = self.buffer.len();
|
||||
self.buffer.write(data);
|
||||
let len_after = self.buffer.len();
|
||||
self.drop_count += u64::from(len_before == len_after);
|
||||
}
|
||||
pub fn drop_count(&self) -> u64 {
|
||||
self.drop_count
|
||||
}
|
||||
pub fn map<U, F: Fn(&T) -> U>(&self, f: F) -> HistoryBufferWithDropCounter<U, L> {
|
||||
let mut buffer = HistoryBuffer::new();
|
||||
buffer.extend(self.buffer.oldest_ordered().map(f));
|
||||
HistoryBufferWithDropCounter::<U, L> {
|
||||
buffer,
|
||||
drop_count: self.drop_count,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, const L: usize> Default for HistoryBufferWithDropCounter<T, L> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
buffer: HistoryBuffer::default(),
|
||||
drop_count: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
|
||||
type Target = HistoryBuffer<T, L>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.buffer
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
struct SerdeRepr<T> {
|
||||
buffer: Vec<T>,
|
||||
buffer_size: usize,
|
||||
drop_count: u64,
|
||||
}
|
||||
|
||||
impl<'a, T, const L: usize> From<&'a HistoryBufferWithDropCounter<T, L>> for SerdeRepr<T>
|
||||
where
|
||||
T: Clone + serde::Serialize,
|
||||
{
|
||||
fn from(value: &'a HistoryBufferWithDropCounter<T, L>) -> Self {
|
||||
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
|
||||
SerdeRepr {
|
||||
buffer: buffer.iter().cloned().collect(),
|
||||
buffer_size: L,
|
||||
drop_count: *drop_count,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, const L: usize> serde::Serialize for HistoryBufferWithDropCounter<T, L>
|
||||
where
|
||||
T: Clone + serde::Serialize,
|
||||
{
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
SerdeRepr::from(self).serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
|
||||
where
|
||||
T: Clone + serde::Deserialize<'de>,
|
||||
{
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let SerdeRepr {
|
||||
buffer: des_buffer,
|
||||
drop_count,
|
||||
buffer_size,
|
||||
} = SerdeRepr::<T>::deserialize(deserializer)?;
|
||||
if buffer_size != L {
|
||||
use serde::de::Error;
|
||||
return Err(D::Error::custom(format!(
|
||||
"invalid buffer_size, expecting {L} got {buffer_size}"
|
||||
)));
|
||||
}
|
||||
let mut buffer = HistoryBuffer::new();
|
||||
buffer.extend(des_buffer);
|
||||
Ok(HistoryBufferWithDropCounter { buffer, drop_count })
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::HistoryBufferWithDropCounter;
|
||||
|
||||
#[test]
|
||||
fn test_basics() {
|
||||
let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
|
||||
b.write(1);
|
||||
b.write(2);
|
||||
b.write(3);
|
||||
assert!(b.iter().any(|e| *e == 2));
|
||||
assert!(b.iter().any(|e| *e == 3));
|
||||
assert!(!b.iter().any(|e| *e == 1));
|
||||
|
||||
// round-trip serde
|
||||
let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
|
||||
serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
|
||||
assert_eq!(
|
||||
round_tripped.iter().cloned().collect::<Vec<_>>(),
|
||||
b.iter().cloned().collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_drop_count_works() {
|
||||
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
|
||||
b.write(1);
|
||||
assert_eq!(b.drop_count(), 0);
|
||||
b.write(2);
|
||||
assert_eq!(b.drop_count(), 0);
|
||||
b.write(3);
|
||||
assert_eq!(b.drop_count(), 1);
|
||||
b.write(4);
|
||||
assert_eq!(b.drop_count(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_clone_works() {
|
||||
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
|
||||
b.write(1);
|
||||
b.write(2);
|
||||
b.write(3);
|
||||
assert_eq!(b.drop_count(), 1);
|
||||
let mut c = b.clone();
|
||||
assert_eq!(c.drop_count(), 1);
|
||||
assert!(c.iter().any(|e| *e == 2));
|
||||
assert!(c.iter().any(|e| *e == 3));
|
||||
assert!(!c.iter().any(|e| *e == 1));
|
||||
|
||||
c.write(4);
|
||||
assert!(c.iter().any(|e| *e == 4));
|
||||
assert!(!b.iter().any(|e| *e == 4));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_map() {
|
||||
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
|
||||
|
||||
b.write(1);
|
||||
assert_eq!(b.drop_count(), 0);
|
||||
{
|
||||
let c = b.map(|i| i + 10);
|
||||
assert_eq!(c.oldest_ordered().cloned().collect::<Vec<_>>(), vec![11]);
|
||||
assert_eq!(c.drop_count(), 0);
|
||||
}
|
||||
|
||||
b.write(2);
|
||||
assert_eq!(b.drop_count(), 0);
|
||||
{
|
||||
let c = b.map(|i| i + 10);
|
||||
assert_eq!(
|
||||
c.oldest_ordered().cloned().collect::<Vec<_>>(),
|
||||
vec![11, 12]
|
||||
);
|
||||
assert_eq!(c.drop_count(), 0);
|
||||
}
|
||||
|
||||
b.write(3);
|
||||
assert_eq!(b.drop_count(), 1);
|
||||
{
|
||||
let c = b.map(|i| i + 10);
|
||||
assert_eq!(
|
||||
c.oldest_ordered().cloned().collect::<Vec<_>>(),
|
||||
vec![12, 13]
|
||||
);
|
||||
assert_eq!(c.drop_count(), 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -59,6 +59,8 @@ pub mod signals;
|
||||
|
||||
pub mod fs_ext;
|
||||
|
||||
pub mod history_buffer;
|
||||
|
||||
pub mod measured_stream;
|
||||
|
||||
pub mod serde_percent;
|
||||
|
||||
@@ -49,7 +49,6 @@ pub struct TenantShardId {
|
||||
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
pub const MIN: Self = Self(0);
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
|
||||
@@ -2,35 +2,30 @@
|
||||
|
||||
//! Main entry point for the Page Server executable.
|
||||
|
||||
use std::env;
|
||||
use std::env::{var, VarError};
|
||||
use std::io::Read;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{env, ops::ControlFlow, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use camino::Utf8Path;
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
|
||||
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::config::PageserverIdentity;
|
||||
use pageserver::control_plane_client::ControlPlaneClient;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use pageserver::tenant::{secondary, TenantSharedResources};
|
||||
use pageserver::{
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
config::{defaults::*, PageServerConf},
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
deletion_queue::DeletionQueue,
|
||||
http, page_cache, page_service, task_mgr,
|
||||
@@ -89,13 +84,18 @@ fn main() -> anyhow::Result<()> {
|
||||
.with_context(|| format!("Error opening workdir '{workdir}'"))?;
|
||||
|
||||
let cfg_file_path = workdir.join("pageserver.toml");
|
||||
let identity_file_path = workdir.join("identity.toml");
|
||||
|
||||
// Set CWD to workdir for non-daemon modes
|
||||
env::set_current_dir(&workdir)
|
||||
.with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?;
|
||||
|
||||
let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
|
||||
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
|
||||
ControlFlow::Continue(conf) => conf,
|
||||
ControlFlow::Break(()) => {
|
||||
info!("Pageserver config init successful");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
// Initialize logging.
|
||||
//
|
||||
@@ -150,55 +150,70 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
fn initialize_config(
|
||||
identity_file_path: &Utf8Path,
|
||||
cfg_file_path: &Utf8Path,
|
||||
arg_matches: clap::ArgMatches,
|
||||
workdir: &Utf8Path,
|
||||
) -> anyhow::Result<&'static PageServerConf> {
|
||||
// The deployment orchestrator writes out an indentity file containing the node id
|
||||
// for all pageservers. This file is the source of truth for the node id. In order
|
||||
// to allow for rolling back pageserver releases, the node id is also included in
|
||||
// the pageserver config that the deployment orchestrator writes to disk for the pageserver.
|
||||
// A rolled back version of the pageserver will get the node id from the pageserver.toml
|
||||
// config file.
|
||||
let identity = match std::fs::File::open(identity_file_path) {
|
||||
) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
|
||||
let init = arg_matches.get_flag("init");
|
||||
|
||||
let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
|
||||
Ok(mut f) => {
|
||||
let md = f.metadata().context("stat config file")?;
|
||||
if !md.is_file() {
|
||||
anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ...");
|
||||
if init {
|
||||
anyhow::bail!("config file already exists: {cfg_file_path}");
|
||||
}
|
||||
|
||||
let mut s = String::new();
|
||||
f.read_to_string(&mut s).context("read identity file")?;
|
||||
toml_edit::de::from_str::<PageserverIdentity>(&s)?
|
||||
}
|
||||
Err(e) => {
|
||||
anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ...");
|
||||
}
|
||||
};
|
||||
|
||||
let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
|
||||
Ok(mut f) => {
|
||||
let md = f.metadata().context("stat config file")?;
|
||||
if md.is_file() {
|
||||
let mut s = String::new();
|
||||
f.read_to_string(&mut s).context("read config file")?;
|
||||
s.parse().context("parse config file toml")?
|
||||
Some(s.parse().context("parse config file toml")?)
|
||||
} else {
|
||||
anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
|
||||
}
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
|
||||
Err(e) => {
|
||||
anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
|
||||
}
|
||||
};
|
||||
|
||||
debug!("Using pageserver toml: {config}");
|
||||
let mut effective_config = file_contents.unwrap_or_else(|| {
|
||||
DEFAULT_CONFIG_FILE
|
||||
.parse()
|
||||
.expect("unit tests ensure this works")
|
||||
});
|
||||
|
||||
// Patch with overrides from the command line
|
||||
if let Some(values) = arg_matches.get_many::<String>("config-override") {
|
||||
for option_line in values {
|
||||
let doc = toml_edit::Document::from_str(option_line).with_context(|| {
|
||||
format!("Option '{option_line}' could not be parsed as a toml document")
|
||||
})?;
|
||||
|
||||
for (key, item) in doc.iter() {
|
||||
effective_config.insert(key, item.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Resulting toml: {effective_config}");
|
||||
|
||||
// Construct the runtime representation
|
||||
let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
|
||||
let conf = PageServerConf::parse_and_validate(&effective_config, workdir)
|
||||
.context("Failed to parse pageserver configuration")?;
|
||||
|
||||
Ok(Box::leak(Box::new(conf)))
|
||||
if init {
|
||||
info!("Writing pageserver config to '{cfg_file_path}'");
|
||||
|
||||
std::fs::write(cfg_file_path, effective_config.to_string())
|
||||
.with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
|
||||
info!("Config successfully written to '{cfg_file_path}'")
|
||||
}
|
||||
|
||||
Ok(if init {
|
||||
ControlFlow::Break(())
|
||||
} else {
|
||||
ControlFlow::Continue(Box::leak(Box::new(conf)))
|
||||
})
|
||||
}
|
||||
|
||||
struct WaitForPhaseResult<F: std::future::Future + Unpin> {
|
||||
@@ -290,7 +305,6 @@ fn start_pageserver(
|
||||
// Create and lock PID file. This ensures that there cannot be more than one
|
||||
// pageserver process running at the same time.
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
info!("Claiming pid file at {lock_file_path:?}...");
|
||||
let lock_file =
|
||||
utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("Claimed pid file at {lock_file_path:?}");
|
||||
@@ -416,10 +430,8 @@ fn start_pageserver(
|
||||
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
let deletion_queue_client = deletion_queue.new_client();
|
||||
let background_purges = mgr::BackgroundPurges::default();
|
||||
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
conf,
|
||||
background_purges.clone(),
|
||||
TenantSharedResources {
|
||||
broker_client: broker_client.clone(),
|
||||
remote_storage: remote_storage.clone(),
|
||||
@@ -511,7 +523,7 @@ fn start_pageserver(
|
||||
}
|
||||
});
|
||||
|
||||
let (secondary_controller, secondary_controller_tasks) = secondary::spawn_tasks(
|
||||
let secondary_controller = secondary::spawn_tasks(
|
||||
tenant_manager.clone(),
|
||||
remote_storage.clone(),
|
||||
background_jobs_barrier.clone(),
|
||||
@@ -524,19 +536,18 @@ fn start_pageserver(
|
||||
// been configured.
|
||||
let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
|
||||
|
||||
let disk_usage_eviction_task = launch_disk_usage_global_eviction_task(
|
||||
launch_disk_usage_global_eviction_task(
|
||||
conf,
|
||||
remote_storage.clone(),
|
||||
disk_usage_eviction_state.clone(),
|
||||
tenant_manager.clone(),
|
||||
background_jobs_barrier.clone(),
|
||||
);
|
||||
)?;
|
||||
|
||||
// Start up the service to handle HTTP mgmt API request. We created the
|
||||
// listener earlier already.
|
||||
let http_endpoint_listener = {
|
||||
let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper
|
||||
let cancel = CancellationToken::new();
|
||||
{
|
||||
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
|
||||
|
||||
let router_state = Arc::new(
|
||||
http::routes::State::new(
|
||||
@@ -557,44 +568,77 @@ fn start_pageserver(
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?
|
||||
.serve(service)
|
||||
.with_graceful_shutdown({
|
||||
let cancel = cancel.clone();
|
||||
async move { cancel.clone().cancelled().await }
|
||||
});
|
||||
.with_graceful_shutdown(task_mgr::shutdown_watcher());
|
||||
|
||||
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::HttpEndpointListener,
|
||||
None,
|
||||
None,
|
||||
"http endpoint listener",
|
||||
server,
|
||||
));
|
||||
HttpEndpointListener(CancellableTask { task, cancel })
|
||||
};
|
||||
true,
|
||||
async {
|
||||
server.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
let consumption_metrics_tasks = {
|
||||
let cancel = shutdown_pageserver.child_token();
|
||||
let task = crate::BACKGROUND_RUNTIME.spawn({
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
let cancel = cancel.clone();
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return; },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||
let metrics_ctx = RequestContext::todo_child(
|
||||
TaskKind::MetricsCollection,
|
||||
// This task itself shouldn't download anything.
|
||||
// The actual size calculation does need downloads, and
|
||||
// creates a child context with the right DownloadBehavior.
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
|
||||
pageserver::consumption_metrics::run(conf, tenant_manager, cancel).await;
|
||||
}
|
||||
});
|
||||
ConsumptionMetricsTasks(CancellableTask { task, cancel })
|
||||
};
|
||||
let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
|
||||
|
||||
task_mgr::spawn(
|
||||
crate::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MetricsCollection,
|
||||
None,
|
||||
None,
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
{
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
tenant_manager,
|
||||
metric_collection_endpoint,
|
||||
&conf.metric_collection_bucket,
|
||||
conf.metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
local_disk_storage,
|
||||
cancel,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
// for each connection. We created the listener earlier already.
|
||||
let libpq_listener = {
|
||||
let cancel = CancellationToken::new();
|
||||
{
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
TaskKind::LibpqEndpointListener,
|
||||
// listener task shouldn't need to download anything. (We will
|
||||
@@ -603,20 +647,29 @@ fn start_pageserver(
|
||||
// accept connections.)
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
|
||||
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"libpq listener",
|
||||
page_service::libpq_listener_main(
|
||||
tenant_manager.clone(),
|
||||
pg_auth,
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
),
|
||||
));
|
||||
LibpqEndpointListener(CancellableTask { task, cancel })
|
||||
};
|
||||
task_mgr::spawn(
|
||||
COMPUTE_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::LibpqEndpointListener,
|
||||
None,
|
||||
None,
|
||||
"libpq endpoint listener",
|
||||
true,
|
||||
{
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
async move {
|
||||
page_service::libpq_listener_main(
|
||||
tenant_manager,
|
||||
pg_auth,
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
libpq_ctx,
|
||||
task_mgr::shutdown_token(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
||||
|
||||
@@ -642,18 +695,7 @@ fn start_pageserver(
|
||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
pageserver::shutdown_pageserver(
|
||||
http_endpoint_listener,
|
||||
libpq_listener,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
&tenant_manager,
|
||||
background_purges,
|
||||
deletion_queue.clone(),
|
||||
secondary_controller_tasks,
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
|
||||
unreachable!()
|
||||
})
|
||||
}
|
||||
@@ -692,12 +734,28 @@ fn cli() -> Command {
|
||||
Command::new("Neon page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(version())
|
||||
.arg(
|
||||
Arg::new("init")
|
||||
.long("init")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Initialize pageserver with all given config overrides"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("workdir")
|
||||
.short('D')
|
||||
.long("workdir")
|
||||
.help("Working directory for the pageserver"),
|
||||
)
|
||||
// See `settings.md` for more details on the extra configuration patameters pageserver can process
|
||||
.arg(
|
||||
Arg::new("config-override")
|
||||
.long("config-override")
|
||||
.short('c')
|
||||
.num_args(1)
|
||||
.action(ArgAction::Append)
|
||||
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
|
||||
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("enabled-features")
|
||||
.long("enabled-features")
|
||||
|
||||
@@ -7,8 +7,8 @@
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use serde;
|
||||
use serde::de::IntoDeserializer;
|
||||
use serde::{self, Deserialize};
|
||||
use std::env;
|
||||
use storage_broker::Uri;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
@@ -406,13 +406,6 @@ struct PageServerConfigBuilder {
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
fn new(node_id: NodeId) -> Self {
|
||||
let mut this = Self::default();
|
||||
this.id(node_id);
|
||||
|
||||
this
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn default_values() -> Self {
|
||||
use self::BuilderValue::*;
|
||||
@@ -888,12 +881,8 @@ impl PageServerConf {
|
||||
/// validating the input and failing on errors.
|
||||
///
|
||||
/// This leaves any options not present in the file in the built-in defaults.
|
||||
pub fn parse_and_validate(
|
||||
node_id: NodeId,
|
||||
toml: &Document,
|
||||
workdir: &Utf8Path,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut builder = PageServerConfigBuilder::new(node_id);
|
||||
pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result<Self> {
|
||||
let mut builder = PageServerConfigBuilder::default();
|
||||
builder.workdir(workdir.to_owned());
|
||||
|
||||
let mut t_conf = TenantConfOpt::default();
|
||||
@@ -924,8 +913,7 @@ impl PageServerConf {
|
||||
"tenant_config" => {
|
||||
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
|
||||
}
|
||||
"id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
|
||||
// Logging is not set up yet, so we can't do it.
|
||||
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
|
||||
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
|
||||
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
|
||||
"log_format" => builder.log_format(
|
||||
@@ -1102,12 +1090,6 @@ impl PageServerConf {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct PageserverIdentity {
|
||||
pub id: NodeId,
|
||||
}
|
||||
|
||||
// Helper functions to parse a toml Item
|
||||
|
||||
fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
|
||||
@@ -1277,7 +1259,7 @@ background_task_maximum_delay = '334 s'
|
||||
);
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
|
||||
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
|
||||
|
||||
assert_eq!(
|
||||
@@ -1359,7 +1341,7 @@ background_task_maximum_delay = '334 s'
|
||||
);
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
|
||||
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
|
||||
|
||||
assert_eq!(
|
||||
@@ -1449,13 +1431,12 @@ broker_endpoint = '{broker_endpoint}'
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_remote_storage_config =
|
||||
PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
|
||||
.unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{config_string}', reason: {e:?}")
|
||||
})
|
||||
.remote_storage_config
|
||||
.expect("Should have remote storage config for the local FS");
|
||||
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{config_string}', reason: {e:?}")
|
||||
})
|
||||
.remote_storage_config
|
||||
.expect("Should have remote storage config for the local FS");
|
||||
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
@@ -1511,13 +1492,12 @@ broker_endpoint = '{broker_endpoint}'
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_remote_storage_config =
|
||||
PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
|
||||
.unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{config_string}', reason: {e:?}")
|
||||
})
|
||||
.remote_storage_config
|
||||
.expect("Should have remote storage config for S3");
|
||||
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{config_string}', reason: {e:?}")
|
||||
})
|
||||
.remote_storage_config
|
||||
.expect("Should have remote storage config for S3");
|
||||
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
@@ -1596,7 +1576,7 @@ threshold = "20m"
|
||||
"#,
|
||||
);
|
||||
let toml: Document = pageserver_conf_toml.parse()?;
|
||||
let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?;
|
||||
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
|
||||
|
||||
assert_eq!(conf.pg_distrib_dir, pg_distrib_dir);
|
||||
assert_eq!(
|
||||
@@ -1612,11 +1592,7 @@ threshold = "20m"
|
||||
.evictions_low_residence_duration_metric_threshold,
|
||||
Duration::from_secs(20 * 60)
|
||||
);
|
||||
|
||||
// Assert that the node id provided by the indentity file (threaded
|
||||
// through the call to [`PageServerConf::parse_and_validate`] is
|
||||
// used.
|
||||
assert_eq!(conf.id, NodeId(333));
|
||||
assert_eq!(conf.id, NodeId(222));
|
||||
assert_eq!(
|
||||
conf.disk_usage_based_eviction,
|
||||
Some(DiskUsageEvictionTaskConfig {
|
||||
@@ -1625,7 +1601,7 @@ threshold = "20m"
|
||||
period: Duration::from_secs(10),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: Default::default(),
|
||||
eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
|
||||
})
|
||||
);
|
||||
|
||||
@@ -1661,7 +1637,7 @@ threshold = "20m"
|
||||
"#,
|
||||
);
|
||||
let toml: Document = pageserver_conf_toml.parse().unwrap();
|
||||
let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap();
|
||||
let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap();
|
||||
|
||||
match &conf.default_tenant_conf.eviction_policy {
|
||||
EvictionPolicy::OnlyImitiate(t) => {
|
||||
@@ -1680,7 +1656,7 @@ threshold = "20m"
|
||||
remote_storage = {}
|
||||
"#;
|
||||
let doc = toml_edit::Document::from_str(input).unwrap();
|
||||
let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir)
|
||||
let err = PageServerConf::parse_and_validate(&doc, &workdir)
|
||||
.expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
|
||||
assert!(format!("{err}").contains("remote_storage"), "{err}");
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
//! Periodically collect consumption metrics for all active tenants
|
||||
//! and push them to a HTTP endpoint.
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::size::CalculateSyntheticSizeError;
|
||||
@@ -40,74 +39,49 @@ type RawMetric = (MetricsKey, (EventType, u64));
|
||||
/// for deduplication, but that is no longer needed.
|
||||
type Cache = HashMap<MetricsKey, (EventType, u64)>;
|
||||
|
||||
pub async fn run(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
let Some(metric_collection_endpoint) = conf.metric_collection_endpoint.as_ref() else {
|
||||
return;
|
||||
};
|
||||
|
||||
let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
|
||||
|
||||
let metrics_ctx = RequestContext::todo_child(
|
||||
TaskKind::MetricsCollection,
|
||||
// This task itself shouldn't download anything.
|
||||
// The actual size calculation does need downloads, and
|
||||
// creates a child context with the right DownloadBehavior.
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
let collect_metrics = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"consumption metrics collection",
|
||||
collect_metrics(
|
||||
tenant_manager.clone(),
|
||||
metric_collection_endpoint,
|
||||
&conf.metric_collection_bucket,
|
||||
conf.metric_collection_interval,
|
||||
conf.id,
|
||||
local_disk_storage,
|
||||
cancel.clone(),
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection")),
|
||||
));
|
||||
|
||||
let worker_ctx =
|
||||
RequestContext::todo_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
|
||||
let synthetic_size_worker = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"synthetic size calculation",
|
||||
calculate_synthetic_size_worker(
|
||||
tenant_manager.clone(),
|
||||
conf.synthetic_size_calculation_interval,
|
||||
cancel.clone(),
|
||||
worker_ctx,
|
||||
)
|
||||
.instrument(info_span!("synthetic_size_worker")),
|
||||
));
|
||||
|
||||
let (collect_metrics, synthetic_size_worker) =
|
||||
futures::future::join(collect_metrics, synthetic_size_worker).await;
|
||||
collect_metrics
|
||||
.expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process");
|
||||
synthetic_size_worker
|
||||
.expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process");
|
||||
}
|
||||
|
||||
/// Main thread that serves metrics collection
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn collect_metrics(
|
||||
pub async fn collect_metrics(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
metric_collection_endpoint: &Url,
|
||||
metric_collection_bucket: &Option<RemoteStorageConfig>,
|
||||
metric_collection_interval: Duration,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
node_id: NodeId,
|
||||
local_disk_storage: Utf8PathBuf,
|
||||
cancel: CancellationToken,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// spin up background worker that caclulates tenant sizes
|
||||
let worker_ctx =
|
||||
ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::CalculateSyntheticSize,
|
||||
None,
|
||||
None,
|
||||
"synthetic size calculation",
|
||||
false,
|
||||
{
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
async move {
|
||||
calculate_synthetic_size_worker(
|
||||
tenant_manager,
|
||||
synthetic_size_calculation_interval,
|
||||
&cancel,
|
||||
&worker_ctx,
|
||||
)
|
||||
.instrument(info_span!("synthetic_size_worker"))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
let path: Arc<Utf8PathBuf> = Arc::new(local_disk_storage);
|
||||
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
|
||||
let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval);
|
||||
|
||||
let mut cached_metrics = tokio::select! {
|
||||
@@ -194,9 +168,11 @@ async fn collect_metrics(
|
||||
BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
|
||||
);
|
||||
|
||||
let res =
|
||||
tokio::time::timeout_at(started_at + metric_collection_interval, cancel.cancelled())
|
||||
.await;
|
||||
let res = tokio::time::timeout_at(
|
||||
started_at + metric_collection_interval,
|
||||
task_mgr::shutdown_token().cancelled(),
|
||||
)
|
||||
.await;
|
||||
if res.is_ok() {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -296,8 +272,8 @@ async fn reschedule(
|
||||
async fn calculate_synthetic_size_worker(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
cancel: CancellationToken,
|
||||
ctx: RequestContext,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("starting calculate_synthetic_size_worker");
|
||||
scopeguard::defer! {
|
||||
@@ -337,7 +313,7 @@ async fn calculate_synthetic_size_worker(
|
||||
// there is never any reason to exit calculate_synthetic_size_worker following any
|
||||
// return value -- we don't need to care about shutdown because no tenant is found when
|
||||
// pageserver is shut down.
|
||||
calculate_and_log(&tenant, &cancel, &ctx).await;
|
||||
calculate_and_log(&tenant, cancel, ctx).await;
|
||||
}
|
||||
|
||||
crate::tenant::tasks::warn_when_period_overrun(
|
||||
|
||||
@@ -59,14 +59,13 @@ use utils::{completion, id::TimelineId};
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::disk_usage_based_eviction::METRICS,
|
||||
task_mgr::{self, BACKGROUND_RUNTIME},
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
secondary::SecondaryTenant,
|
||||
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
|
||||
},
|
||||
CancellableTask, DiskUsageEvictionTask,
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -84,9 +83,17 @@ pub struct DiskUsageEvictionTaskConfig {
|
||||
|
||||
/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
|
||||
/// partitioning.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "type", content = "args")]
|
||||
pub enum EvictionOrder {
|
||||
/// Order the layers to be evicted by how recently they have been accessed in absolute
|
||||
/// time.
|
||||
///
|
||||
/// This strategy is unfair when some tenants grow faster than others towards the slower
|
||||
/// growing.
|
||||
#[default]
|
||||
AbsoluteAccessed,
|
||||
|
||||
/// Order the layers to be evicted by how recently they have been accessed relatively within
|
||||
/// the set of resident layers of a tenant.
|
||||
RelativeAccessed {
|
||||
@@ -101,14 +108,6 @@ pub enum EvictionOrder {
|
||||
},
|
||||
}
|
||||
|
||||
impl Default for EvictionOrder {
|
||||
fn default() -> Self {
|
||||
Self::RelativeAccessed {
|
||||
highest_layer_count_loses_first: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn default_highest_layer_count_loses_first() -> bool {
|
||||
true
|
||||
}
|
||||
@@ -118,6 +117,11 @@ impl EvictionOrder {
|
||||
use EvictionOrder::*;
|
||||
|
||||
match self {
|
||||
AbsoluteAccessed => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.last_activity_ts)
|
||||
});
|
||||
}
|
||||
RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
}),
|
||||
@@ -130,6 +134,7 @@ impl EvictionOrder {
|
||||
use EvictionOrder::*;
|
||||
|
||||
match self {
|
||||
AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
|
||||
RelativeAccessed {
|
||||
highest_layer_count_loses_first,
|
||||
} => {
|
||||
@@ -187,34 +192,36 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
state: Arc<State>,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
background_jobs_barrier: completion::Barrier,
|
||||
) -> Option<DiskUsageEvictionTask> {
|
||||
) -> anyhow::Result<()> {
|
||||
let Some(task_config) = &conf.disk_usage_based_eviction else {
|
||||
info!("disk usage based eviction task not configured");
|
||||
return None;
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
info!("launching disk usage based eviction task");
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::DiskUsageEviction,
|
||||
None,
|
||||
None,
|
||||
"disk usage based eviction",
|
||||
{
|
||||
let cancel = cancel.clone();
|
||||
async move {
|
||||
// wait until initial load is complete, because we cannot evict from loading tenants.
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return anyhow::Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => { }
|
||||
};
|
||||
false,
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
|
||||
disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel)
|
||||
.await;
|
||||
anyhow::Ok(())
|
||||
}
|
||||
// wait until initial load is complete, because we cannot evict from loading tenants.
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => { }
|
||||
};
|
||||
|
||||
disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
|
||||
Ok(())
|
||||
},
|
||||
));
|
||||
);
|
||||
|
||||
Some(DiskUsageEvictionTask(CancellableTask { cancel, task }))
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
|
||||
@@ -414,7 +414,7 @@ paths:
|
||||
Either archives or unarchives the given timeline.
|
||||
An archived timeline may not have any non-archived children.
|
||||
requestBody:
|
||||
required: true
|
||||
required: false
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
|
||||
@@ -1676,10 +1676,6 @@ async fn timeline_checkpoint_handler(
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
|
||||
// By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
|
||||
let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
|
||||
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
@@ -1696,17 +1692,15 @@ async fn timeline_checkpoint_handler(
|
||||
|
||||
}
|
||||
})?;
|
||||
if compact {
|
||||
timeline
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.await
|
||||
.map_err(|e|
|
||||
match e {
|
||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||
CompactionError::Other(e) => ApiError::InternalServerError(e)
|
||||
}
|
||||
)?;
|
||||
}
|
||||
timeline
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.await
|
||||
.map_err(|e|
|
||||
match e {
|
||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||
CompactionError::Other(e) => ApiError::InternalServerError(e)
|
||||
}
|
||||
)?;
|
||||
|
||||
if wait_until_uploaded {
|
||||
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -13,7 +13,6 @@ pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod l0_flush;
|
||||
pub use pageserver_api::keyspace;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
pub mod aux_file;
|
||||
pub mod metrics;
|
||||
pub mod page_cache;
|
||||
@@ -33,10 +32,7 @@ pub mod walredo;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
use tenant::{
|
||||
mgr::{BackgroundPurges, TenantManager},
|
||||
secondary,
|
||||
};
|
||||
use tenant::mgr::TenantManager;
|
||||
use tracing::info;
|
||||
|
||||
/// Current storage format version
|
||||
@@ -58,39 +54,17 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
pub use crate::metrics::preinitialize_metrics;
|
||||
|
||||
pub struct CancellableTask {
|
||||
pub task: tokio::task::JoinHandle<()>,
|
||||
pub cancel: CancellationToken,
|
||||
}
|
||||
pub struct HttpEndpointListener(pub CancellableTask);
|
||||
pub struct LibpqEndpointListener(pub CancellableTask);
|
||||
pub struct ConsumptionMetricsTasks(pub CancellableTask);
|
||||
pub struct DiskUsageEvictionTask(pub CancellableTask);
|
||||
impl CancellableTask {
|
||||
pub async fn shutdown(self) {
|
||||
self.cancel.cancel();
|
||||
self.task.await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(%exit_code))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn shutdown_pageserver(
|
||||
http_listener: HttpEndpointListener,
|
||||
libpq_listener: LibpqEndpointListener,
|
||||
consumption_metrics_worker: ConsumptionMetricsTasks,
|
||||
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
|
||||
tenant_manager: &TenantManager,
|
||||
background_purges: BackgroundPurges,
|
||||
mut deletion_queue: DeletionQueue,
|
||||
secondary_controller_tasks: secondary::GlobalTasks,
|
||||
exit_code: i32,
|
||||
) {
|
||||
use std::time::Duration;
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
timed(
|
||||
libpq_listener.0.shutdown(),
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
|
||||
"shutdown LibpqEndpointListener",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
@@ -117,44 +91,16 @@ pub async fn shutdown_pageserver(
|
||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||
|
||||
timed(
|
||||
consumption_metrics_worker.0.shutdown(),
|
||||
"shutdown consumption metrics",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
timed(
|
||||
futures::future::OptionFuture::from(disk_usage_eviction_task.map(|t| t.0.shutdown())),
|
||||
"shutdown disk usage eviction",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
timed(
|
||||
background_purges.shutdown(),
|
||||
"shutdown background purges",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
||||
timed(
|
||||
http_listener.0.shutdown(),
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
|
||||
"shutdown http",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
timed(
|
||||
secondary_controller_tasks.wait(), // cancellation happened in caller
|
||||
"secondary controller wait",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
// There should be nothing left, but let's be sure
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(None, None, None),
|
||||
|
||||
@@ -124,6 +124,7 @@ pub async fn libpq_listener_main(
|
||||
None,
|
||||
None,
|
||||
"serving compute connection task",
|
||||
false,
|
||||
page_service_conn_main(
|
||||
tenant_manager.clone(),
|
||||
local_auth,
|
||||
|
||||
@@ -284,16 +284,6 @@ impl Timeline {
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
|
||||
return Ok(true);
|
||||
}
|
||||
// then check if the database was already initialized.
|
||||
// get_rel_exists can be called before dbdir is created.
|
||||
let buf = version.get(self, DBDIR_KEY, ctx).await?;
|
||||
let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => Ok(dir.dbdirs),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}?;
|
||||
if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
|
||||
return Ok(false);
|
||||
}
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
|
||||
@@ -408,6 +408,7 @@ pub fn spawn<F>(
|
||||
tenant_shard_id: Option<TenantShardId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
name: &str,
|
||||
shutdown_process_on_error: bool,
|
||||
future: F,
|
||||
) -> PageserverTaskId
|
||||
where
|
||||
@@ -436,6 +437,7 @@ where
|
||||
task_id,
|
||||
task_cloned,
|
||||
cancel,
|
||||
shutdown_process_on_error,
|
||||
future,
|
||||
));
|
||||
task_mut.join_handle = Some(join_handle);
|
||||
@@ -452,78 +454,82 @@ async fn task_wrapper<F>(
|
||||
task_id: u64,
|
||||
task: Arc<PageServerTask>,
|
||||
shutdown_token: CancellationToken,
|
||||
shutdown_process_on_error: bool,
|
||||
future: F,
|
||||
) where
|
||||
F: Future<Output = anyhow::Result<()>> + Send + 'static,
|
||||
{
|
||||
debug!("Starting task '{}'", task_name);
|
||||
|
||||
// wrap the future so we log panics and errors
|
||||
let tenant_shard_id = task.tenant_shard_id;
|
||||
let timeline_id = task.timeline_id;
|
||||
let fut = async move {
|
||||
// We use AssertUnwindSafe here so that the payload function
|
||||
// doesn't need to be UnwindSafe. We don't do anything after the
|
||||
// unwinding that would expose us to unwind-unsafe behavior.
|
||||
let result = AssertUnwindSafe(future).catch_unwind().await;
|
||||
let result = SHUTDOWN_TOKEN
|
||||
.scope(
|
||||
shutdown_token,
|
||||
CURRENT_TASK.scope(task, {
|
||||
// We use AssertUnwindSafe here so that the payload function
|
||||
// doesn't need to be UnwindSafe. We don't do anything after the
|
||||
// unwinding that would expose us to unwind-unsafe behavior.
|
||||
AssertUnwindSafe(future).catch_unwind()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
task_finish(result, task_name, task_id, shutdown_process_on_error).await;
|
||||
}
|
||||
|
||||
async fn task_finish(
|
||||
result: std::result::Result<
|
||||
anyhow::Result<()>,
|
||||
std::boxed::Box<dyn std::any::Any + std::marker::Send>,
|
||||
>,
|
||||
task_name: String,
|
||||
task_id: u64,
|
||||
shutdown_process_on_error: bool,
|
||||
) {
|
||||
// Remove our entry from the global hashmap.
|
||||
let task = TASKS
|
||||
.lock()
|
||||
.unwrap()
|
||||
.remove(&task_id)
|
||||
.expect("no task in registry");
|
||||
|
||||
let mut shutdown_process = false;
|
||||
{
|
||||
match result {
|
||||
Ok(Ok(())) => {
|
||||
debug!("Task '{}' exited normally", task_name);
|
||||
}
|
||||
Ok(Err(err)) => {
|
||||
error!(
|
||||
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
task_name, tenant_shard_id, timeline_id, err
|
||||
);
|
||||
if shutdown_process_on_error {
|
||||
error!(
|
||||
"Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
task_name, task.tenant_shard_id, task.timeline_id, err
|
||||
);
|
||||
shutdown_process = true;
|
||||
} else {
|
||||
error!(
|
||||
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
task_name, task.tenant_shard_id, task.timeline_id, err
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
task_name, tenant_shard_id, timeline_id, err
|
||||
);
|
||||
if shutdown_process_on_error {
|
||||
error!(
|
||||
"Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
task_name, task.tenant_shard_id, task.timeline_id, err
|
||||
);
|
||||
shutdown_process = true;
|
||||
} else {
|
||||
error!(
|
||||
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
task_name, task.tenant_shard_id, task.timeline_id, err
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// add the task-locals
|
||||
let fut = CURRENT_TASK.scope(task, fut);
|
||||
let fut = SHUTDOWN_TOKEN.scope(shutdown_token, fut);
|
||||
|
||||
// poll future to completion
|
||||
fut.await;
|
||||
|
||||
// Remove our entry from the global hashmap.
|
||||
TASKS
|
||||
.lock()
|
||||
.unwrap()
|
||||
.remove(&task_id)
|
||||
.expect("no task in registry");
|
||||
}
|
||||
|
||||
pub async fn exit_on_panic_or_error<T, E>(
|
||||
task_name: &'static str,
|
||||
future: impl Future<Output = Result<T, E>>,
|
||||
) -> T
|
||||
where
|
||||
E: std::fmt::Debug,
|
||||
{
|
||||
// We use AssertUnwindSafe here so that the payload function
|
||||
// doesn't need to be UnwindSafe. We don't do anything after the
|
||||
// unwinding that would expose us to unwind-unsafe behavior.
|
||||
let result = AssertUnwindSafe(future).catch_unwind().await;
|
||||
match result {
|
||||
Ok(Ok(val)) => val,
|
||||
Ok(Err(err)) => {
|
||||
error!(
|
||||
task_name,
|
||||
"Task exited with error, exiting process: {err:?}"
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
Err(panic_obj) => {
|
||||
error!(task_name, "Task panicked, exiting process: {panic_obj:?}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
if shutdown_process {
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -31,7 +31,6 @@ use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
@@ -96,12 +95,14 @@ use crate::tenant::storage_layer::ImageLayer;
|
||||
use crate::walredo;
|
||||
use crate::InitializationOrder;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::Debug;
|
||||
use std::fmt::Display;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::ops::Bound::Included;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
@@ -720,6 +721,7 @@ impl Tenant {
|
||||
Some(tenant_shard_id),
|
||||
None,
|
||||
"attach tenant",
|
||||
false,
|
||||
async move {
|
||||
|
||||
info!(
|
||||
@@ -1764,9 +1766,6 @@ impl Tenant {
|
||||
.values()
|
||||
.filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));
|
||||
|
||||
// Before activation, populate each Timeline's GcInfo with information about its children
|
||||
self.initialize_gc_info(&timelines_accessor);
|
||||
|
||||
// Spawn gc and compaction loops. The loops will shut themselves
|
||||
// down when they notice that the tenant is inactive.
|
||||
tasks::start_background_loops(self, background_jobs_can_start);
|
||||
@@ -2800,55 +2799,6 @@ impl Tenant {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Populate all Timelines' `GcInfo` with information about their children. We do not set the
|
||||
/// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`]
|
||||
///
|
||||
/// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion.
|
||||
fn initialize_gc_info(
|
||||
&self,
|
||||
timelines: &std::sync::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
|
||||
) {
|
||||
// This function must be called before activation: after activation timeline create/delete operations
|
||||
// might happen, and this function is not safe to run concurrently with those.
|
||||
assert!(!self.is_active());
|
||||
|
||||
// Scan all timelines. For each timeline, remember the timeline ID and
|
||||
// the branch point where it was created.
|
||||
let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> = BTreeMap::new();
|
||||
timelines.iter().for_each(|(timeline_id, timeline_entry)| {
|
||||
if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
|
||||
let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default();
|
||||
ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id));
|
||||
}
|
||||
});
|
||||
|
||||
// The number of bytes we always keep, irrespective of PITR: this is a constant across timelines
|
||||
let horizon = self.get_gc_horizon();
|
||||
|
||||
// Populate each timeline's GcInfo with information about its child branches
|
||||
for timeline in timelines.values() {
|
||||
let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
|
||||
.remove(&timeline.timeline_id)
|
||||
.unwrap_or_default();
|
||||
|
||||
branchpoints.sort_by_key(|b| b.0);
|
||||
|
||||
let mut target = timeline.gc_info.write().unwrap();
|
||||
|
||||
target.retain_lsns = branchpoints;
|
||||
|
||||
let space_cutoff = timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(horizon)
|
||||
.unwrap_or(Lsn(0));
|
||||
|
||||
target.cutoffs = GcCutoffs {
|
||||
space: space_cutoff,
|
||||
time: Lsn::INVALID,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
async fn refresh_gc_info_internal(
|
||||
&self,
|
||||
target_timeline_id: Option<TimelineId>,
|
||||
@@ -2871,11 +2821,6 @@ impl Tenant {
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if target_timeline_id.is_some() && timelines.is_empty() {
|
||||
// We were to act on a particular timeline and it wasn't found
|
||||
return Err(GcError::TimelineNotFound);
|
||||
}
|
||||
|
||||
let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
|
||||
HashMap::with_capacity(timelines.len());
|
||||
|
||||
@@ -2898,63 +2843,68 @@ impl Tenant {
|
||||
// because that will stall branch creation.
|
||||
let gc_cs = self.gc_cs.lock().await;
|
||||
|
||||
// Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they
|
||||
// depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here
|
||||
// and fail out if it's inaccurate.
|
||||
// (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427)
|
||||
{
|
||||
let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> =
|
||||
BTreeMap::new();
|
||||
timelines.iter().for_each(|timeline| {
|
||||
if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() {
|
||||
let ancestor_children =
|
||||
all_branchpoints.entry(*ancestor_timeline_id).or_default();
|
||||
ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id));
|
||||
}
|
||||
});
|
||||
|
||||
for timeline in &timelines {
|
||||
let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
|
||||
.remove(&timeline.timeline_id)
|
||||
.unwrap_or_default();
|
||||
|
||||
branchpoints.sort_by_key(|b| b.0);
|
||||
|
||||
let target = timeline.gc_info.read().unwrap();
|
||||
|
||||
// We require that retain_lsns contains everything in `branchpoints`, but not that
|
||||
// they are exactly equal: timeline deletions can race with us, so retain_lsns
|
||||
// may contain some extra stuff. It is safe to have extra timelines in there, because it
|
||||
// just means that we retain slightly more data than we otherwise might.
|
||||
let have_branchpoints = target.retain_lsns.iter().copied().collect::<HashSet<_>>();
|
||||
for b in &branchpoints {
|
||||
if !have_branchpoints.contains(b) {
|
||||
tracing::error!(
|
||||
"Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}",
|
||||
branchpoints,
|
||||
target.retain_lsns
|
||||
);
|
||||
debug_assert!(false);
|
||||
// Do not GC based on bad information!
|
||||
// (ab-use an existing GcError type rather than adding a new one, since this is a
|
||||
// "should never happen" check that will be removed soon).
|
||||
return Err(GcError::Remote(anyhow::anyhow!(
|
||||
"retain_lsns failed validation!"
|
||||
)));
|
||||
// Scan all timelines. For each timeline, remember the timeline ID and
|
||||
// the branch point where it was created.
|
||||
let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let mut all_branchpoints = BTreeSet::new();
|
||||
let timelines = {
|
||||
if let Some(target_timeline_id) = target_timeline_id.as_ref() {
|
||||
if timelines.get(target_timeline_id).is_none() {
|
||||
return Err(GcError::TimelineNotFound);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
timelines
|
||||
.iter()
|
||||
.map(|(_timeline_id, timeline_entry)| {
|
||||
if let Some(ancestor_timeline_id) =
|
||||
&timeline_entry.get_ancestor_timeline_id()
|
||||
{
|
||||
// If target_timeline is specified, we only need to know branchpoints of its children
|
||||
if let Some(timeline_id) = target_timeline_id {
|
||||
if ancestor_timeline_id == &timeline_id {
|
||||
all_branchpoints.insert((
|
||||
*ancestor_timeline_id,
|
||||
timeline_entry.get_ancestor_lsn(),
|
||||
));
|
||||
}
|
||||
}
|
||||
// Collect branchpoints for all timelines
|
||||
else {
|
||||
all_branchpoints.insert((
|
||||
*ancestor_timeline_id,
|
||||
timeline_entry.get_ancestor_lsn(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
timeline_entry.clone()
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
(all_branchpoints, timelines)
|
||||
};
|
||||
|
||||
// Ok, we now know all the branch points.
|
||||
// Update the GC information for each timeline.
|
||||
let mut gc_timelines = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
// We filtered the timeline list above
|
||||
// If target_timeline is specified, ignore all other timelines
|
||||
if let Some(target_timeline_id) = target_timeline_id {
|
||||
assert_eq!(target_timeline_id, timeline.timeline_id);
|
||||
if timeline.timeline_id != target_timeline_id {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let branchpoints: Vec<Lsn> = all_branchpoints
|
||||
.range((
|
||||
Included((timeline.timeline_id, Lsn(0))),
|
||||
Included((timeline.timeline_id, Lsn(u64::MAX))),
|
||||
))
|
||||
.map(|&x| x.1)
|
||||
.collect();
|
||||
|
||||
{
|
||||
let mut target = timeline.gc_info.write().unwrap();
|
||||
|
||||
@@ -2992,12 +2942,20 @@ impl Tenant {
|
||||
.0,
|
||||
);
|
||||
|
||||
// Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline?
|
||||
// - this timeline was created while we were finding cutoffs
|
||||
// - lsn for timestamp search fails for this timeline repeatedly
|
||||
if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
|
||||
target.cutoffs = cutoffs.clone();
|
||||
}
|
||||
match gc_cutoffs.remove(&timeline.timeline_id) {
|
||||
Some(cutoffs) => {
|
||||
target.retain_lsns = branchpoints;
|
||||
target.cutoffs = cutoffs;
|
||||
}
|
||||
None => {
|
||||
// reasons for this being unavailable:
|
||||
// - this timeline was created while we were finding cutoffs
|
||||
// - lsn for timestamp search fails for this timeline repeatedly
|
||||
//
|
||||
// in both cases, refreshing the branchpoints is correct.
|
||||
target.retain_lsns = branchpoints;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
gc_timelines.push(timeline);
|
||||
@@ -4035,7 +3993,6 @@ mod tests {
|
||||
use storage_layer::PersistentLayerKey;
|
||||
use tests::storage_layer::ValuesReconstructState;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
use timeline::{DeltaLayerTestDesc, GcInfo};
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::TenantId;
|
||||
@@ -4386,7 +4343,7 @@ mod tests {
|
||||
{
|
||||
let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
|
||||
assert_eq!(branchpoints.len(), 1);
|
||||
assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID));
|
||||
assert_eq!(branchpoints[0], Lsn(0x40));
|
||||
}
|
||||
|
||||
// You can read the key from the child branch even though the parent is
|
||||
@@ -7258,323 +7215,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_generate_key_retention() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_generate_key_retention").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
tline.force_advance_lsn(Lsn(0x70));
|
||||
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let history = vec![
|
||||
(
|
||||
key,
|
||||
Lsn(0x10),
|
||||
Value::Image(Bytes::copy_from_slice(b"0x10")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x50),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x50")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x60),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x70),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x80),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x80")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x90),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
|
||||
),
|
||||
];
|
||||
let res = tline
|
||||
.generate_key_retention(
|
||||
key,
|
||||
&history,
|
||||
Lsn(0x60),
|
||||
&[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
|
||||
3,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let expected_res = KeyHistoryRetention {
|
||||
below_horizon: vec![
|
||||
(
|
||||
Lsn(0x20),
|
||||
KeyLogAtLsn(vec![(
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::copy_from_slice(b"0x10;0x20")),
|
||||
)]),
|
||||
),
|
||||
(
|
||||
Lsn(0x40),
|
||||
KeyLogAtLsn(vec![
|
||||
(
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
|
||||
),
|
||||
(
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
|
||||
),
|
||||
]),
|
||||
),
|
||||
(
|
||||
Lsn(0x50),
|
||||
KeyLogAtLsn(vec![(
|
||||
Lsn(0x50),
|
||||
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40;0x50")),
|
||||
)]),
|
||||
),
|
||||
(
|
||||
Lsn(0x60),
|
||||
KeyLogAtLsn(vec![(
|
||||
Lsn(0x60),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
|
||||
)]),
|
||||
),
|
||||
],
|
||||
above_horizon: KeyLogAtLsn(vec![
|
||||
(
|
||||
Lsn(0x70),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
|
||||
),
|
||||
(
|
||||
Lsn(0x80),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x80")),
|
||||
),
|
||||
(
|
||||
Lsn(0x90),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
|
||||
),
|
||||
]),
|
||||
};
|
||||
assert_eq!(res, expected_res);
|
||||
// TODO: more tests with mixed image + delta, adding with k-merge test cases; e2e compaction test
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> {
|
||||
let harness =
|
||||
TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(2),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x28),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(6),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
|
||||
], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![
|
||||
(Lsn(0x10), tline.timeline_id),
|
||||
(Lsn(0x20), tline.timeline_id),
|
||||
],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let expected_result = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10@0x30"),
|
||||
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10@0x20"),
|
||||
Bytes::from_static(b"value 6@0x10@0x20"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10@0x48"),
|
||||
Bytes::from_static(b"value 9@0x10@0x48"),
|
||||
];
|
||||
|
||||
let expected_result_at_gc_horizon = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10@0x30"),
|
||||
Bytes::from_static(b"value 3@0x10@0x28@0x30"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10@0x20"),
|
||||
Bytes::from_static(b"value 6@0x10@0x20"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_20 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10@0x20"),
|
||||
Bytes::from_static(b"value 6@0x10@0x20"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_10 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let verify_result = || async {
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x30), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_gc_horizon[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x20), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_20[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x10), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_10[idx]
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
verify_result().await;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
|
||||
verify_result().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -463,7 +463,7 @@ impl LayerMap {
|
||||
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
|
||||
|
||||
if Self::is_l0(&layer_desc.key_range) {
|
||||
if Self::is_l0(&layer_desc) {
|
||||
self.l0_delta_layers.push(layer_desc.clone().into());
|
||||
}
|
||||
|
||||
@@ -482,7 +482,7 @@ impl LayerMap {
|
||||
self.historic
|
||||
.remove(historic_layer_coverage::LayerKey::from(layer_desc));
|
||||
let layer_key = layer_desc.key();
|
||||
if Self::is_l0(&layer_desc.key_range) {
|
||||
if Self::is_l0(layer_desc) {
|
||||
let len_before = self.l0_delta_layers.len();
|
||||
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
|
||||
l0_delta_layers.retain(|other| other.key() != layer_key);
|
||||
@@ -598,9 +598,8 @@ impl LayerMap {
|
||||
coverage
|
||||
}
|
||||
|
||||
/// Check if the key range resembles that of an L0 layer.
|
||||
pub fn is_l0(key_range: &Range<Key>) -> bool {
|
||||
key_range == &(Key::MIN..Key::MAX)
|
||||
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
|
||||
layer.get_key_range() == (Key::MIN..Key::MAX)
|
||||
}
|
||||
|
||||
/// This function determines which layers are counted in `count_deltas`:
|
||||
@@ -627,7 +626,7 @@ impl LayerMap {
|
||||
/// than just the current partition_range.
|
||||
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
|
||||
// Case 1
|
||||
if !Self::is_l0(&layer.key_range) {
|
||||
if !Self::is_l0(layer) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@ use crate::control_plane_client::{
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
|
||||
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
|
||||
};
|
||||
@@ -225,98 +225,26 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
|
||||
Ok(tmp_path)
|
||||
}
|
||||
|
||||
/// See [`Self::spawn`].
|
||||
#[derive(Clone)]
|
||||
pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
|
||||
enum BackgroundPurgesInner {
|
||||
Open(tokio::task::JoinSet<()>),
|
||||
// we use the async mutex for coalescing
|
||||
ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
|
||||
}
|
||||
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
|
||||
/// the background, and thereby avoid blocking any API requests on this deletion completing.
|
||||
fn spawn_background_purge(tmp_path: Utf8PathBuf) {
|
||||
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
|
||||
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
|
||||
let task_tenant_id = None;
|
||||
|
||||
impl Default for BackgroundPurges {
|
||||
fn default() -> Self {
|
||||
Self(Arc::new(std::sync::Mutex::new(
|
||||
BackgroundPurgesInner::Open(JoinSet::new()),
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
impl BackgroundPurges {
|
||||
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
|
||||
/// the background, and thereby avoid blocking any API requests on this deletion completing.
|
||||
///
|
||||
/// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
|
||||
/// Thus the [`BackgroundPurges`] type to keep track of these tasks.
|
||||
pub fn spawn(&self, tmp_path: Utf8PathBuf) {
|
||||
let mut guard = self.0.lock().unwrap();
|
||||
let jset = match &mut *guard {
|
||||
BackgroundPurgesInner::Open(ref mut jset) => jset,
|
||||
BackgroundPurgesInner::ShuttingDown(_) => {
|
||||
warn!("trying to spawn background purge during shutdown, ignoring");
|
||||
return;
|
||||
}
|
||||
};
|
||||
jset.spawn_on(
|
||||
async move {
|
||||
if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
|
||||
// should we fatal_io_error here?
|
||||
warn!(%error, path=%tmp_path, "failed to purge tenant directory");
|
||||
}
|
||||
}
|
||||
.instrument(info_span!(parent: None, "background_purge")),
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
);
|
||||
}
|
||||
|
||||
/// When this future completes, all background purges have completed.
|
||||
/// The first poll of the future will already lock out new background purges spawned via [`Self::spawn`].
|
||||
///
|
||||
/// Concurrent calls will coalesce.
|
||||
///
|
||||
/// # Cancellation-Safety
|
||||
///
|
||||
/// If this future is dropped before polled to completion, concurrent and subsequent
|
||||
/// instances of this future will continue to be correct.
|
||||
#[instrument(skip_all)]
|
||||
pub async fn shutdown(&self) {
|
||||
let jset = {
|
||||
let mut guard = self.0.lock().unwrap();
|
||||
match &mut *guard {
|
||||
BackgroundPurgesInner::Open(jset) => {
|
||||
*guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
|
||||
std::mem::take(jset),
|
||||
)))
|
||||
}
|
||||
BackgroundPurgesInner::ShuttingDown(_) => {
|
||||
// calling shutdown multiple times is most likely a bug in pageserver shutdown code
|
||||
warn!("already shutting down");
|
||||
}
|
||||
};
|
||||
match &mut *guard {
|
||||
BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
|
||||
BackgroundPurgesInner::Open(_) => {
|
||||
unreachable!("above code transitions into shut down state");
|
||||
}
|
||||
}
|
||||
};
|
||||
let mut jset = jset.lock().await; // concurrent callers coalesce here
|
||||
while let Some(res) = jset.join_next().await {
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.is_panic() => {
|
||||
// If it panicked, the error is already logged by the panic hook.
|
||||
}
|
||||
Err(e) if e.is_cancelled() => {
|
||||
unreachable!("we don't cancel the joinset or runtime")
|
||||
}
|
||||
Err(e) => {
|
||||
// No idea when this can happen, but let's log it.
|
||||
warn!(%e, "background purge task failed or panicked");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MgmtRequest,
|
||||
task_tenant_id,
|
||||
None,
|
||||
"tenant_files_delete",
|
||||
false,
|
||||
async move {
|
||||
fs::remove_dir_all(tmp_path.as_path())
|
||||
.await
|
||||
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
|
||||
@@ -342,8 +270,6 @@ pub struct TenantManager {
|
||||
// tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
|
||||
// when the tenant detaches.
|
||||
cancel: CancellationToken,
|
||||
|
||||
background_purges: BackgroundPurges,
|
||||
}
|
||||
|
||||
fn emergency_generations(
|
||||
@@ -521,7 +447,6 @@ pub(crate) enum DeleteTenantError {
|
||||
#[instrument(skip_all)]
|
||||
pub async fn init_tenant_mgr(
|
||||
conf: &'static PageServerConf,
|
||||
background_purges: BackgroundPurges,
|
||||
resources: TenantSharedResources,
|
||||
init_order: InitializationOrder,
|
||||
cancel: CancellationToken,
|
||||
@@ -587,7 +512,7 @@ pub async fn init_tenant_mgr(
|
||||
|
||||
match safe_rename_tenant_dir(&tenant_dir_path).await {
|
||||
Ok(tmp_path) => {
|
||||
background_purges.spawn(tmp_path);
|
||||
spawn_background_purge(tmp_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
@@ -709,7 +634,6 @@ pub async fn init_tenant_mgr(
|
||||
tenants: &TENANTS,
|
||||
resources,
|
||||
cancel: CancellationToken::new(),
|
||||
background_purges,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1429,7 +1353,6 @@ impl TenantManager {
|
||||
|
||||
async fn delete_local(
|
||||
conf: &PageServerConf,
|
||||
background_purges: &BackgroundPurges,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_tenant_directory = conf.tenant_path(tenant_shard_id);
|
||||
@@ -1438,7 +1361,7 @@ impl TenantManager {
|
||||
.with_context(|| {
|
||||
format!("local tenant directory {local_tenant_directory:?} rename")
|
||||
})?;
|
||||
background_purges.spawn(tmp_dir);
|
||||
spawn_background_purge(tmp_dir);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1456,12 +1379,12 @@ impl TenantManager {
|
||||
barrier.wait().await;
|
||||
}
|
||||
}
|
||||
delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?;
|
||||
delete_local(self.conf, &tenant_shard_id).await?;
|
||||
}
|
||||
Some(TenantSlot::Secondary(secondary_tenant)) => {
|
||||
secondary_tenant.shutdown().await;
|
||||
|
||||
delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?;
|
||||
delete_local(self.conf, &tenant_shard_id).await?;
|
||||
}
|
||||
Some(TenantSlot::InProgress(_)) => unreachable!(),
|
||||
None => {}
|
||||
@@ -1732,7 +1655,7 @@ impl TenantManager {
|
||||
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
|
||||
self.background_purges.spawn(tmp_path);
|
||||
spawn_background_purge(tmp_path);
|
||||
|
||||
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
@@ -1908,7 +1831,7 @@ impl TenantManager {
|
||||
let tmp_path = self
|
||||
.detach_tenant0(conf, tenant_shard_id, deletion_queue_client)
|
||||
.await?;
|
||||
self.background_purges.spawn(tmp_path);
|
||||
spawn_background_purge(tmp_path);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1525,6 +1525,7 @@ impl RemoteTimelineClient {
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
"remote upload",
|
||||
false,
|
||||
async move {
|
||||
self_rc.perform_upload_task(task).await;
|
||||
Ok(())
|
||||
|
||||
@@ -31,7 +31,6 @@ use pageserver_api::{
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::instrument;
|
||||
use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
|
||||
@@ -294,50 +293,15 @@ impl SecondaryController {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct GlobalTasks {
|
||||
cancel: CancellationToken,
|
||||
uploader: JoinHandle<()>,
|
||||
downloader: JoinHandle<()>,
|
||||
}
|
||||
|
||||
impl GlobalTasks {
|
||||
/// Caller is responsible for requesting shutdown via the cancellation token that was
|
||||
/// passed to [`spawn_tasks`].
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// This method panics if that token is not cancelled.
|
||||
/// This is low-risk because we're calling this during process shutdown, so, a panic
|
||||
/// will be informative but not cause undue downtime.
|
||||
pub async fn wait(self) {
|
||||
let Self {
|
||||
cancel,
|
||||
uploader,
|
||||
downloader,
|
||||
} = self;
|
||||
assert!(
|
||||
cancel.is_cancelled(),
|
||||
"must cancel cancellation token, otherwise the tasks will not shut down"
|
||||
);
|
||||
|
||||
let (uploader, downloader) = futures::future::join(uploader, downloader).await;
|
||||
uploader.expect(
|
||||
"unreachable: exit_on_panic_or_error would catch the panic and exit the process",
|
||||
);
|
||||
downloader.expect(
|
||||
"unreachable: exit_on_panic_or_error would catch the panic and exit the process",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn spawn_tasks(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
background_jobs_can_start: Barrier,
|
||||
cancel: CancellationToken,
|
||||
) -> (SecondaryController, GlobalTasks) {
|
||||
) -> SecondaryController {
|
||||
let mgr_clone = tenant_manager.clone();
|
||||
let storage_clone = remote_storage.clone();
|
||||
let cancel_clone = cancel.clone();
|
||||
let bg_jobs_clone = background_jobs_can_start.clone();
|
||||
|
||||
let (download_req_tx, download_req_rx) =
|
||||
@@ -345,9 +309,17 @@ pub fn spawn_tasks(
|
||||
let (upload_req_tx, upload_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
|
||||
|
||||
let cancel_clone = cancel.clone();
|
||||
let downloader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
let downloader_task_ctx = RequestContext::new(
|
||||
TaskKind::SecondaryDownloads,
|
||||
crate::context::DownloadBehavior::Download,
|
||||
);
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
downloader_task_ctx.task_kind(),
|
||||
None,
|
||||
None,
|
||||
"secondary tenant downloads",
|
||||
false,
|
||||
async move {
|
||||
downloader_task(
|
||||
mgr_clone,
|
||||
@@ -355,41 +327,49 @@ pub fn spawn_tasks(
|
||||
download_req_rx,
|
||||
bg_jobs_clone,
|
||||
cancel_clone,
|
||||
RequestContext::new(
|
||||
TaskKind::SecondaryDownloads,
|
||||
crate::context::DownloadBehavior::Download,
|
||||
),
|
||||
downloader_task_ctx,
|
||||
)
|
||||
.await;
|
||||
anyhow::Ok(())
|
||||
},
|
||||
));
|
||||
|
||||
let cancel_clone = cancel.clone();
|
||||
let uploader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::SecondaryUploads,
|
||||
None,
|
||||
None,
|
||||
"heatmap uploads",
|
||||
false,
|
||||
async move {
|
||||
heatmap_uploader_task(
|
||||
tenant_manager,
|
||||
remote_storage,
|
||||
upload_req_rx,
|
||||
background_jobs_can_start,
|
||||
cancel_clone,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
anyhow::Ok(())
|
||||
},
|
||||
));
|
||||
|
||||
(
|
||||
SecondaryController {
|
||||
upload_req_tx,
|
||||
download_req_tx,
|
||||
Ok(())
|
||||
},
|
||||
GlobalTasks {
|
||||
cancel,
|
||||
uploader,
|
||||
downloader,
|
||||
},
|
||||
)
|
||||
);
|
||||
|
||||
SecondaryController {
|
||||
download_req_tx,
|
||||
upload_req_tx,
|
||||
}
|
||||
}
|
||||
|
||||
/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
|
||||
pub fn null_controller() -> SecondaryController {
|
||||
let (download_req_tx, _download_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
|
||||
let (upload_req_tx, _upload_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
|
||||
SecondaryController {
|
||||
upload_req_tx,
|
||||
download_req_tx,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -264,10 +264,10 @@ pub(super) async fn gather_inputs(
|
||||
let mut lsns: Vec<(Lsn, LsnKind)> = gc_info
|
||||
.retain_lsns
|
||||
.iter()
|
||||
.filter(|(lsn, _child_id)| lsn > &ancestor_lsn)
|
||||
.filter(|&&lsn| lsn > ancestor_lsn)
|
||||
.copied()
|
||||
// this assumes there are no other retain_lsns than the branchpoints
|
||||
.map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint))
|
||||
.map(|lsn| (lsn, LsnKind::BranchPoint))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
|
||||
|
||||
@@ -10,18 +10,29 @@ pub mod merge_iterator;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Value;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use enum_map::EnumMap;
|
||||
use enumset::EnumSet;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::models::{
|
||||
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
use tracing::warn;
|
||||
use utils::history_buffer::HistoryBufferWithDropCounter;
|
||||
use utils::rate_limit::RateLimit;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
|
||||
pub use image_layer::{ImageLayer, ImageLayerWriter};
|
||||
@@ -64,9 +75,9 @@ where
|
||||
/// call, to collect more records.
|
||||
///
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct ValueReconstructState {
|
||||
pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pub(crate) img: Option<(Lsn, Bytes)>,
|
||||
pub struct ValueReconstructState {
|
||||
pub records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pub img: Option<(Lsn, Bytes)>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
|
||||
@@ -447,92 +458,94 @@ pub enum ValueReconstructResult {
|
||||
Missing,
|
||||
}
|
||||
|
||||
/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
|
||||
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
|
||||
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
|
||||
/// be used for cache management but not for correctness-critical checks.
|
||||
#[derive(Default, Debug, Clone, PartialEq, Eq)]
|
||||
pub(crate) enum LayerVisibilityHint {
|
||||
/// A Visible layer might be read while serving a read, because there is not an image layer between it
|
||||
/// and a readable LSN (the tip of the branch or a child's branch point)
|
||||
Visible,
|
||||
/// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
|
||||
/// a branch or ephemeral endpoint at an LSN below the layer that covers this.
|
||||
#[allow(unused)]
|
||||
Covered,
|
||||
/// Calculating layer visibilty requires I/O, so until this has happened layers are loaded
|
||||
/// in this state. Note that newly written layers may be called Visible immediately, this uninitialized
|
||||
/// state is for when existing layers are constructed while loading a timeline.
|
||||
#[default]
|
||||
Uninitialized,
|
||||
#[derive(Debug)]
|
||||
pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
|
||||
|
||||
/// This struct holds two instances of [`LayerAccessStatsInner`].
|
||||
/// Accesses are recorded to both instances.
|
||||
/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`].
|
||||
/// The `for_eviction_policy` is never reset.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
struct LayerAccessStatsLocked {
|
||||
for_scraping_api: LayerAccessStatsInner,
|
||||
for_eviction_policy: LayerAccessStatsInner,
|
||||
}
|
||||
|
||||
pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
|
||||
impl LayerAccessStatsLocked {
|
||||
fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
|
||||
[&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
struct LayerAccessStatsInner {
|
||||
first_access: Option<LayerAccessStatFullDetails>,
|
||||
count_by_access_kind: EnumMap<LayerAccessKind, u64>,
|
||||
task_kind_flag: EnumSet<TaskKind>,
|
||||
last_accesses: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
|
||||
last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) struct LayerAccessStatFullDetails {
|
||||
pub(crate) when: SystemTime,
|
||||
pub(crate) task_kind: TaskKind,
|
||||
pub(crate) access_kind: LayerAccessKind,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, strum_macros::EnumString)]
|
||||
pub(crate) enum LayerAccessStatsReset {
|
||||
pub enum LayerAccessStatsReset {
|
||||
NoReset,
|
||||
JustTaskKindFlags,
|
||||
AllStats,
|
||||
}
|
||||
|
||||
impl Default for LayerAccessStats {
|
||||
fn default() -> Self {
|
||||
// Default value is to assume resident since creation time, and visible.
|
||||
let (_mask, mut value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, SystemTime::now());
|
||||
value |= 0x1 << Self::VISIBILITY_SHIFT;
|
||||
fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
|
||||
ts.duration_since(UNIX_EPOCH)
|
||||
.expect("better to die in this unlikely case than report false stats")
|
||||
.as_millis()
|
||||
.try_into()
|
||||
.expect("64 bits is enough for few more years")
|
||||
}
|
||||
|
||||
Self(std::sync::atomic::AtomicU64::new(value))
|
||||
impl LayerAccessStatFullDetails {
|
||||
fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
|
||||
let Self {
|
||||
when,
|
||||
task_kind,
|
||||
access_kind,
|
||||
} = self;
|
||||
pageserver_api::models::LayerAccessStatFullDetails {
|
||||
when_millis_since_epoch: system_time_to_millis_since_epoch(when),
|
||||
task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros
|
||||
access_kind: *access_kind,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Efficient store of two very-low-resolution timestamps and some bits. Used for storing last access time and
|
||||
// last residence change time.
|
||||
impl LayerAccessStats {
|
||||
// How many high bits to drop from a u32 timestamp?
|
||||
// - Only storing up to a u32 timestamp will work fine until 2038 (if this code is still in use
|
||||
// after that, this software has been very successful!)
|
||||
// - Dropping the top bit is implicitly safe because unix timestamps are meant to be
|
||||
// stored in an i32, so they never used it.
|
||||
// - Dropping the next two bits is safe because this code is only running on systems in
|
||||
// years >= 2024, and these bits have been 1 since 2021
|
||||
//
|
||||
// Therefore we may store only 28 bits for a timestamp with one second resolution. We do
|
||||
// this truncation to make space for some flags in the high bits of our u64.
|
||||
const TS_DROP_HIGH_BITS: u32 = u32::count_ones(Self::TS_ONES) + 1;
|
||||
const TS_MASK: u32 = 0x1f_ff_ff_ff;
|
||||
const TS_ONES: u32 = 0x60_00_00_00;
|
||||
|
||||
const ATIME_SHIFT: u32 = 0;
|
||||
const RTIME_SHIFT: u32 = 32 - Self::TS_DROP_HIGH_BITS;
|
||||
const VISIBILITY_SHIFT: u32 = 64 - 2 * Self::TS_DROP_HIGH_BITS;
|
||||
|
||||
fn write_bits(&self, mask: u64, value: u64) -> u64 {
|
||||
self.0
|
||||
.fetch_update(
|
||||
// TODO: decide what orderings are correct
|
||||
std::sync::atomic::Ordering::Relaxed,
|
||||
std::sync::atomic::Ordering::Relaxed,
|
||||
|v| Some((v & !mask) | (value & mask)),
|
||||
)
|
||||
.expect("Inner function is infallible")
|
||||
/// Create an empty stats object.
|
||||
///
|
||||
/// The caller is responsible for recording a residence event
|
||||
/// using [`record_residence_event`] before calling `latest_activity`.
|
||||
/// If they don't, [`latest_activity`] will return `None`.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
/// [`latest_activity`]: Self::latest_activity
|
||||
pub(crate) fn empty_will_record_residence_event_later() -> Self {
|
||||
LayerAccessStats(Mutex::default())
|
||||
}
|
||||
|
||||
fn to_low_res_timestamp(shift: u32, time: SystemTime) -> (u64, u64) {
|
||||
// Drop the low three bits of the timestamp, for an ~8s accuracy
|
||||
let timestamp = time.duration_since(UNIX_EPOCH).unwrap().as_secs() & (Self::TS_MASK as u64);
|
||||
|
||||
((Self::TS_MASK as u64) << shift, timestamp << shift)
|
||||
}
|
||||
|
||||
fn read_low_res_timestamp(&self, shift: u32) -> Option<SystemTime> {
|
||||
let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
let ts_bits = (read & ((Self::TS_MASK as u64) << shift)) >> shift;
|
||||
if ts_bits == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(UNIX_EPOCH + Duration::from_secs(ts_bits | (Self::TS_ONES as u64)))
|
||||
}
|
||||
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
|
||||
///
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
///
|
||||
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
||||
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
|
||||
new
|
||||
}
|
||||
|
||||
/// Record a change in layer residency.
|
||||
@@ -548,64 +561,117 @@ impl LayerAccessStats {
|
||||
/// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
|
||||
/// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
|
||||
/// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
|
||||
pub(crate) fn record_residence_event_at(&self, now: SystemTime) {
|
||||
let (mask, value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, now);
|
||||
self.write_bits(mask, value);
|
||||
///
|
||||
pub(crate) fn record_residence_event(
|
||||
&self,
|
||||
status: LayerResidenceStatus,
|
||||
reason: LayerResidenceEventReason,
|
||||
) {
|
||||
let mut locked = self.0.lock().unwrap();
|
||||
locked.iter_mut().for_each(|inner| {
|
||||
inner
|
||||
.last_residence_changes
|
||||
.write(LayerResidenceEvent::new(status, reason))
|
||||
});
|
||||
}
|
||||
|
||||
pub(crate) fn record_residence_event(&self) {
|
||||
self.record_residence_event_at(SystemTime::now())
|
||||
}
|
||||
|
||||
pub(crate) fn record_access_at(&self, now: SystemTime) {
|
||||
let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
|
||||
|
||||
// A layer which is accessed must be visible.
|
||||
mask |= 0x1 << Self::VISIBILITY_SHIFT;
|
||||
value |= 0x1 << Self::VISIBILITY_SHIFT;
|
||||
|
||||
self.write_bits(mask, value);
|
||||
}
|
||||
|
||||
pub(crate) fn record_access(&self, ctx: &RequestContext) {
|
||||
fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
|
||||
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
|
||||
return;
|
||||
}
|
||||
|
||||
self.record_access_at(SystemTime::now())
|
||||
let this_access = LayerAccessStatFullDetails {
|
||||
when: SystemTime::now(),
|
||||
task_kind: ctx.task_kind(),
|
||||
access_kind,
|
||||
};
|
||||
|
||||
let mut locked = self.0.lock().unwrap();
|
||||
locked.iter_mut().for_each(|inner| {
|
||||
inner.first_access.get_or_insert(this_access);
|
||||
inner.count_by_access_kind[access_kind] += 1;
|
||||
inner.task_kind_flag |= ctx.task_kind();
|
||||
inner.last_accesses.write(this_access);
|
||||
})
|
||||
}
|
||||
|
||||
fn as_api_model(
|
||||
&self,
|
||||
reset: LayerAccessStatsReset,
|
||||
) -> pageserver_api::models::LayerAccessStats {
|
||||
let mut locked = self.0.lock().unwrap();
|
||||
let inner = &mut locked.for_scraping_api;
|
||||
let LayerAccessStatsInner {
|
||||
first_access,
|
||||
count_by_access_kind,
|
||||
task_kind_flag,
|
||||
last_accesses,
|
||||
last_residence_changes,
|
||||
} = inner;
|
||||
let ret = pageserver_api::models::LayerAccessStats {
|
||||
access_time: self
|
||||
.read_low_res_timestamp(Self::ATIME_SHIFT)
|
||||
.unwrap_or(UNIX_EPOCH),
|
||||
residence_time: self
|
||||
.read_low_res_timestamp(Self::RTIME_SHIFT)
|
||||
.unwrap_or(UNIX_EPOCH),
|
||||
visible: matches!(self.visibility(), LayerVisibilityHint::Visible),
|
||||
access_count_by_access_kind: count_by_access_kind
|
||||
.iter()
|
||||
.map(|(kind, count)| (kind, *count))
|
||||
.collect(),
|
||||
task_kind_access_flag: task_kind_flag
|
||||
.iter()
|
||||
.map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros
|
||||
.collect(),
|
||||
first: first_access.as_ref().map(|a| a.as_api_model()),
|
||||
accesses_history: last_accesses.map(|m| m.as_api_model()),
|
||||
residence_events_history: last_residence_changes.clone(),
|
||||
};
|
||||
match reset {
|
||||
LayerAccessStatsReset::NoReset => {}
|
||||
LayerAccessStatsReset::NoReset => (),
|
||||
LayerAccessStatsReset::JustTaskKindFlags => {
|
||||
inner.task_kind_flag.clear();
|
||||
}
|
||||
LayerAccessStatsReset::AllStats => {
|
||||
self.write_bits((Self::TS_MASK as u64) << Self::ATIME_SHIFT, 0x0);
|
||||
self.write_bits((Self::TS_MASK as u64) << Self::RTIME_SHIFT, 0x0);
|
||||
*inner = LayerAccessStatsInner::default();
|
||||
}
|
||||
}
|
||||
ret
|
||||
}
|
||||
|
||||
/// Get the latest access timestamp, falling back to latest residence event. The latest residence event
|
||||
/// will be this Layer's construction time, if its residence hasn't changed since then.
|
||||
pub(crate) fn latest_activity(&self) -> SystemTime {
|
||||
if let Some(t) = self.read_low_res_timestamp(Self::ATIME_SHIFT) {
|
||||
t
|
||||
} else {
|
||||
self.read_low_res_timestamp(Self::RTIME_SHIFT)
|
||||
.expect("Residence time is set on construction")
|
||||
/// Get the latest access timestamp, falling back to latest residence event, further falling
|
||||
/// back to `SystemTime::now` for a usable timestamp for eviction.
|
||||
pub(crate) fn latest_activity_or_now(&self) -> SystemTime {
|
||||
self.latest_activity().unwrap_or_else(SystemTime::now)
|
||||
}
|
||||
|
||||
/// Get the latest access timestamp, falling back to latest residence event.
|
||||
///
|
||||
/// This function can only return `None` if there has not yet been a call to the
|
||||
/// [`record_residence_event`] method. That would generally be considered an
|
||||
/// implementation error. This function logs a rate-limited warning in that case.
|
||||
///
|
||||
/// TODO: use type system to avoid the need for `fallback`.
|
||||
/// The approach in <https://github.com/neondatabase/neon/pull/3775>
|
||||
/// could be used to enforce that a residence event is recorded
|
||||
/// before a layer is added to the layer map. We could also have
|
||||
/// a layer wrapper type that holds the LayerAccessStats, and ensure
|
||||
/// that that type can only be produced by inserting into the layer map.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
fn latest_activity(&self) -> Option<SystemTime> {
|
||||
let locked = self.0.lock().unwrap();
|
||||
let inner = &locked.for_eviction_policy;
|
||||
match inner.last_accesses.recent() {
|
||||
Some(a) => Some(a.when),
|
||||
None => match inner.last_residence_changes.recent() {
|
||||
Some(e) => Some(e.timestamp),
|
||||
None => {
|
||||
static WARN_RATE_LIMIT: Lazy<Mutex<(usize, RateLimit)>> =
|
||||
Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10)))));
|
||||
let mut guard = WARN_RATE_LIMIT.lock().unwrap();
|
||||
guard.0 += 1;
|
||||
let occurences = guard.0;
|
||||
guard.1.call(move || {
|
||||
warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value");
|
||||
});
|
||||
None
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -614,46 +680,30 @@ impl LayerAccessStats {
|
||||
/// This indicates whether the layer has been used for some purpose that would motivate
|
||||
/// us to keep it on disk, such as for serving a getpage request.
|
||||
fn accessed(&self) -> bool {
|
||||
let locked = self.0.lock().unwrap();
|
||||
let inner = &locked.for_eviction_policy;
|
||||
|
||||
// Consider it accessed if the most recent access is more recent than
|
||||
// the most recent change in residence status.
|
||||
match (
|
||||
self.read_low_res_timestamp(Self::ATIME_SHIFT),
|
||||
self.read_low_res_timestamp(Self::RTIME_SHIFT),
|
||||
inner.last_accesses.recent(),
|
||||
inner.last_residence_changes.recent(),
|
||||
) {
|
||||
(None, _) => false,
|
||||
(Some(_), None) => true,
|
||||
(Some(a), Some(r)) => a >= r,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
|
||||
let value = match visibility {
|
||||
LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
|
||||
LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
|
||||
};
|
||||
|
||||
self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
|
||||
}
|
||||
|
||||
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
|
||||
let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
|
||||
match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
|
||||
1 => LayerVisibilityHint::Visible,
|
||||
0 => LayerVisibilityHint::Covered,
|
||||
_ => unreachable!(),
|
||||
(Some(a), Some(r)) => a.when >= r.timestamp,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
pub(crate) trait AsLayerDesc {
|
||||
pub trait AsLayerDesc {
|
||||
/// Get the layer descriptor.
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc;
|
||||
}
|
||||
|
||||
pub mod tests {
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::*;
|
||||
|
||||
|
||||
@@ -52,7 +52,7 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -265,7 +265,7 @@ impl DeltaLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load(ctx).await?;
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
|
||||
inner.dump(ctx).await
|
||||
}
|
||||
@@ -298,8 +298,12 @@ impl DeltaLayer {
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
///
|
||||
async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
|
||||
self.access_stats.record_access(ctx);
|
||||
async fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<&Arc<DeltaLayerInner>> {
|
||||
self.access_stats.record_access(access_kind, ctx);
|
||||
// Quick exit if already loaded
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner(ctx))
|
||||
@@ -352,7 +356,7 @@ impl DeltaLayer {
|
||||
summary.lsn_range,
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: Default::default(),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: OnceCell::new(),
|
||||
})
|
||||
}
|
||||
@@ -456,12 +460,7 @@ impl DeltaLayerWriterInner {
|
||||
will_init: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||
assert!(
|
||||
self.lsn_range.start <= lsn,
|
||||
"lsn_start={}, lsn={}",
|
||||
self.lsn_range.start,
|
||||
lsn
|
||||
);
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
// We don't want to use compression in delta layer creation
|
||||
let compression = ImageCompressionAlgorithm::Disabled;
|
||||
let (val, res) = self
|
||||
|
||||
@@ -49,6 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -227,7 +228,7 @@ impl ImageLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load(ctx).await?;
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
|
||||
inner.dump(ctx).await?;
|
||||
|
||||
@@ -254,8 +255,12 @@ impl ImageLayer {
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
///
|
||||
async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
|
||||
self.access_stats.record_access(ctx);
|
||||
async fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<&ImageLayerInner> {
|
||||
self.access_stats.record_access(access_kind, ctx);
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner(ctx))
|
||||
.await
|
||||
@@ -307,7 +312,7 @@ impl ImageLayer {
|
||||
metadata.len(),
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: summary.lsn,
|
||||
access_stats: Default::default(),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: OnceCell::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ use anyhow::{anyhow, ensure, Result};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::{BTreeMap, BinaryHeap, HashSet};
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
@@ -375,6 +375,15 @@ impl InMemoryLayer {
|
||||
let inner = self.inner.read().await;
|
||||
let reader = inner.file.block_cursor();
|
||||
|
||||
#[derive(Eq, PartialEq, Ord, PartialOrd)]
|
||||
struct BlockRead {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
block_offset: u64,
|
||||
}
|
||||
|
||||
let mut planned_block_reads = BinaryHeap::new();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
for (key, vec_map) in inner.index.range(range.start..range.end) {
|
||||
let lsn_range = match reconstruct_state.get_cached_lsn(key) {
|
||||
@@ -383,32 +392,49 @@ impl InMemoryLayer {
|
||||
};
|
||||
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
// TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
|
||||
let buf = reader.read_blob(*pos, &ctx).await;
|
||||
if let Err(e) = buf {
|
||||
reconstruct_state
|
||||
.on_key_error(*key, PageReconstructError::from(anyhow!(e)));
|
||||
break;
|
||||
}
|
||||
|
||||
let value = Value::des(&buf.unwrap());
|
||||
if let Err(e) = value {
|
||||
reconstruct_state
|
||||
.on_key_error(*key, PageReconstructError::from(anyhow!(e)));
|
||||
break;
|
||||
}
|
||||
|
||||
let key_situation =
|
||||
reconstruct_state.update_key(key, *entry_lsn, value.unwrap());
|
||||
if key_situation == ValueReconstructSituation::Complete {
|
||||
break;
|
||||
}
|
||||
planned_block_reads.push(BlockRead {
|
||||
key: *key,
|
||||
lsn: *entry_lsn,
|
||||
block_offset: *pos,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let keyspace_size = keyspace.total_raw_size();
|
||||
|
||||
let mut completed_keys = HashSet::new();
|
||||
while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
|
||||
let block_read = planned_block_reads.pop().unwrap();
|
||||
if completed_keys.contains(&block_read.key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
|
||||
let buf = reader.read_blob(block_read.block_offset, &ctx).await;
|
||||
if let Err(e) = buf {
|
||||
reconstruct_state
|
||||
.on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
|
||||
completed_keys.insert(block_read.key);
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = Value::des(&buf.unwrap());
|
||||
if let Err(e) = value {
|
||||
reconstruct_state
|
||||
.on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
|
||||
completed_keys.insert(block_read.key);
|
||||
continue;
|
||||
}
|
||||
|
||||
let key_situation =
|
||||
reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap());
|
||||
if key_situation == ValueReconstructSituation::Complete {
|
||||
completed_keys.insert(block_read.key);
|
||||
}
|
||||
}
|
||||
|
||||
reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::HistoricLayerInfo;
|
||||
use pageserver_api::models::{
|
||||
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
@@ -158,10 +160,13 @@ impl Layer {
|
||||
metadata.file_size,
|
||||
);
|
||||
|
||||
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
|
||||
|
||||
let owner = Layer(Arc::new(LayerInner::new(
|
||||
conf,
|
||||
timeline,
|
||||
local_path,
|
||||
access_stats,
|
||||
desc,
|
||||
None,
|
||||
metadata.generation,
|
||||
@@ -188,6 +193,8 @@ impl Layer {
|
||||
metadata.file_size,
|
||||
);
|
||||
|
||||
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
|
||||
|
||||
let mut resident = None;
|
||||
|
||||
let owner = Layer(Arc::new_cyclic(|owner| {
|
||||
@@ -202,6 +209,7 @@ impl Layer {
|
||||
conf,
|
||||
timeline,
|
||||
local_path,
|
||||
access_stats,
|
||||
desc,
|
||||
Some(inner),
|
||||
metadata.generation,
|
||||
@@ -237,6 +245,11 @@ impl Layer {
|
||||
version: 0,
|
||||
});
|
||||
resident = Some(inner.clone());
|
||||
let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
|
||||
access_stats.record_residence_event(
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
|
||||
let local_path = local_layer_path(
|
||||
conf,
|
||||
@@ -246,22 +259,16 @@ impl Layer {
|
||||
&timeline.generation,
|
||||
);
|
||||
|
||||
let layer = LayerInner::new(
|
||||
LayerInner::new(
|
||||
conf,
|
||||
timeline,
|
||||
local_path,
|
||||
access_stats,
|
||||
desc,
|
||||
Some(inner),
|
||||
timeline.generation,
|
||||
timeline.get_shard_index(),
|
||||
);
|
||||
|
||||
// Newly created layers are marked visible by default: the usual case is that they were created to be read.
|
||||
layer
|
||||
.access_stats
|
||||
.set_visibility(super::LayerVisibilityHint::Visible);
|
||||
|
||||
layer
|
||||
)
|
||||
}));
|
||||
|
||||
let downloaded = resident.expect("just initialized");
|
||||
@@ -325,7 +332,9 @@ impl Layer {
|
||||
use anyhow::ensure;
|
||||
|
||||
let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
|
||||
self.0.access_stats.record_access(ctx);
|
||||
self.0
|
||||
.access_stats
|
||||
.record_access(LayerAccessKind::GetValueReconstructData, ctx);
|
||||
|
||||
if self.layer_desc().is_delta {
|
||||
ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
|
||||
@@ -359,7 +368,9 @@ impl Layer {
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
|
||||
self.0.access_stats.record_access(ctx);
|
||||
self.0
|
||||
.access_stats
|
||||
.record_access(LayerAccessKind::GetValueReconstructData, ctx);
|
||||
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
|
||||
@@ -775,6 +786,7 @@ impl LayerInner {
|
||||
conf: &'static PageServerConf,
|
||||
timeline: &Arc<Timeline>,
|
||||
local_path: Utf8PathBuf,
|
||||
access_stats: LayerAccessStats,
|
||||
desc: PersistentLayerDesc,
|
||||
downloaded: Option<Arc<DownloadedLayer>>,
|
||||
generation: Generation,
|
||||
@@ -809,7 +821,7 @@ impl LayerInner {
|
||||
path: local_path,
|
||||
desc,
|
||||
timeline: Arc::downgrade(timeline),
|
||||
access_stats: Default::default(),
|
||||
access_stats,
|
||||
wanted_deleted: AtomicBool::new(false),
|
||||
inner,
|
||||
version: AtomicUsize::new(version),
|
||||
@@ -1164,7 +1176,10 @@ impl LayerInner {
|
||||
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
|
||||
}
|
||||
|
||||
self.access_stats.record_residence_event();
|
||||
self.access_stats.record_residence_event(
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::ResidenceChange,
|
||||
);
|
||||
|
||||
Ok(self.initialize_after_layer_is_on_disk(permit))
|
||||
}
|
||||
@@ -1283,7 +1298,7 @@ impl LayerInner {
|
||||
lsn_end: lsn_range.end,
|
||||
remote: !resident,
|
||||
access_stats,
|
||||
l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
|
||||
l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
|
||||
}
|
||||
} else {
|
||||
let lsn = self.desc.image_layer_lsn();
|
||||
@@ -1518,7 +1533,10 @@ impl LayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
self.access_stats.record_residence_event();
|
||||
self.access_stats.record_residence_event(
|
||||
LayerResidenceStatus::Evicted,
|
||||
LayerResidenceEventReason::ResidenceChange,
|
||||
);
|
||||
|
||||
self.status.as_ref().unwrap().send_replace(Status::Evicted);
|
||||
|
||||
@@ -1844,7 +1862,9 @@ impl ResidentLayer {
|
||||
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
|
||||
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
|
||||
// while it's being held.
|
||||
owner.access_stats.record_access(ctx);
|
||||
owner
|
||||
.access_stats
|
||||
.record_access(LayerAccessKind::KeyIter, ctx);
|
||||
|
||||
delta_layer::DeltaLayerInner::load_keys(d, ctx)
|
||||
.await
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use pageserver_api::key::CONTROLFILE_KEY;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::{
|
||||
@@ -9,7 +7,7 @@ use utils::{
|
||||
|
||||
use super::failpoints::{Failpoint, FailpointKind};
|
||||
use super::*;
|
||||
use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint};
|
||||
use crate::context::DownloadBehavior;
|
||||
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
|
||||
|
||||
/// Used in tests to advance a future to wanted await point, and not futher.
|
||||
@@ -828,9 +826,9 @@ async fn eviction_cancellation_on_drop() {
|
||||
#[test]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn layer_size() {
|
||||
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 8);
|
||||
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
|
||||
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
|
||||
assert_eq!(std::mem::size_of::<LayerInner>(), 312);
|
||||
assert_eq!(std::mem::size_of::<LayerInner>(), 2344);
|
||||
// it also has the utf8 path
|
||||
}
|
||||
|
||||
@@ -970,46 +968,3 @@ fn spawn_blocking_pool_helper_actually_works() {
|
||||
println!("joined");
|
||||
});
|
||||
}
|
||||
|
||||
/// Drop the low bits from a time, to emulate the precision loss in LayerAccessStats
|
||||
fn lowres_time(hires: SystemTime) -> SystemTime {
|
||||
let ts = hires.duration_since(UNIX_EPOCH).unwrap().as_secs();
|
||||
UNIX_EPOCH + Duration::from_secs(ts)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn access_stats() {
|
||||
let access_stats = LayerAccessStats::default();
|
||||
// Default is visible
|
||||
assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible);
|
||||
|
||||
access_stats.set_visibility(LayerVisibilityHint::Covered);
|
||||
assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
|
||||
access_stats.set_visibility(LayerVisibilityHint::Visible);
|
||||
assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible);
|
||||
|
||||
let rtime = UNIX_EPOCH + Duration::from_secs(2000000000);
|
||||
access_stats.record_residence_event_at(rtime);
|
||||
assert_eq!(access_stats.latest_activity(), lowres_time(rtime));
|
||||
|
||||
let atime = UNIX_EPOCH + Duration::from_secs(2100000000);
|
||||
access_stats.record_access_at(atime);
|
||||
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
|
||||
|
||||
// Setting visibility doesn't clobber access time
|
||||
access_stats.set_visibility(LayerVisibilityHint::Covered);
|
||||
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
|
||||
access_stats.set_visibility(LayerVisibilityHint::Visible);
|
||||
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn access_stats_2038() {
|
||||
// The access stats structure uses a timestamp representation that will run out
|
||||
// of bits in 2038. One year before that, this unit test will start failing.
|
||||
|
||||
let one_year_from_now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap()
|
||||
+ Duration::from_secs(3600 * 24 * 365);
|
||||
|
||||
assert!(one_year_from_now.as_secs() < (2 << 31));
|
||||
}
|
||||
|
||||
@@ -248,14 +248,6 @@ impl LayerName {
|
||||
Image(_) => "image",
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the key range encoded in the layer name.
|
||||
pub fn key_range(&self) -> &Range<Key> {
|
||||
match &self {
|
||||
LayerName::Image(layer) => &layer.key_range,
|
||||
LayerName::Delta(layer) => &layer.key_range,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for LayerName {
|
||||
|
||||
@@ -101,6 +101,7 @@ pub fn start_background_loops(
|
||||
Some(tenant_shard_id),
|
||||
None,
|
||||
&format!("compactor for tenant {tenant_shard_id}"),
|
||||
false,
|
||||
{
|
||||
let tenant = Arc::clone(tenant);
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
@@ -124,6 +125,7 @@ pub fn start_background_loops(
|
||||
Some(tenant_shard_id),
|
||||
None,
|
||||
&format!("garbage collector for tenant {tenant_shard_id}"),
|
||||
false,
|
||||
{
|
||||
let tenant = Arc::clone(tenant);
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
@@ -147,6 +149,7 @@ pub fn start_background_loops(
|
||||
Some(tenant_shard_id),
|
||||
None,
|
||||
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
|
||||
false,
|
||||
{
|
||||
let tenant = Arc::clone(tenant);
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
pub(crate) mod analysis;
|
||||
pub(crate) mod compaction;
|
||||
mod compaction;
|
||||
pub mod delete;
|
||||
pub(crate) mod detach_ancestor;
|
||||
mod eviction_task;
|
||||
@@ -460,7 +460,7 @@ pub(crate) struct GcInfo {
|
||||
/// Currently, this includes all points where child branches have
|
||||
/// been forked off from. In the future, could also include
|
||||
/// explicit user-defined snapshot points.
|
||||
pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>,
|
||||
pub(crate) retain_lsns: Vec<Lsn>,
|
||||
|
||||
/// The cutoff coordinates, which are combined by selecting the minimum.
|
||||
pub(crate) cutoffs: GcCutoffs,
|
||||
@@ -476,21 +476,12 @@ impl GcInfo {
|
||||
pub(crate) fn min_cutoff(&self) -> Lsn {
|
||||
self.cutoffs.select_min()
|
||||
}
|
||||
|
||||
pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) {
|
||||
self.retain_lsns.push((child_lsn, child_id));
|
||||
self.retain_lsns.sort_by_key(|i| i.0);
|
||||
}
|
||||
|
||||
pub(super) fn remove_child(&mut self, child_id: TimelineId) {
|
||||
self.retain_lsns.retain(|i| i.1 != child_id);
|
||||
}
|
||||
}
|
||||
|
||||
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
|
||||
/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
|
||||
/// between time-based and space-based retention for observability and consumption metrics purposes.
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct GcCutoffs {
|
||||
/// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
|
||||
/// history we must keep to retain a specified number of bytes of WAL.
|
||||
@@ -2316,11 +2307,6 @@ impl Timeline {
|
||||
)
|
||||
};
|
||||
|
||||
if let Some(ancestor) = &ancestor {
|
||||
let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
|
||||
ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn());
|
||||
}
|
||||
|
||||
Arc::new_cyclic(|myself| {
|
||||
let metrics = TimelineMetrics::new(
|
||||
&tenant_shard_id,
|
||||
@@ -2491,6 +2477,7 @@ impl Timeline {
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
"layer flush task",
|
||||
false,
|
||||
async move {
|
||||
let _guard = guard;
|
||||
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
|
||||
@@ -2835,6 +2822,7 @@ impl Timeline {
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
"initial size calculation",
|
||||
false,
|
||||
// NB: don't log errors here, task_mgr will do that.
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
@@ -3003,6 +2991,7 @@ impl Timeline {
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
"ondemand logical size calculation",
|
||||
false,
|
||||
async move {
|
||||
let res = self_clone
|
||||
.logical_size_calculation_task(lsn, cause, &ctx)
|
||||
@@ -3169,7 +3158,7 @@ impl Timeline {
|
||||
let guard = self.layers.read().await;
|
||||
|
||||
let resident = guard.likely_resident_layers().map(|layer| {
|
||||
let last_activity_ts = layer.access_stats().latest_activity();
|
||||
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||
|
||||
HeatMapLayer::new(
|
||||
layer.layer_desc().layer_name(),
|
||||
@@ -4767,18 +4756,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Timeline {
|
||||
fn drop(&mut self) {
|
||||
if let Some(ancestor) = &self.ancestor_timeline {
|
||||
// This lock should never be poisoned, but in case it is we do a .map() instead of
|
||||
// an unwrap(), to avoid panicking in a destructor and thereby aborting the process.
|
||||
if let Ok(mut gc_info) = ancestor.gc_info.write() {
|
||||
gc_info.remove_child(self.timeline_id)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Top-level failure to compact.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum CompactionError {
|
||||
@@ -4891,7 +4868,7 @@ impl Timeline {
|
||||
// for compact_level0_phase1 creating an L0, which does not happen in practice
|
||||
// because we have not implemented L0 => L0 compaction.
|
||||
duplicated_layers.insert(l.layer_desc().key());
|
||||
} else if LayerMap::is_l0(&l.layer_desc().key_range) {
|
||||
} else if LayerMap::is_l0(l.layer_desc()) {
|
||||
bail!("compaction generates a L0 layer file as output, which will cause infinite compaction.");
|
||||
} else {
|
||||
insert_layers.push(l.clone());
|
||||
@@ -5096,11 +5073,7 @@ impl Timeline {
|
||||
|
||||
let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
|
||||
let time_cutoff = gc_info.cutoffs.time;
|
||||
let retain_lsns = gc_info
|
||||
.retain_lsns
|
||||
.iter()
|
||||
.map(|(lsn, _child_id)| *lsn)
|
||||
.collect();
|
||||
let retain_lsns = gc_info.retain_lsns.clone();
|
||||
|
||||
// Gets the maximum LSN that holds the valid lease.
|
||||
//
|
||||
@@ -5462,6 +5435,7 @@ impl Timeline {
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
"download all remote layers task",
|
||||
false,
|
||||
async move {
|
||||
self_clone.download_all_remote_layers(request).await;
|
||||
let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap();
|
||||
@@ -5612,7 +5586,7 @@ impl Timeline {
|
||||
let file_size = layer.layer_desc().file_size;
|
||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||
|
||||
let last_activity_ts = layer.access_stats().latest_activity();
|
||||
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||
|
||||
EvictionCandidate {
|
||||
layer: layer.into(),
|
||||
|
||||
@@ -28,7 +28,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
|
||||
use crate::page_cache;
|
||||
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
|
||||
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
@@ -36,7 +36,7 @@ use crate::tenant::DeltaLayer;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::repository::Key;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -45,60 +45,6 @@ use pageserver_compaction::interface::*;
|
||||
|
||||
use super::CompactionError;
|
||||
|
||||
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
|
||||
const COMPACTION_DELTA_THRESHOLD: usize = 5;
|
||||
|
||||
/// The result of bottom-most compaction for a single key at each LSN.
|
||||
#[derive(Debug)]
|
||||
#[cfg_attr(test, derive(PartialEq))]
|
||||
pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>);
|
||||
|
||||
/// The result of bottom-most compaction.
|
||||
#[derive(Debug)]
|
||||
#[cfg_attr(test, derive(PartialEq))]
|
||||
pub(crate) struct KeyHistoryRetention {
|
||||
/// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN.
|
||||
pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>,
|
||||
/// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN.
|
||||
pub(crate) above_horizon: KeyLogAtLsn,
|
||||
}
|
||||
|
||||
impl KeyHistoryRetention {
|
||||
async fn pipe_to(
|
||||
self,
|
||||
key: Key,
|
||||
delta_writer: &mut Vec<(Key, Lsn, Value)>,
|
||||
image_writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut first_batch = true;
|
||||
for (_, KeyLogAtLsn(logs)) in self.below_horizon {
|
||||
if first_batch {
|
||||
if logs.len() == 1 && logs[0].1.is_image() {
|
||||
let Value::Image(img) = &logs[0].1 else {
|
||||
unreachable!()
|
||||
};
|
||||
image_writer.put_image(key, img.clone(), ctx).await?;
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
delta_writer.push((key, lsn, val));
|
||||
}
|
||||
}
|
||||
first_batch = false;
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
delta_writer.push((key, lsn, val));
|
||||
}
|
||||
}
|
||||
}
|
||||
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
|
||||
for (lsn, val) in above_horizon_logs {
|
||||
delta_writer.push((key, lsn, val));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// TODO: cancellation
|
||||
pub(crate) async fn compact_legacy(
|
||||
@@ -1043,188 +989,6 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns.
|
||||
///
|
||||
/// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon.
|
||||
/// For now, it requires the `accumulated_values` contains the full history of the key (i.e., the key with the lowest LSN is
|
||||
/// an image or a WAL not requiring a base image). This restriction will be removed once we implement gc-compaction on branch.
|
||||
///
|
||||
/// The function returns the deltas and the base image that need to be placed at each of the retain LSN. For example, we have:
|
||||
///
|
||||
/// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60
|
||||
/// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3
|
||||
///
|
||||
/// The function will produce:
|
||||
///
|
||||
/// ```plain
|
||||
/// 0x20(retain_lsn) -> img=AB@0x20 always produce a single image below the lowest retain LSN
|
||||
/// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40] two deltas since the last base image, keeping the deltas
|
||||
/// 0x50(horizon) -> deltas=[ABCDE@0x50] three deltas since the last base image, generate an image but put it in the delta
|
||||
/// above_horizon -> deltas=[+F@0x60] full history above the horizon
|
||||
/// ```
|
||||
///
|
||||
/// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
|
||||
pub(crate) async fn generate_key_retention(
|
||||
self: &Arc<Timeline>,
|
||||
key: Key,
|
||||
history: &[(Key, Lsn, Value)],
|
||||
horizon: Lsn,
|
||||
retain_lsn_below_horizon: &[Lsn],
|
||||
delta_threshold_cnt: usize,
|
||||
) -> anyhow::Result<KeyHistoryRetention> {
|
||||
// Pre-checks for the invariants
|
||||
if cfg!(debug_assertions) {
|
||||
for (log_key, _, _) in history {
|
||||
assert_eq!(log_key, &key, "mismatched key");
|
||||
}
|
||||
for i in 1..history.len() {
|
||||
assert!(history[i - 1].1 <= history[i].1, "unordered LSN");
|
||||
if history[i - 1].1 == history[i].1 {
|
||||
assert!(
|
||||
matches!(history[i - 1].2, Value::Image(_)),
|
||||
"unordered delta/image, or duplicated delta"
|
||||
);
|
||||
}
|
||||
}
|
||||
if let Value::WalRecord(rec) = &history[0].2 {
|
||||
assert!(rec.will_init(), "no base image");
|
||||
}
|
||||
for lsn in retain_lsn_below_horizon {
|
||||
assert!(lsn < &horizon, "retain lsn must be below horizon")
|
||||
}
|
||||
for i in 1..retain_lsn_below_horizon.len() {
|
||||
assert!(
|
||||
retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i],
|
||||
"unordered LSN"
|
||||
);
|
||||
}
|
||||
}
|
||||
// Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
|
||||
// and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
|
||||
let (mut split_history, lsn_split_points) = {
|
||||
let mut split_history = Vec::new();
|
||||
split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new);
|
||||
let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1);
|
||||
for lsn in retain_lsn_below_horizon {
|
||||
lsn_split_points.push(*lsn);
|
||||
}
|
||||
lsn_split_points.push(horizon);
|
||||
let mut current_idx = 0;
|
||||
for item @ (_, lsn, _) in history {
|
||||
while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] {
|
||||
current_idx += 1;
|
||||
}
|
||||
split_history[current_idx].push(item);
|
||||
}
|
||||
(split_history, lsn_split_points)
|
||||
};
|
||||
// Step 2: filter out duplicated records due to the k-merge of image/delta layers
|
||||
for split_for_lsn in &mut split_history {
|
||||
let mut prev_lsn = None;
|
||||
let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len());
|
||||
for record @ (_, lsn, _) in std::mem::take(split_for_lsn) {
|
||||
if let Some(prev_lsn) = &prev_lsn {
|
||||
if *prev_lsn == lsn {
|
||||
// The case that we have an LSN with both data from the delta layer and the image layer. As
|
||||
// `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
|
||||
// drop this delta and keep the image.
|
||||
//
|
||||
// For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
|
||||
// keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
|
||||
// dropped.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
prev_lsn = Some(lsn);
|
||||
new_split_for_lsn.push(record);
|
||||
}
|
||||
*split_for_lsn = new_split_for_lsn;
|
||||
}
|
||||
// Step 3: generate images when necessary
|
||||
let mut retention = Vec::with_capacity(split_history.len());
|
||||
let mut records_since_last_image = 0;
|
||||
let batch_cnt = split_history.len();
|
||||
assert!(
|
||||
batch_cnt >= 2,
|
||||
"should have at least below + above horizon batches"
|
||||
);
|
||||
let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
|
||||
for (i, split_for_lsn) in split_history.into_iter().enumerate() {
|
||||
records_since_last_image += split_for_lsn.len();
|
||||
let generate_image = if i == 0 {
|
||||
// We always generate images for the first batch (below horizon / lowest retain_lsn)
|
||||
true
|
||||
} else if i == batch_cnt - 1 {
|
||||
// Do not generate images for the last batch (above horizon)
|
||||
false
|
||||
} else if records_since_last_image >= delta_threshold_cnt {
|
||||
// Generate images when there are too many records
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone()));
|
||||
if let Some((_, _, val)) = replay_history.first() {
|
||||
assert!(val.will_init(), "invalid history, no base image");
|
||||
}
|
||||
// Only retain the items after the last image record
|
||||
for idx in (0..replay_history.len()).rev() {
|
||||
if replay_history[idx].2.will_init() {
|
||||
replay_history = replay_history[idx..].to_vec();
|
||||
break;
|
||||
}
|
||||
}
|
||||
if generate_image && records_since_last_image > 0 {
|
||||
records_since_last_image = 0;
|
||||
let history = std::mem::take(&mut replay_history);
|
||||
let mut img = None;
|
||||
let mut records = Vec::with_capacity(history.len());
|
||||
if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
|
||||
img = Some((*lsn, val.clone()));
|
||||
for (_, lsn, val) in history.into_iter().skip(1) {
|
||||
let Value::WalRecord(rec) = val else {
|
||||
panic!("invalid record")
|
||||
};
|
||||
records.push((lsn, rec));
|
||||
}
|
||||
} else {
|
||||
for (_, lsn, val) in history.into_iter() {
|
||||
let Value::WalRecord(rec) = val else {
|
||||
panic!("invalid record")
|
||||
};
|
||||
records.push((lsn, rec));
|
||||
}
|
||||
}
|
||||
records.reverse();
|
||||
let state = ValueReconstructState { img, records };
|
||||
let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range
|
||||
let img = self.reconstruct_value(key, request_lsn, state).await?;
|
||||
replay_history.push((key, request_lsn, Value::Image(img.clone())));
|
||||
retention.push(vec![(request_lsn, Value::Image(img))]);
|
||||
} else {
|
||||
retention.push(
|
||||
split_for_lsn
|
||||
.iter()
|
||||
.map(|(_, lsn, value)| (*lsn, value.clone()))
|
||||
.collect(),
|
||||
);
|
||||
}
|
||||
}
|
||||
let mut result = Vec::with_capacity(retention.len());
|
||||
assert_eq!(retention.len(), lsn_split_points.len() + 1);
|
||||
for (idx, logs) in retention.into_iter().enumerate() {
|
||||
if idx == lsn_split_points.len() {
|
||||
return Ok(KeyHistoryRetention {
|
||||
below_horizon: result,
|
||||
above_horizon: KeyLogAtLsn(logs),
|
||||
});
|
||||
} else {
|
||||
result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
|
||||
}
|
||||
}
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
/// An experimental compaction building block that combines compaction with garbage collection.
|
||||
///
|
||||
/// The current implementation picks all delta + image layers that are below or intersecting with
|
||||
@@ -1236,6 +1000,7 @@ impl Timeline {
|
||||
_cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
use crate::tenant::storage_layer::ValueReconstructState;
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
info!("running enhanced gc bottom-most compaction");
|
||||
@@ -1248,51 +1013,30 @@ impl Timeline {
|
||||
// The layer selection has the following properties:
|
||||
// 1. If a layer is in the selection, all layers below it are in the selection.
|
||||
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
|
||||
let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
|
||||
let (layer_selection, gc_cutoff) = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let mut retain_lsns_below_horizon = Vec::new();
|
||||
if !gc_info.retain_lsns.is_empty() || !gc_info.leases.is_empty() {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"enhanced legacy compaction currently does not support retain_lsns (branches)"
|
||||
)));
|
||||
}
|
||||
let gc_cutoff = gc_info.cutoffs.select_min();
|
||||
for (lsn, _timeline_id) in &gc_info.retain_lsns {
|
||||
if lsn < &gc_cutoff {
|
||||
retain_lsns_below_horizon.push(*lsn);
|
||||
}
|
||||
}
|
||||
for lsn in gc_info.leases.keys() {
|
||||
if lsn < &gc_cutoff {
|
||||
retain_lsns_below_horizon.push(*lsn);
|
||||
}
|
||||
}
|
||||
let mut selected_layers = Vec::new();
|
||||
// TODO: consider retain_lsns
|
||||
drop(gc_info);
|
||||
for desc in layers.iter_historic_layers() {
|
||||
if desc.get_lsn_range().start <= gc_cutoff {
|
||||
selected_layers.push(guard.get_from_desc(&desc));
|
||||
}
|
||||
}
|
||||
retain_lsns_below_horizon.sort();
|
||||
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
|
||||
(selected_layers, gc_cutoff)
|
||||
};
|
||||
let lowest_retain_lsn = retain_lsns_below_horizon
|
||||
.first()
|
||||
.copied()
|
||||
.unwrap_or(gc_cutoff);
|
||||
if cfg!(debug_assertions) {
|
||||
assert_eq!(
|
||||
lowest_retain_lsn,
|
||||
retain_lsns_below_horizon
|
||||
.iter()
|
||||
.min()
|
||||
.copied()
|
||||
.unwrap_or(gc_cutoff)
|
||||
);
|
||||
}
|
||||
info!(
|
||||
"picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
|
||||
"picked {} layers for compaction with gc_cutoff={}",
|
||||
layer_selection.len(),
|
||||
gc_cutoff,
|
||||
lowest_retain_lsn
|
||||
gc_cutoff
|
||||
);
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, collect the layer information to decide when to split the new delta layers.
|
||||
@@ -1328,13 +1072,61 @@ impl Timeline {
|
||||
let mut accumulated_values = Vec::new();
|
||||
let mut last_key: Option<Key> = None;
|
||||
|
||||
/// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
|
||||
async fn flush_accumulated_states(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
accumulated_values: &[(Key, Lsn, crate::repository::Value)],
|
||||
horizon: Lsn,
|
||||
) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
|
||||
let mut base_image = None;
|
||||
let mut keys_above_horizon = Vec::new();
|
||||
let mut delta_above_base_image = Vec::new();
|
||||
// We have a list of deltas/images. We want to create image layers while collect garbages.
|
||||
for (key, lsn, val) in accumulated_values.iter().rev() {
|
||||
if *lsn > horizon {
|
||||
if let Some((_, prev_lsn, _)) = keys_above_horizon.last_mut() {
|
||||
if *prev_lsn == *lsn {
|
||||
// The case that we have an LSN with both data from the delta layer and the image layer. As
|
||||
// `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
|
||||
// drop this delta and keep the image.
|
||||
//
|
||||
// For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
|
||||
// keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
|
||||
// dropped.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
keys_above_horizon.push((*key, *lsn, val.clone()));
|
||||
} else if *lsn <= horizon {
|
||||
match val {
|
||||
crate::repository::Value::Image(image) => {
|
||||
base_image = Some((*lsn, image.clone()));
|
||||
break;
|
||||
}
|
||||
crate::repository::Value::WalRecord(wal) => {
|
||||
delta_above_base_image.push((*lsn, wal.clone()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// do not reverse delta_above_base_image, reconstruct state expects reversely-ordered records
|
||||
keys_above_horizon.reverse();
|
||||
let state = ValueReconstructState {
|
||||
img: base_image,
|
||||
records: delta_above_base_image,
|
||||
};
|
||||
let img = tline.reconstruct_value(key, horizon, state).await?;
|
||||
Ok((keys_above_horizon, img))
|
||||
}
|
||||
|
||||
async fn flush_deltas(
|
||||
deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
|
||||
last_key: Key,
|
||||
delta_split_points: &[Key],
|
||||
current_delta_split_point: &mut usize,
|
||||
tline: &Arc<Timeline>,
|
||||
lowest_retain_lsn: Lsn,
|
||||
gc_cutoff: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<ResidentLayer>> {
|
||||
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
|
||||
@@ -1369,7 +1161,7 @@ impl Timeline {
|
||||
tline.timeline_id,
|
||||
tline.tenant_shard_id,
|
||||
deltas.first().unwrap().0,
|
||||
lowest_retain_lsn..end_lsn,
|
||||
gc_cutoff..end_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -1386,7 +1178,7 @@ impl Timeline {
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(Key::MIN..Key::MAX), // covers the full key range
|
||||
lowest_retain_lsn,
|
||||
gc_cutoff,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -1403,19 +1195,12 @@ impl Timeline {
|
||||
accumulated_values.push((key, lsn, val));
|
||||
} else {
|
||||
let last_key = last_key.as_mut().unwrap();
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
*last_key,
|
||||
&accumulated_values,
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
)
|
||||
.await?;
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx)
|
||||
.await?;
|
||||
image_layer_writer.put_image(*last_key, image, ctx).await?;
|
||||
delta_values.extend(deltas);
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
&mut delta_values,
|
||||
@@ -1423,7 +1208,7 @@ impl Timeline {
|
||||
&delta_split_points,
|
||||
&mut current_delta_split_point,
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
gc_cutoff,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
@@ -1436,19 +1221,11 @@ impl Timeline {
|
||||
|
||||
let last_key = last_key.expect("no keys produced during compaction");
|
||||
// TODO: move this part to the loop body
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
last_key,
|
||||
&accumulated_values,
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
)
|
||||
.await?;
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx)
|
||||
.await?;
|
||||
image_layer_writer.put_image(last_key, image, ctx).await?;
|
||||
delta_values.extend(deltas);
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
&mut delta_values,
|
||||
@@ -1456,7 +1233,7 @@ impl Timeline {
|
||||
&delta_split_points,
|
||||
&mut current_delta_split_point,
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
gc_cutoff,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
|
||||
@@ -148,14 +148,14 @@ async fn cleanup_remaining_timeline_fs_traces(
|
||||
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
|
||||
async fn remove_timeline_from_tenant(
|
||||
tenant: &Tenant,
|
||||
timeline: &Timeline,
|
||||
timeline_id: TimelineId,
|
||||
_: &DeletionGuard, // using it as a witness
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove the timeline from the map.
|
||||
let mut timelines = tenant.timelines.lock().unwrap();
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
|
||||
// We already deleted the layer files, so it's probably best to panic.
|
||||
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
|
||||
@@ -164,7 +164,7 @@ async fn remove_timeline_from_tenant(
|
||||
}
|
||||
|
||||
timelines
|
||||
.remove(&timeline.timeline_id)
|
||||
.remove(&timeline_id)
|
||||
.expect("timeline that we were deleting was concurrently removed from 'timelines' map");
|
||||
|
||||
drop(timelines);
|
||||
@@ -391,6 +391,7 @@ impl DeleteTimelineFlow {
|
||||
Some(tenant_shard_id),
|
||||
Some(timeline_id),
|
||||
"timeline_delete",
|
||||
false,
|
||||
async move {
|
||||
if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
|
||||
error!("Error: {err:#}");
|
||||
@@ -414,7 +415,7 @@ impl DeleteTimelineFlow {
|
||||
|
||||
pausable_failpoint!("in_progress_delete");
|
||||
|
||||
remove_timeline_from_tenant(tenant, timeline, &guard).await?;
|
||||
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
|
||||
@@ -65,6 +65,7 @@ impl Timeline {
|
||||
"layer eviction for {}/{}",
|
||||
self.tenant_shard_id, self.timeline_id
|
||||
),
|
||||
false,
|
||||
async move {
|
||||
tokio::select! {
|
||||
_ = self_clone.cancel.cancelled() => { return Ok(()); }
|
||||
@@ -225,7 +226,7 @@ impl Timeline {
|
||||
continue;
|
||||
}
|
||||
|
||||
let last_activity_ts = layer.access_stats().latest_activity();
|
||||
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
|
||||
@@ -255,14 +255,6 @@ impl LayerManager {
|
||||
new_layer.layer_desc().lsn_range
|
||||
);
|
||||
|
||||
// Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
|
||||
// be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
|
||||
// always marking rewritten layers as visible.
|
||||
new_layer
|
||||
.as_ref()
|
||||
.access_stats()
|
||||
.set_visibility(old_layer.access_stats().visibility());
|
||||
|
||||
// Safety: we may never rewrite the same file in-place. Callers are responsible
|
||||
// for ensuring that they only rewrite layers after something changes the path,
|
||||
// such as an increment in the generation number.
|
||||
|
||||
@@ -597,7 +597,7 @@ _PG_init(void)
|
||||
|
||||
pg_init_libpagestore();
|
||||
pg_init_walproposer();
|
||||
WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||
WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||
LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||
|
||||
InitLogicalReplicationMonitor();
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# neon extension
|
||||
comment = 'cloud storage for PostgreSQL'
|
||||
default_version = '1.4'
|
||||
default_version = '1.3'
|
||||
module_pathname = '$libdir/neon'
|
||||
relocatable = true
|
||||
trusted = true
|
||||
|
||||
@@ -220,8 +220,7 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
/* we'll poll immediately */
|
||||
state->rem_state = RS_CONNECTING_WRITE;
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
state->rem_state = RS_CONNECTING_READ;
|
||||
}
|
||||
|
||||
if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE)
|
||||
|
||||
@@ -7,12 +7,6 @@ AS 'MODULE_PATHNAME', 'test_consume_xids'
|
||||
LANGUAGE C STRICT
|
||||
PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION test_consume_oids(oid int)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'test_consume_oids'
|
||||
LANGUAGE C STRICT
|
||||
PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION test_consume_cpu(seconds int)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'test_consume_cpu'
|
||||
|
||||
@@ -35,7 +35,6 @@ PG_MODULE_MAGIC;
|
||||
extern void _PG_init(void);
|
||||
|
||||
PG_FUNCTION_INFO_V1(test_consume_xids);
|
||||
PG_FUNCTION_INFO_V1(test_consume_oids);
|
||||
PG_FUNCTION_INFO_V1(test_consume_cpu);
|
||||
PG_FUNCTION_INFO_V1(test_consume_memory);
|
||||
PG_FUNCTION_INFO_V1(test_release_memory);
|
||||
@@ -75,21 +74,6 @@ _PG_init(void)
|
||||
|
||||
#define neon_read_at_lsn neon_read_at_lsn_ptr
|
||||
|
||||
/*
|
||||
* test_consume_oids(int4), for rapidly consuming OIDs, to test wraparound.
|
||||
* Unlike test_consume_xids which is passed number of xids to be consumed,
|
||||
* this function is given the target Oid.
|
||||
*/
|
||||
Datum
|
||||
test_consume_oids(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int32 oid = PG_GETARG_INT32(0);
|
||||
|
||||
while (oid != GetNewObjectId());
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
/*
|
||||
* test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound.
|
||||
*/
|
||||
|
||||
@@ -717,10 +717,8 @@ mod tests {
|
||||
_ => panic!("wrong message"),
|
||||
}
|
||||
});
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
|
||||
EndpointRateLimiter::DEFAULT,
|
||||
64,
|
||||
));
|
||||
let endpoint_rate_limiter =
|
||||
Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
|
||||
|
||||
let _creds = auth_quirks(
|
||||
&mut ctx,
|
||||
@@ -769,10 +767,8 @@ mod tests {
|
||||
frontend::password_message(b"my-secret-password", &mut write).unwrap();
|
||||
client.write_all(&write).await.unwrap();
|
||||
});
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
|
||||
EndpointRateLimiter::DEFAULT,
|
||||
64,
|
||||
));
|
||||
let endpoint_rate_limiter =
|
||||
Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
|
||||
|
||||
let _creds = auth_quirks(
|
||||
&mut ctx,
|
||||
@@ -822,10 +818,8 @@ mod tests {
|
||||
client.write_all(&write).await.unwrap();
|
||||
});
|
||||
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
|
||||
EndpointRateLimiter::DEFAULT,
|
||||
64,
|
||||
));
|
||||
let endpoint_rate_limiter =
|
||||
Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
|
||||
|
||||
let creds = auth_quirks(
|
||||
&mut ctx,
|
||||
|
||||
@@ -22,9 +22,7 @@ use proxy::http;
|
||||
use proxy::http::health_server::AppMetrics;
|
||||
use proxy::metrics::Metrics;
|
||||
use proxy::rate_limiter::EndpointRateLimiter;
|
||||
use proxy::rate_limiter::LeakyBucketConfig;
|
||||
use proxy::rate_limiter::RateBucketInfo;
|
||||
use proxy::rate_limiter::WakeComputeRateLimiter;
|
||||
use proxy::redis::cancellation_publisher::RedisPublisherClient;
|
||||
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use proxy::redis::elasticache;
|
||||
@@ -178,9 +176,6 @@ struct ProxyCliArgs {
|
||||
/// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
|
||||
#[clap(long)]
|
||||
redis_notifications: Option<String>,
|
||||
/// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain".
|
||||
#[clap(long, default_value = "irsa")]
|
||||
redis_auth_type: String,
|
||||
/// redis host for streaming connections (might be different from the notifications host)
|
||||
#[clap(long)]
|
||||
redis_host: Option<String>,
|
||||
@@ -324,38 +319,24 @@ async fn main() -> anyhow::Result<()> {
|
||||
),
|
||||
aws_credentials_provider,
|
||||
));
|
||||
let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) {
|
||||
("plain", redis_url) => match redis_url {
|
||||
None => {
|
||||
bail!("plain auth requires redis_notifications to be set");
|
||||
}
|
||||
Some(url) => Some(
|
||||
ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()),
|
||||
let regional_redis_client = match (args.redis_host, args.redis_port) {
|
||||
(Some(host), Some(port)) => Some(
|
||||
ConnectionWithCredentialsProvider::new_with_credentials_provider(
|
||||
host,
|
||||
port,
|
||||
elasticache_credentials_provider.clone(),
|
||||
),
|
||||
},
|
||||
("irsa", _) => match (&args.redis_host, args.redis_port) {
|
||||
(Some(host), Some(port)) => Some(
|
||||
ConnectionWithCredentialsProvider::new_with_credentials_provider(
|
||||
host.to_string(),
|
||||
port,
|
||||
elasticache_credentials_provider.clone(),
|
||||
),
|
||||
),
|
||||
(None, None) => {
|
||||
warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client");
|
||||
None
|
||||
}
|
||||
_ => {
|
||||
bail!("redis-host and redis-port must be specified together");
|
||||
}
|
||||
},
|
||||
),
|
||||
(None, None) => {
|
||||
warn!("Redis events from console are disabled");
|
||||
None
|
||||
}
|
||||
_ => {
|
||||
bail!("unknown auth type given");
|
||||
bail!("redis-host and redis-port must be specified together");
|
||||
}
|
||||
};
|
||||
|
||||
let redis_notifications_client = if let Some(url) = args.redis_notifications {
|
||||
Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()))
|
||||
Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
|
||||
} else {
|
||||
regional_redis_client.clone()
|
||||
};
|
||||
@@ -376,7 +357,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let cancel_map = CancelMap::default();
|
||||
|
||||
let redis_publisher = match &redis_notifications_client {
|
||||
let redis_publisher = match ®ional_redis_client {
|
||||
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
|
||||
redis_publisher.clone(),
|
||||
args.region.clone(),
|
||||
@@ -392,24 +373,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
proxy::metrics::CancellationSource::FromClient,
|
||||
));
|
||||
|
||||
// bit of a hack - find the min rps and max rps supported and turn it into
|
||||
// leaky bucket config instead
|
||||
let max = args
|
||||
.endpoint_rps_limit
|
||||
.iter()
|
||||
.map(|x| x.rps())
|
||||
.max_by(f64::total_cmp)
|
||||
.unwrap_or(EndpointRateLimiter::DEFAULT.max);
|
||||
let rps = args
|
||||
.endpoint_rps_limit
|
||||
.iter()
|
||||
.map(|x| x.rps())
|
||||
.min_by(f64::total_cmp)
|
||||
.unwrap_or(EndpointRateLimiter::DEFAULT.rps);
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
|
||||
LeakyBucketConfig { rps, max },
|
||||
64,
|
||||
));
|
||||
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
|
||||
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
|
||||
|
||||
// client facing tasks. these will exit on error or on cancellation
|
||||
// cancellation returns Ok(())
|
||||
@@ -611,7 +577,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
|
||||
RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
|
||||
let wake_compute_endpoint_rate_limiter =
|
||||
Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
|
||||
Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit));
|
||||
let api = console::provider::neon::Api::new(
|
||||
endpoint,
|
||||
caches,
|
||||
|
||||
@@ -103,12 +103,8 @@ impl ConnCfg {
|
||||
|
||||
/// Reuse password or auth keys from the other config.
|
||||
pub fn reuse_password(&mut self, other: Self) {
|
||||
if let Some(password) = other.get_password() {
|
||||
self.password(password);
|
||||
}
|
||||
|
||||
if let Some(keys) = other.get_auth_keys() {
|
||||
self.auth_keys(keys);
|
||||
if let Some(password) = other.get_auth() {
|
||||
self.auth(password);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,48 +120,64 @@ impl ConnCfg {
|
||||
|
||||
/// Apply startup message params to the connection config.
|
||||
pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
|
||||
// Only set `user` if it's not present in the config.
|
||||
// Link auth flow takes username from the console's response.
|
||||
if let (None, Some(user)) = (self.get_user(), params.get("user")) {
|
||||
self.user(user);
|
||||
}
|
||||
|
||||
// Only set `dbname` if it's not present in the config.
|
||||
// Link auth flow takes dbname from the console's response.
|
||||
if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
|
||||
self.dbname(dbname);
|
||||
}
|
||||
|
||||
// Don't add `options` if they were only used for specifying a project.
|
||||
// Connection pools don't support `options`, because they affect backend startup.
|
||||
if let Some(options) = filtered_options(params) {
|
||||
self.options(&options);
|
||||
}
|
||||
|
||||
if let Some(app_name) = params.get("application_name") {
|
||||
self.application_name(app_name);
|
||||
}
|
||||
|
||||
// TODO: This is especially ugly...
|
||||
if let Some(replication) = params.get("replication") {
|
||||
use tokio_postgres::config::ReplicationMode;
|
||||
match replication {
|
||||
"true" | "on" | "yes" | "1" => {
|
||||
self.replication_mode(ReplicationMode::Physical);
|
||||
let mut client_encoding = false;
|
||||
for (k, v) in params.iter() {
|
||||
match k {
|
||||
"user" => {
|
||||
// Only set `user` if it's not present in the config.
|
||||
// Link auth flow takes username from the console's response.
|
||||
if self.get_user().is_none() {
|
||||
self.user(v);
|
||||
}
|
||||
}
|
||||
"database" => {
|
||||
self.replication_mode(ReplicationMode::Logical);
|
||||
// Only set `dbname` if it's not present in the config.
|
||||
// Link auth flow takes dbname from the console's response.
|
||||
if self.get_dbname().is_none() {
|
||||
self.dbname(v);
|
||||
}
|
||||
}
|
||||
"options" => {
|
||||
// Don't add `options` if they were only used for specifying a project.
|
||||
// Connection pools don't support `options`, because they affect backend startup.
|
||||
if let Some(options) = filtered_options(v) {
|
||||
self.options(&options);
|
||||
}
|
||||
}
|
||||
|
||||
// the special ones in tokio-postgres that we don't want being set by the user
|
||||
"dbname" => {}
|
||||
"password" => {}
|
||||
"sslmode" => {}
|
||||
"host" => {}
|
||||
"port" => {}
|
||||
"connect_timeout" => {}
|
||||
"keepalives" => {}
|
||||
"keepalives_idle" => {}
|
||||
"keepalives_interval" => {}
|
||||
"keepalives_retries" => {}
|
||||
"target_session_attrs" => {}
|
||||
"channel_binding" => {}
|
||||
"max_backend_message_size" => {}
|
||||
|
||||
"client_encoding" => {
|
||||
client_encoding = true;
|
||||
// only error should be from bad null bytes,
|
||||
// but we've already checked for those.
|
||||
_ = self.param("client_encoding", v);
|
||||
}
|
||||
|
||||
_ => {
|
||||
// only error should be from bad null bytes,
|
||||
// but we've already checked for those.
|
||||
_ = self.param(k, v);
|
||||
}
|
||||
_other => {}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: extend the list of the forwarded startup parameters.
|
||||
// Currently, tokio-postgres doesn't allow us to pass
|
||||
// arbitrary parameters, but the ones above are a good start.
|
||||
//
|
||||
// This and the reverse params problem can be better addressed
|
||||
// in a bespoke connection machinery (a new library for that sake).
|
||||
if !client_encoding {
|
||||
// for compatibility since we removed it from tokio-postgres
|
||||
self.param("client_encoding", "UTF8").unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -338,10 +350,9 @@ impl ConnCfg {
|
||||
}
|
||||
|
||||
/// Retrieve `options` from a startup message, dropping all proxy-secific flags.
|
||||
fn filtered_options(params: &StartupMessageParams) -> Option<String> {
|
||||
fn filtered_options(options: &str) -> Option<String> {
|
||||
#[allow(unstable_name_collisions)]
|
||||
let options: String = params
|
||||
.options_raw()?
|
||||
let options: String = StartupMessageParams::parse_options_raw(options)
|
||||
.filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
|
||||
.intersperse(" ") // TODO: use impl from std once it's stabilized
|
||||
.collect();
|
||||
@@ -413,27 +424,23 @@ mod tests {
|
||||
#[test]
|
||||
fn test_filtered_options() {
|
||||
// Empty options is unlikely to be useful anyway.
|
||||
let params = StartupMessageParams::new([("options", "")]);
|
||||
assert_eq!(filtered_options(¶ms), None);
|
||||
assert_eq!(filtered_options(""), None);
|
||||
|
||||
// It's likely that clients will only use options to specify endpoint/project.
|
||||
let params = StartupMessageParams::new([("options", "project=foo")]);
|
||||
assert_eq!(filtered_options(¶ms), None);
|
||||
let params = "project=foo";
|
||||
assert_eq!(filtered_options(params), None);
|
||||
|
||||
// Same, because unescaped whitespaces are no-op.
|
||||
let params = StartupMessageParams::new([("options", " project=foo ")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), None);
|
||||
let params = " project=foo ";
|
||||
assert_eq!(filtered_options(params), None);
|
||||
|
||||
let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ "));
|
||||
let params = r"\ project=foo \ ";
|
||||
assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ "));
|
||||
|
||||
let params = StartupMessageParams::new([("options", "project = foo")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo"));
|
||||
let params = "project = foo";
|
||||
assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
|
||||
|
||||
let params = StartupMessageParams::new([(
|
||||
"options",
|
||||
"project = foo neon_endpoint_type:read_write neon_lsn:0/2",
|
||||
)]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo"));
|
||||
let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2";
|
||||
assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::{
|
||||
console::messages::{ColdStartInfo, Reason},
|
||||
http,
|
||||
metrics::{CacheOutcome, Metrics},
|
||||
rate_limiter::WakeComputeRateLimiter,
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
scram, EndpointCacheKey,
|
||||
};
|
||||
use crate::{cache::Cached, context::RequestMonitoring};
|
||||
@@ -26,7 +26,7 @@ pub struct Api {
|
||||
endpoint: http::Endpoint,
|
||||
pub caches: &'static ApiCaches,
|
||||
pub locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
pub wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
|
||||
pub wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
jwt: String,
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ impl Api {
|
||||
endpoint: http::Endpoint,
|
||||
caches: &'static ApiCaches,
|
||||
locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
|
||||
wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> Self {
|
||||
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
|
||||
Ok(v) => v,
|
||||
|
||||
@@ -3,8 +3,4 @@ mod limiter;
|
||||
pub use limit_algorithm::{
|
||||
aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
|
||||
};
|
||||
pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
|
||||
mod leaky_bucket;
|
||||
pub use leaky_bucket::{
|
||||
EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState,
|
||||
};
|
||||
pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo};
|
||||
|
||||
@@ -1,171 +0,0 @@
|
||||
use std::{
|
||||
hash::Hash,
|
||||
sync::atomic::{AtomicUsize, Ordering},
|
||||
};
|
||||
|
||||
use ahash::RandomState;
|
||||
use dashmap::DashMap;
|
||||
use rand::{thread_rng, Rng};
|
||||
use tokio::time::Instant;
|
||||
use tracing::info;
|
||||
|
||||
use crate::intern::EndpointIdInt;
|
||||
|
||||
// Simple per-endpoint rate limiter.
|
||||
pub type EndpointRateLimiter = LeakyBucketRateLimiter<EndpointIdInt>;
|
||||
|
||||
pub struct LeakyBucketRateLimiter<Key> {
|
||||
map: DashMap<Key, LeakyBucketState, RandomState>,
|
||||
config: LeakyBucketConfig,
|
||||
access_count: AtomicUsize,
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
|
||||
pub const DEFAULT: LeakyBucketConfig = LeakyBucketConfig {
|
||||
rps: 600.0,
|
||||
max: 1500.0,
|
||||
};
|
||||
|
||||
pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self {
|
||||
Self {
|
||||
map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards),
|
||||
config,
|
||||
access_count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check that number of connections to the endpoint is below `max_rps` rps.
|
||||
pub fn check(&self, key: K, n: u32) -> bool {
|
||||
let now = Instant::now();
|
||||
|
||||
if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 {
|
||||
self.do_gc(now);
|
||||
}
|
||||
|
||||
let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState {
|
||||
time: now,
|
||||
filled: 0.0,
|
||||
});
|
||||
|
||||
entry.check(&self.config, now, n as f64)
|
||||
}
|
||||
|
||||
fn do_gc(&self, now: Instant) {
|
||||
info!(
|
||||
"cleaning up bucket rate limiter, current size = {}",
|
||||
self.map.len()
|
||||
);
|
||||
let n = self.map.shards().len();
|
||||
let shard = thread_rng().gen_range(0..n);
|
||||
self.map.shards()[shard]
|
||||
.write()
|
||||
.retain(|_, value| !value.get_mut().update(&self.config, now));
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LeakyBucketConfig {
|
||||
pub rps: f64,
|
||||
pub max: f64,
|
||||
}
|
||||
|
||||
pub struct LeakyBucketState {
|
||||
filled: f64,
|
||||
time: Instant,
|
||||
}
|
||||
|
||||
impl LeakyBucketConfig {
|
||||
pub fn new(rps: f64, max: f64) -> Self {
|
||||
assert!(rps > 0.0, "rps must be positive");
|
||||
assert!(max > 0.0, "max must be positive");
|
||||
Self { rps, max }
|
||||
}
|
||||
}
|
||||
|
||||
impl LeakyBucketState {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
filled: 0.0,
|
||||
time: Instant::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// updates the timer and returns true if the bucket is empty
|
||||
fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool {
|
||||
let drain = now.duration_since(self.time);
|
||||
let drain = drain.as_secs_f64() * info.rps;
|
||||
|
||||
self.filled = (self.filled - drain).clamp(0.0, info.max);
|
||||
self.time = now;
|
||||
|
||||
self.filled == 0.0
|
||||
}
|
||||
|
||||
pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
|
||||
self.update(info, now);
|
||||
|
||||
if self.filled + n > info.max {
|
||||
return false;
|
||||
}
|
||||
self.filled += n;
|
||||
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LeakyBucketState {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio::time::Instant;
|
||||
|
||||
use super::{LeakyBucketConfig, LeakyBucketState};
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn check() {
|
||||
let info = LeakyBucketConfig::new(500.0, 2000.0);
|
||||
let mut bucket = LeakyBucketState::new();
|
||||
|
||||
// should work for 2000 requests this second
|
||||
for _ in 0..2000 {
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
}
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
assert_eq!(bucket.filled, 2000.0);
|
||||
|
||||
// in 1ms we should drain 0.5 tokens.
|
||||
// make sure we don't lose any tokens
|
||||
tokio::time::advance(Duration::from_millis(1)).await;
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
tokio::time::advance(Duration::from_millis(1)).await;
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
|
||||
// in 10ms we should drain 5 tokens
|
||||
tokio::time::advance(Duration::from_millis(10)).await;
|
||||
for _ in 0..5 {
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
}
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
|
||||
// in 10s we should drain 5000 tokens
|
||||
// but cap is only 2000
|
||||
tokio::time::advance(Duration::from_secs(10)).await;
|
||||
for _ in 0..2000 {
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
}
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
|
||||
// should sustain 500rps
|
||||
for _ in 0..2000 {
|
||||
tokio::time::advance(Duration::from_millis(10)).await;
|
||||
for _ in 0..5 {
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -61,7 +61,7 @@ impl GlobalRateLimiter {
|
||||
// Purposefully ignore user name and database name as clients can reconnect
|
||||
// with different names, so we'll end up sending some http requests to
|
||||
// the control plane.
|
||||
pub type WakeComputeRateLimiter = BucketRateLimiter<EndpointIdInt, StdRng, RandomState>;
|
||||
pub type EndpointRateLimiter = BucketRateLimiter<EndpointIdInt, StdRng, RandomState>;
|
||||
|
||||
pub struct BucketRateLimiter<Key, Rand = StdRng, Hasher = RandomState> {
|
||||
map: DashMap<Key, Vec<RateBucket>, Hasher>,
|
||||
@@ -103,7 +103,7 @@ pub struct RateBucketInfo {
|
||||
|
||||
impl std::fmt::Display for RateBucketInfo {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let rps = self.rps().floor() as u64;
|
||||
let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64;
|
||||
write!(f, "{rps}@{}", humantime::format_duration(self.interval))
|
||||
}
|
||||
}
|
||||
@@ -140,10 +140,6 @@ impl RateBucketInfo {
|
||||
Self::new(200, Duration::from_secs(600)),
|
||||
];
|
||||
|
||||
pub fn rps(&self) -> f64 {
|
||||
(self.max_rpi as f64) / self.interval.as_secs_f64()
|
||||
}
|
||||
|
||||
pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
|
||||
info.sort_unstable_by_key(|info| info.interval);
|
||||
let invalid = info
|
||||
@@ -249,7 +245,7 @@ mod tests {
|
||||
use rustc_hash::FxHasher;
|
||||
use tokio::time;
|
||||
|
||||
use super::{BucketRateLimiter, WakeComputeRateLimiter};
|
||||
use super::{BucketRateLimiter, EndpointRateLimiter};
|
||||
use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId};
|
||||
|
||||
#[test]
|
||||
@@ -297,7 +293,7 @@ mod tests {
|
||||
.map(|s| s.parse().unwrap())
|
||||
.collect();
|
||||
RateBucketInfo::validate(&mut rates).unwrap();
|
||||
let limiter = WakeComputeRateLimiter::new(rates);
|
||||
let limiter = EndpointRateLimiter::new(rates);
|
||||
|
||||
let endpoint = EndpointId::from("ep-my-endpoint-1234");
|
||||
let endpoint = EndpointIdInt::from(endpoint);
|
||||
|
||||
@@ -236,6 +236,10 @@ impl ConnectMechanism for TokioMechanism {
|
||||
.dbname(&self.conn_info.dbname)
|
||||
.connect_timeout(timeout);
|
||||
|
||||
config
|
||||
.param("client_encoding", "UTF8")
|
||||
.expect("client encoding UTF8 is always valid");
|
||||
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
|
||||
let res = config.connect(tokio_postgres::NoTls).await;
|
||||
drop(pause);
|
||||
|
||||
@@ -203,6 +203,7 @@ fn get_conn_info(
|
||||
options = Some(NeonOptions::parse_options_raw(&value));
|
||||
}
|
||||
}
|
||||
ctx.set_db_options(params.freeze());
|
||||
|
||||
let user_info = ComputeUserInfo {
|
||||
endpoint,
|
||||
|
||||
@@ -828,8 +828,6 @@ impl Service {
|
||||
);
|
||||
}
|
||||
Err(err) => {
|
||||
// Transition to active involves reconciling: if a node responds to a heartbeat then
|
||||
// becomes unavailable again, we may get an error here.
|
||||
tracing::error!(
|
||||
"Failed to update node {} after heartbeat round: {}",
|
||||
node_id,
|
||||
|
||||
@@ -49,5 +49,6 @@ tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
clap.workspace = true
|
||||
tracing-appender = "0.2"
|
||||
histogram = "0.7"
|
||||
|
||||
futures.workspace = true
|
||||
|
||||
@@ -45,11 +45,7 @@ processing by the `purge-garbage` subcommand.
|
||||
|
||||
Example:
|
||||
|
||||
`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=[client_key] CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
|
||||
|
||||
Note that `CLOUD_ADMIN_API_TOKEN` can be obtained from https://console-stage.neon.build/app/settings/api-keys (for staging) or https://console.neon.tech/app/settings/api-keys for production. This is not the control plane admin JWT key. The env var name is confusing. Though anyone can generate that API key, you still need admin permission in order to access all projects in the region.
|
||||
|
||||
And note that `CLOUD_ADMIN_API_URL` should include the region in the admin URL due to the control plane / console split. For example, `https://console-stage.neon.build/regions/aws-us-east-2/api/v1/admin` for the staging us-east-2 region.
|
||||
`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
|
||||
|
||||
#### `purge-garbage`
|
||||
|
||||
@@ -65,7 +61,7 @@ to pass them on the command line
|
||||
|
||||
Example:
|
||||
|
||||
`env AWS_PROFILE=dev cargo run --release -- purge-garbage --input-path=eu-west-1-garbage.json`
|
||||
`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`
|
||||
|
||||
Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally
|
||||
not provided inline in the example above to avoid accidents. Without the `--delete` flag
|
||||
|
||||
@@ -2,7 +2,6 @@ use std::collections::{HashMap, HashSet};
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::Client;
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use tracing::{error, info, warn};
|
||||
@@ -13,7 +12,7 @@ use crate::cloud_admin_api::BranchData;
|
||||
use crate::metadata_stream::stream_listing;
|
||||
use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
|
||||
use futures_util::StreamExt;
|
||||
use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
|
||||
use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use remote_storage::RemotePath;
|
||||
@@ -42,9 +41,7 @@ impl TimelineAnalysis {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
s3_client: &Client,
|
||||
target: &RootTarget,
|
||||
pub(crate) fn branch_cleanup_and_check_errors(
|
||||
id: &TenantShardTimelineId,
|
||||
tenant_objects: &mut TenantObjectListing,
|
||||
s3_active_branch: Option<&BranchData>,
|
||||
@@ -88,17 +85,15 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
}
|
||||
|
||||
if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
|
||||
info!(
|
||||
result.warnings.push(format!(
|
||||
"index_part.json version is not latest: {}",
|
||||
index_part.version()
|
||||
);
|
||||
))
|
||||
}
|
||||
|
||||
if index_part.metadata.disk_consistent_lsn()
|
||||
!= index_part.duplicated_disk_consistent_lsn()
|
||||
{
|
||||
// Tech debt: let's get rid of one of these, they are redundant
|
||||
// https://github.com/neondatabase/neon/issues/8343
|
||||
result.errors.push(format!(
|
||||
"Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})",
|
||||
index_part.metadata.disk_consistent_lsn(),
|
||||
@@ -107,16 +102,8 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
}
|
||||
|
||||
if index_part.layer_metadata.is_empty() {
|
||||
if index_part.metadata.ancestor_timeline().is_none() {
|
||||
// The initial timeline with no ancestor should ALWAYS have layers.
|
||||
result.errors.push(
|
||||
"index_part.json has no layers (ancestor_timeline=None)"
|
||||
.to_string(),
|
||||
);
|
||||
} else {
|
||||
// Not an error, can happen for branches with zero writes, but notice that
|
||||
info!("index_part.json has no layers (ancestor_timeline exists)");
|
||||
}
|
||||
// not an error, can happen for branches with zero writes, but notice that
|
||||
info!("index_part.json has no layers");
|
||||
}
|
||||
|
||||
for (layer, metadata) in index_part.layer_metadata {
|
||||
@@ -127,41 +114,16 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
}
|
||||
|
||||
if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) {
|
||||
let path = remote_layer_path(
|
||||
&id.tenant_shard_id.tenant_id,
|
||||
&id.timeline_id,
|
||||
metadata.shard,
|
||||
&layer,
|
||||
metadata.generation,
|
||||
);
|
||||
|
||||
// HEAD request used here to address a race condition when an index was uploaded concurrently
|
||||
// with our scan. We check if the object is uploaded to S3 after taking the listing snapshot.
|
||||
let response = s3_client
|
||||
.head_object()
|
||||
.bucket(target.bucket_name())
|
||||
.key(path.get_path().as_str())
|
||||
.send()
|
||||
.await;
|
||||
|
||||
if response.is_err() {
|
||||
// Object is not present.
|
||||
let is_l0 = LayerMap::is_l0(layer.key_range());
|
||||
|
||||
let msg = format!(
|
||||
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
|
||||
layer,
|
||||
metadata.generation.get_suffix(),
|
||||
metadata.shard,
|
||||
is_l0,
|
||||
);
|
||||
|
||||
if is_l0 {
|
||||
result.warnings.push(msg);
|
||||
} else {
|
||||
result.errors.push(msg);
|
||||
}
|
||||
}
|
||||
// FIXME: this will emit false positives if an index was
|
||||
// uploaded concurrently with our scan. To make this check
|
||||
// correct, we need to try sending a HEAD request for the
|
||||
// layer we think is missing.
|
||||
result.errors.push(format!(
|
||||
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage",
|
||||
layer,
|
||||
metadata.generation.get_suffix(),
|
||||
metadata.shard
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -341,9 +303,6 @@ pub(crate) async fn list_timeline_blobs(
|
||||
tracing::debug!("initdb archive {key}");
|
||||
initdb_archive = true;
|
||||
}
|
||||
Some("initdb-preserved.tar.zst") => {
|
||||
tracing::info!("initdb archive preserved {key}");
|
||||
}
|
||||
Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
|
||||
Ok((new_layer, gen)) => {
|
||||
tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen);
|
||||
|
||||
@@ -8,19 +8,21 @@ use std::{
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::{
|
||||
types::{Delete, ObjectIdentifier},
|
||||
Client,
|
||||
};
|
||||
use futures_util::TryStreamExt;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::{
|
||||
cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
|
||||
init_remote, init_remote_generic,
|
||||
metadata_stream::{stream_tenant_timelines, stream_tenants},
|
||||
BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
|
||||
init_remote,
|
||||
metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants},
|
||||
BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth,
|
||||
};
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -322,45 +324,41 @@ impl std::fmt::Display for PurgeMode {
|
||||
}
|
||||
|
||||
pub async fn get_tenant_objects(
|
||||
s3_client: &GenericRemoteStorage,
|
||||
s3_client: &Arc<Client>,
|
||||
target: RootTarget,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> anyhow::Result<Vec<RemotePath>> {
|
||||
) -> anyhow::Result<Vec<ObjectIdentifier>> {
|
||||
tracing::debug!("Listing objects in tenant {tenant_shard_id}");
|
||||
let tenant_root = super::remote_tenant_path(&tenant_shard_id);
|
||||
|
||||
// TODO: apply extra validation based on object modification time. Don't purge
|
||||
// tenants where any timeline's index_part.json has been touched recently.
|
||||
|
||||
let list = s3_client
|
||||
.list(
|
||||
Some(&tenant_root),
|
||||
ListingMode::NoDelimiter,
|
||||
None,
|
||||
&CancellationToken::new(),
|
||||
)
|
||||
.await?;
|
||||
Ok(list.keys)
|
||||
let mut tenant_root = target.tenant_root(&tenant_shard_id);
|
||||
|
||||
// Remove delimiter, so that object listing lists all keys in the prefix and not just
|
||||
// common prefixes.
|
||||
tenant_root.delimiter = String::new();
|
||||
|
||||
let key_stream = stream_listing(s3_client, &tenant_root);
|
||||
key_stream.try_collect().await
|
||||
}
|
||||
|
||||
pub async fn get_timeline_objects(
|
||||
s3_client: &GenericRemoteStorage,
|
||||
s3_client: &Arc<Client>,
|
||||
target: RootTarget,
|
||||
ttid: TenantShardTimelineId,
|
||||
) -> anyhow::Result<Vec<RemotePath>> {
|
||||
) -> anyhow::Result<Vec<ObjectIdentifier>> {
|
||||
tracing::debug!("Listing objects in timeline {ttid}");
|
||||
let timeline_root = super::remote_timeline_path_id(&ttid);
|
||||
let mut timeline_root = target.timeline_root(&ttid);
|
||||
|
||||
// TODO: apply extra validation based on object modification time. Don't purge
|
||||
// timelines whose index_part.json has been touched recently.
|
||||
|
||||
let list = s3_client
|
||||
.list(
|
||||
Some(&timeline_root),
|
||||
ListingMode::NoDelimiter,
|
||||
None,
|
||||
&CancellationToken::new(),
|
||||
)
|
||||
.await?;
|
||||
Ok(list.keys)
|
||||
// Remove delimiter, so that object listing lists all keys in the prefix and not just
|
||||
// common prefixes.
|
||||
timeline_root.delimiter = String::new();
|
||||
let key_stream = stream_listing(s3_client, &timeline_root);
|
||||
|
||||
key_stream.try_collect().await
|
||||
}
|
||||
|
||||
const MAX_KEYS_PER_DELETE: usize = 1000;
|
||||
@@ -371,17 +369,16 @@ const MAX_KEYS_PER_DELETE: usize = 1000;
|
||||
/// MAX_KEYS_PER_DELETE keys are left.
|
||||
/// `num_deleted` returns number of deleted keys.
|
||||
async fn do_delete(
|
||||
remote_client: &GenericRemoteStorage,
|
||||
keys: &mut Vec<RemotePath>,
|
||||
s3_client: &Arc<Client>,
|
||||
bucket_name: &str,
|
||||
keys: &mut Vec<ObjectIdentifier>,
|
||||
dry_run: bool,
|
||||
drain: bool,
|
||||
progress_tracker: &mut DeletionProgressTracker,
|
||||
) -> anyhow::Result<()> {
|
||||
let cancel = CancellationToken::new();
|
||||
while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
|
||||
let request_keys =
|
||||
keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
|
||||
|
||||
let num_deleted = request_keys.len();
|
||||
if dry_run {
|
||||
tracing::info!("Dry-run deletion of objects: ");
|
||||
@@ -389,10 +386,14 @@ async fn do_delete(
|
||||
tracing::info!(" {k:?}");
|
||||
}
|
||||
} else {
|
||||
remote_client
|
||||
.delete_objects(&request_keys, &cancel)
|
||||
let delete_request = s3_client
|
||||
.delete_objects()
|
||||
.bucket(bucket_name)
|
||||
.delete(Delete::builder().set_objects(Some(request_keys)).build()?);
|
||||
delete_request
|
||||
.send()
|
||||
.await
|
||||
.context("deletetion request")?;
|
||||
.context("DeleteObjects request")?;
|
||||
progress_tracker.register(num_deleted);
|
||||
}
|
||||
}
|
||||
@@ -430,13 +431,8 @@ pub async fn purge_garbage(
|
||||
input_path
|
||||
);
|
||||
|
||||
let remote_client =
|
||||
init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
|
||||
|
||||
assert_eq!(
|
||||
&garbage_list.bucket_config.bucket,
|
||||
remote_client.bucket_name().unwrap()
|
||||
);
|
||||
let (s3_client, target) =
|
||||
init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
|
||||
|
||||
// Sanity checks on the incoming list
|
||||
if garbage_list.active_tenant_count == 0 {
|
||||
@@ -468,13 +464,16 @@ pub async fn purge_garbage(
|
||||
|
||||
let items = tokio_stream::iter(filtered_items.map(Ok));
|
||||
let get_objects_results = items.map_ok(|i| {
|
||||
let remote_client = remote_client.clone();
|
||||
let s3_client = s3_client.clone();
|
||||
let target = target.clone();
|
||||
async move {
|
||||
match i.entity {
|
||||
GarbageEntity::Tenant(tenant_id) => {
|
||||
get_tenant_objects(&remote_client, tenant_id).await
|
||||
get_tenant_objects(&s3_client, target, tenant_id).await
|
||||
}
|
||||
GarbageEntity::Timeline(ttid) => {
|
||||
get_timeline_objects(&s3_client, target, ttid).await
|
||||
}
|
||||
GarbageEntity::Timeline(ttid) => get_timeline_objects(&remote_client, ttid).await,
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -488,7 +487,8 @@ pub async fn purge_garbage(
|
||||
objects_to_delete.append(&mut object_list);
|
||||
if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
|
||||
do_delete(
|
||||
&remote_client,
|
||||
&s3_client,
|
||||
&garbage_list.bucket_config.bucket,
|
||||
&mut objects_to_delete,
|
||||
dry_run,
|
||||
false,
|
||||
@@ -499,7 +499,8 @@ pub async fn purge_garbage(
|
||||
}
|
||||
|
||||
do_delete(
|
||||
&remote_client,
|
||||
&s3_client,
|
||||
&garbage_list.bucket_config.bucket,
|
||||
&mut objects_to_delete,
|
||||
dry_run,
|
||||
true,
|
||||
|
||||
@@ -22,13 +22,9 @@ use aws_sdk_s3::Client;
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::ValueEnum;
|
||||
use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
|
||||
use pageserver::tenant::TENANTS_SEGMENT_NAME;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
};
|
||||
use remote_storage::RemotePath;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::AsyncReadExt;
|
||||
@@ -219,10 +215,6 @@ impl RootTarget {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remote_timeline_path_id(id: &TenantShardTimelineId) -> RemotePath {
|
||||
remote_timeline_path(&id.tenant_shard_id, &id.timeline_id)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct BucketConfig {
|
||||
@@ -304,7 +296,7 @@ pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
|
||||
}
|
||||
}
|
||||
|
||||
async fn init_s3_client(bucket_region: Region) -> Client {
|
||||
pub async fn init_s3_client(bucket_region: Region) -> Client {
|
||||
let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
|
||||
.region(bucket_region)
|
||||
.load()
|
||||
@@ -312,13 +304,6 @@ async fn init_s3_client(bucket_region: Region) -> Client {
|
||||
Client::new(&config)
|
||||
}
|
||||
|
||||
fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str {
|
||||
match node_kind {
|
||||
NodeKind::Pageserver => "pageserver/v1/",
|
||||
NodeKind::Safekeeper => "wal/",
|
||||
}
|
||||
}
|
||||
|
||||
async fn init_remote(
|
||||
bucket_config: BucketConfig,
|
||||
node_kind: NodeKind,
|
||||
@@ -326,17 +311,18 @@ async fn init_remote(
|
||||
let bucket_region = Region::new(bucket_config.region);
|
||||
let delimiter = "/".to_string();
|
||||
let s3_client = Arc::new(init_s3_client(bucket_region).await);
|
||||
let default_prefix = default_prefix_in_bucket(node_kind).to_string();
|
||||
|
||||
let s3_root = match node_kind {
|
||||
NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
|
||||
bucket_name: bucket_config.bucket,
|
||||
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
|
||||
prefix_in_bucket: bucket_config
|
||||
.prefix_in_bucket
|
||||
.unwrap_or("pageserver/v1".to_string()),
|
||||
delimiter,
|
||||
}),
|
||||
NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
|
||||
bucket_name: bucket_config.bucket,
|
||||
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
|
||||
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()),
|
||||
delimiter,
|
||||
}),
|
||||
};
|
||||
@@ -344,31 +330,6 @@ async fn init_remote(
|
||||
Ok((s3_client, s3_root))
|
||||
}
|
||||
|
||||
async fn init_remote_generic(
|
||||
bucket_config: BucketConfig,
|
||||
node_kind: NodeKind,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
let endpoint = env::var("AWS_ENDPOINT_URL").ok();
|
||||
let default_prefix = default_prefix_in_bucket(node_kind).to_string();
|
||||
let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix));
|
||||
let storage = S3Config {
|
||||
bucket_name: bucket_config.bucket,
|
||||
bucket_region: bucket_config.region,
|
||||
prefix_in_bucket,
|
||||
endpoint,
|
||||
concurrency_limit: DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
|
||||
.try_into()
|
||||
.unwrap(),
|
||||
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
upload_storage_class: None,
|
||||
};
|
||||
let storage_config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::AwsS3(storage),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
GenericRemoteStorage::from_config(&storage_config).await
|
||||
}
|
||||
|
||||
async fn list_objects_with_retries(
|
||||
s3_client: &Client,
|
||||
s3_target: &S3Target,
|
||||
|
||||
@@ -8,11 +8,12 @@ use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
|
||||
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
|
||||
use aws_sdk_s3::Client;
|
||||
use futures_util::{StreamExt, TryStreamExt};
|
||||
use histogram::Histogram;
|
||||
use pageserver::tenant::remote_timeline_client::remote_layer_path;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use serde::Serialize;
|
||||
use utils::id::TenantId;
|
||||
use utils::shard::ShardCount;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct MetadataSummary {
|
||||
@@ -23,6 +24,66 @@ pub struct MetadataSummary {
|
||||
with_warnings: HashSet<TenantShardTimelineId>,
|
||||
with_orphans: HashSet<TenantShardTimelineId>,
|
||||
indices_by_version: HashMap<usize, usize>,
|
||||
|
||||
layer_count: MinMaxHisto,
|
||||
timeline_size_bytes: MinMaxHisto,
|
||||
layer_size_bytes: MinMaxHisto,
|
||||
}
|
||||
|
||||
/// A histogram plus minimum and maximum tracking
|
||||
#[derive(Serialize)]
|
||||
struct MinMaxHisto {
|
||||
#[serde(skip)]
|
||||
histo: Histogram,
|
||||
min: u64,
|
||||
max: u64,
|
||||
}
|
||||
|
||||
impl MinMaxHisto {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
histo: histogram::Histogram::builder()
|
||||
.build()
|
||||
.expect("Bad histogram params"),
|
||||
min: u64::MAX,
|
||||
max: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn sample(&mut self, v: u64) -> Result<(), histogram::Error> {
|
||||
self.min = std::cmp::min(self.min, v);
|
||||
self.max = std::cmp::max(self.max, v);
|
||||
let r = self.histo.increment(v, 1);
|
||||
|
||||
if r.is_err() {
|
||||
tracing::warn!("Bad histogram sample: {v}");
|
||||
}
|
||||
|
||||
r
|
||||
}
|
||||
|
||||
fn oneline(&self) -> String {
|
||||
let percentiles = match self.histo.percentiles(&[1.0, 10.0, 50.0, 90.0, 99.0]) {
|
||||
Ok(p) => p,
|
||||
Err(e) => return format!("No data: {}", e),
|
||||
};
|
||||
|
||||
let percentiles: Vec<u64> = percentiles
|
||||
.iter()
|
||||
.map(|p| p.bucket().low() + p.bucket().high() / 2)
|
||||
.collect();
|
||||
|
||||
format!(
|
||||
"min {}, 1% {}, 10% {}, 50% {}, 90% {}, 99% {}, max {}",
|
||||
self.min,
|
||||
percentiles[0],
|
||||
percentiles[1],
|
||||
percentiles[2],
|
||||
percentiles[3],
|
||||
percentiles[4],
|
||||
self.max,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl MetadataSummary {
|
||||
@@ -35,9 +96,25 @@ impl MetadataSummary {
|
||||
with_warnings: HashSet::new(),
|
||||
with_orphans: HashSet::new(),
|
||||
indices_by_version: HashMap::new(),
|
||||
layer_count: MinMaxHisto::new(),
|
||||
timeline_size_bytes: MinMaxHisto::new(),
|
||||
layer_size_bytes: MinMaxHisto::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn update_histograms(&mut self, index_part: &IndexPart) -> Result<(), histogram::Error> {
|
||||
self.layer_count
|
||||
.sample(index_part.layer_metadata.len() as u64)?;
|
||||
let mut total_size: u64 = 0;
|
||||
for meta in index_part.layer_metadata.values() {
|
||||
total_size += meta.file_size;
|
||||
self.layer_size_bytes.sample(meta.file_size)?;
|
||||
}
|
||||
self.timeline_size_bytes.sample(total_size)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_data(&mut self, data: &S3TimelineBlobData) {
|
||||
self.timeline_shard_count += 1;
|
||||
if let BlobDataParseResult::Parsed {
|
||||
@@ -50,6 +127,14 @@ impl MetadataSummary {
|
||||
.indices_by_version
|
||||
.entry(index_part.version())
|
||||
.or_insert(0) += 1;
|
||||
|
||||
if let Err(e) = self.update_histograms(index_part) {
|
||||
// Value out of range? Warn that the results are untrustworthy
|
||||
tracing::warn!(
|
||||
"Error updating histograms, summary stats may be wrong: {}",
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,6 +169,9 @@ With errors: {}
|
||||
With warnings: {}
|
||||
With orphan layers: {}
|
||||
Index versions: {version_summary}
|
||||
Timeline size bytes: {}
|
||||
Layer size bytes: {}
|
||||
Timeline layer count: {}
|
||||
",
|
||||
self.tenant_count,
|
||||
self.timeline_count,
|
||||
@@ -91,6 +179,9 @@ Index versions: {version_summary}
|
||||
self.with_errors.len(),
|
||||
self.with_warnings.len(),
|
||||
self.with_orphans.len(),
|
||||
self.timeline_size_bytes.oneline(),
|
||||
self.layer_size_bytes.oneline(),
|
||||
self.layer_count.oneline(),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -144,60 +235,33 @@ pub async fn scan_metadata(
|
||||
let mut tenant_objects = TenantObjectListing::default();
|
||||
let mut tenant_timeline_results = Vec::new();
|
||||
|
||||
async fn analyze_tenant(
|
||||
s3_client: &Client,
|
||||
target: &RootTarget,
|
||||
fn analyze_tenant(
|
||||
tenant_id: TenantId,
|
||||
summary: &mut MetadataSummary,
|
||||
mut tenant_objects: TenantObjectListing,
|
||||
timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>,
|
||||
highest_shard_count: ShardCount,
|
||||
) {
|
||||
summary.tenant_count += 1;
|
||||
|
||||
let mut timeline_ids = HashSet::new();
|
||||
let mut timeline_generations = HashMap::new();
|
||||
for (ttid, data) in timelines {
|
||||
if ttid.tenant_shard_id.shard_count == highest_shard_count {
|
||||
// Only analyze `TenantShardId`s with highest shard count.
|
||||
|
||||
// Stash the generation of each timeline, for later use identifying orphan layers
|
||||
if let BlobDataParseResult::Parsed {
|
||||
index_part,
|
||||
index_part_generation,
|
||||
s3_layers: _s3_layers,
|
||||
} = &data.blob_data
|
||||
{
|
||||
if index_part.deleted_at.is_some() {
|
||||
// skip deleted timeline.
|
||||
tracing::info!("Skip analysis of {} b/c timeline is already deleted", ttid);
|
||||
continue;
|
||||
}
|
||||
timeline_generations.insert(ttid, *index_part_generation);
|
||||
}
|
||||
|
||||
// Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
|
||||
// reference counts for layers across the tenant.
|
||||
let analysis = branch_cleanup_and_check_errors(
|
||||
s3_client,
|
||||
target,
|
||||
&ttid,
|
||||
&mut tenant_objects,
|
||||
None,
|
||||
None,
|
||||
Some(data),
|
||||
)
|
||||
.await;
|
||||
summary.update_analysis(&ttid, &analysis);
|
||||
|
||||
timeline_ids.insert(ttid.timeline_id);
|
||||
} else {
|
||||
tracing::info!(
|
||||
"Skip analysis of {} b/c a lower shard count than {}",
|
||||
ttid,
|
||||
highest_shard_count.0,
|
||||
);
|
||||
timeline_ids.insert(ttid.timeline_id);
|
||||
// Stash the generation of each timeline, for later use identifying orphan layers
|
||||
if let BlobDataParseResult::Parsed {
|
||||
index_part: _index_part,
|
||||
index_part_generation,
|
||||
s3_layers: _s3_layers,
|
||||
} = &data.blob_data
|
||||
{
|
||||
timeline_generations.insert(ttid, *index_part_generation);
|
||||
}
|
||||
|
||||
// Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
|
||||
// reference counts for layers across the tenant.
|
||||
let analysis =
|
||||
branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data));
|
||||
summary.update_analysis(&ttid, &analysis);
|
||||
}
|
||||
|
||||
summary.timeline_count += timeline_ids.len();
|
||||
@@ -245,35 +309,18 @@ pub async fn scan_metadata(
|
||||
// all results for the same tenant will be adjacent. We accumulate these,
|
||||
// and then call `analyze_tenant` to flush, when we see the next tenant ID.
|
||||
let mut summary = MetadataSummary::new();
|
||||
let mut highest_shard_count = ShardCount::MIN;
|
||||
while let Some(i) = timelines.next().await {
|
||||
let (ttid, data) = i?;
|
||||
summary.update_data(&data);
|
||||
|
||||
match tenant_id {
|
||||
None => {
|
||||
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
|
||||
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
|
||||
}
|
||||
None => tenant_id = Some(ttid.tenant_shard_id.tenant_id),
|
||||
Some(prev_tenant_id) => {
|
||||
if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
|
||||
// New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results
|
||||
let tenant_objects = std::mem::take(&mut tenant_objects);
|
||||
let timelines = std::mem::take(&mut tenant_timeline_results);
|
||||
analyze_tenant(
|
||||
&s3_client,
|
||||
&target,
|
||||
prev_tenant_id,
|
||||
&mut summary,
|
||||
tenant_objects,
|
||||
timelines,
|
||||
highest_shard_count,
|
||||
)
|
||||
.await;
|
||||
analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines);
|
||||
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
|
||||
highest_shard_count = ttid.tenant_shard_id.shard_count;
|
||||
} else {
|
||||
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -291,15 +338,11 @@ pub async fn scan_metadata(
|
||||
|
||||
if !tenant_timeline_results.is_empty() {
|
||||
analyze_tenant(
|
||||
&s3_client,
|
||||
&target,
|
||||
tenant_id.expect("Must be set if results are present"),
|
||||
&mut summary,
|
||||
tenant_objects,
|
||||
tenant_timeline_results,
|
||||
highest_shard_count,
|
||||
)
|
||||
.await;
|
||||
);
|
||||
}
|
||||
|
||||
Ok(summary)
|
||||
|
||||
@@ -143,9 +143,6 @@ class TimelineId(Id):
|
||||
def __repr__(self) -> str:
|
||||
return f'TimelineId("{self.id.hex()}")'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.id.hex()
|
||||
|
||||
|
||||
# Workaround for compat with python 3.9, which does not have `typing.Self`
|
||||
TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId")
|
||||
|
||||
@@ -102,7 +102,6 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
|
||||
# failing to connect to them.
|
||||
".*Call to node.*management API.*failed.*receive body.*",
|
||||
".*Call to node.*management API.*failed.*ReceiveBody.*",
|
||||
".*Failed to update node .+ after heartbeat round.*error sending request for url.*",
|
||||
# Many tests will start up with a node offline
|
||||
".*startup_reconcile: Could not scan node.*",
|
||||
# Tests run in dev mode
|
||||
|
||||
@@ -662,7 +662,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
force_repartition=False,
|
||||
force_image_layer_creation=False,
|
||||
wait_until_uploaded=False,
|
||||
compact: Optional[bool] = None,
|
||||
):
|
||||
self.is_testing_enabled_or_skip()
|
||||
query = {}
|
||||
@@ -673,9 +672,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
if wait_until_uploaded:
|
||||
query["wait_until_uploaded"] = "true"
|
||||
|
||||
if compact is not None:
|
||||
query["compact"] = "true" if compact else "false"
|
||||
|
||||
log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import concurrent.futures
|
||||
import time
|
||||
from typing import Any, Callable, Dict, Tuple
|
||||
|
||||
import fixtures.pageserver.remote_storage
|
||||
@@ -8,6 +9,9 @@ from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
)
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
||||
|
||||
|
||||
@@ -42,33 +46,38 @@ def single_timeline(
|
||||
log.info(f"duplicating template tenant {ncopies} times in S3")
|
||||
tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
|
||||
|
||||
# In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
|
||||
# However, on-demand downloads are quite slow ATM.
|
||||
# => do the on-demand downloads in Python.
|
||||
log.info("python-side on-demand download the layer files into local tenant dir")
|
||||
tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
|
||||
fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
|
||||
env, tenant_timelines
|
||||
)
|
||||
|
||||
log.info("attach duplicated tenants to pageserver")
|
||||
# In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
|
||||
# However, on-demand downloads are quite slow ATM.
|
||||
# => do the on-demand downloads in Python.
|
||||
assert ps_http.tenant_list() == []
|
||||
# make the attach fail after it created enough on-disk state to retry loading
|
||||
# the tenant next startup, but before it can start background loops that would start download
|
||||
ps_http.configure_failpoints(("attach-before-activate", "return"))
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*attach failed, setting tenant state to Broken: attach-before-activate.*"
|
||||
)
|
||||
|
||||
def attach(tenant):
|
||||
def attach_broken(tenant):
|
||||
env.pageserver.tenant_attach(
|
||||
tenant,
|
||||
config=template_config.copy(),
|
||||
generation=100,
|
||||
override_storage_controller_generation=True,
|
||||
)
|
||||
time.sleep(0.1)
|
||||
wait_until_tenant_state(ps_http, tenant, "Broken", 10)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
|
||||
executor.map(attach, tenants)
|
||||
executor.map(attach_broken, tenants)
|
||||
|
||||
# Benchmarks will start the pageserver explicitly themselves
|
||||
env.pageserver.stop()
|
||||
env.pageserver.stop(
|
||||
immediate=True
|
||||
) # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout
|
||||
tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
|
||||
log.info("python-side on-demand download the layer files into local tenant dir")
|
||||
fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
|
||||
env, tenant_timelines
|
||||
)
|
||||
|
||||
return env
|
||||
|
||||
@@ -12,9 +12,8 @@ import boto3
|
||||
import toml
|
||||
from mypy_boto3_s3 import S3Client
|
||||
|
||||
from fixtures.common_types import TenantId, TenantShardId, TimelineId
|
||||
from fixtures.common_types import TenantId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.common_types import IndexPartDump
|
||||
|
||||
TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
|
||||
TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json"
|
||||
@@ -266,38 +265,9 @@ class S3Storage:
|
||||
def tenants_path(self) -> str:
|
||||
return f"{self.prefix_in_bucket}/tenants"
|
||||
|
||||
def tenant_path(self, tenant_id: Union[TenantShardId, TenantId]) -> str:
|
||||
def tenant_path(self, tenant_id: TenantId) -> str:
|
||||
return f"{self.tenants_path()}/{tenant_id}"
|
||||
|
||||
def timeline_path(
|
||||
self, tenant_id: Union[TenantShardId, TenantId], timeline_id: TimelineId
|
||||
) -> str:
|
||||
return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}"
|
||||
|
||||
def get_latest_index_key(self, index_keys: List[str]) -> str:
|
||||
"""
|
||||
Gets the latest index file key.
|
||||
|
||||
@param index_keys: A list of index keys of different generations.
|
||||
"""
|
||||
|
||||
def parse_gen(index_key: str) -> int:
|
||||
parts = index_key.split("index_part.json-")
|
||||
return int(parts[-1], base=16) if len(parts) == 2 else -1
|
||||
|
||||
return max(index_keys, key=parse_gen)
|
||||
|
||||
def download_index_part(self, index_key: str) -> IndexPartDump:
|
||||
"""
|
||||
Downloads the index content from remote storage.
|
||||
|
||||
@param index_key: index key in remote storage.
|
||||
"""
|
||||
response = self.client.get_object(Bucket=self.bucket_name, Key=index_key)
|
||||
body = response["Body"].read().decode("utf-8")
|
||||
log.info(f"index_part.json: {body}")
|
||||
return IndexPartDump.from_json(json.loads(body))
|
||||
|
||||
def heatmap_key(self, tenant_id: TenantId) -> str:
|
||||
return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"
|
||||
|
||||
|
||||
@@ -17,11 +17,13 @@ from performance.pageserver.util import (
|
||||
@pytest.mark.parametrize("duration", [30])
|
||||
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
|
||||
@pytest.mark.parametrize("n_tenants", [10])
|
||||
@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
|
||||
@pytest.mark.timeout(1000)
|
||||
def test_basebackup_with_high_slru_count(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
zenbenchmark: NeonBenchmarker,
|
||||
pg_bin: PgBin,
|
||||
get_vectored_impl: str,
|
||||
n_tenants: int,
|
||||
pgbench_scale: int,
|
||||
duration: int,
|
||||
@@ -45,7 +47,7 @@ def test_basebackup_with_high_slru_count(
|
||||
max_file_descriptors = 500000
|
||||
neon_env_builder.pageserver_config_override = (
|
||||
f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; "
|
||||
f"get_vectored_impl='vectored'; validate_vectored_get=false"
|
||||
f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false"
|
||||
)
|
||||
params.update(
|
||||
{
|
||||
|
||||
@@ -135,7 +135,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare):
|
||||
# To be fixed in https://github.com/neondatabase/neon/issues/8184, after which
|
||||
# this memory estimate can be revised far downwards to something that doesn't scale
|
||||
# linearly with the layer sizes.
|
||||
MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.5
|
||||
MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25
|
||||
|
||||
# If we find that compaction is using more memory, this may indicate a regression
|
||||
assert compaction_mapped_rss < MEMORY_ESTIMATE
|
||||
|
||||
@@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.66"
|
||||
version = "0.10.64"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1"
|
||||
checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f"
|
||||
dependencies = [
|
||||
"bitflags 2.6.0",
|
||||
"cfg-if",
|
||||
@@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.103"
|
||||
version = "0.9.102"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6"
|
||||
checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
|
||||
@@ -3,16 +3,9 @@ import asyncio
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
from fixtures.remote_storage import RemoteStorageKind
|
||||
from werkzeug.wrappers.request import Request
|
||||
from werkzeug.wrappers.response import Response
|
||||
|
||||
|
||||
def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver):
|
||||
"""
|
||||
A relatively low level test of reconfiguring a compute's pageserver at runtime. Usually this
|
||||
is all done via the storage controller, but this test will disable the storage controller's compute
|
||||
notifications, and instead update endpoints directly.
|
||||
"""
|
||||
def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
|
||||
num_connections = 3
|
||||
|
||||
neon_env_builder.num_pageservers = 2
|
||||
@@ -21,24 +14,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver):
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
neon_env_builder.control_plane_compute_hook_api = (
|
||||
f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach"
|
||||
)
|
||||
|
||||
def ignore_notify(request: Request):
|
||||
# This test does direct updates to compute configuration: disable the storage controller's notification
|
||||
log.info(f"Ignoring storage controller compute notification: {request.json}")
|
||||
return Response(status=200)
|
||||
|
||||
make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(
|
||||
ignore_notify
|
||||
)
|
||||
|
||||
env.neon_cli.create_branch("test_change_pageserver")
|
||||
endpoint = env.endpoints.create_start("test_change_pageserver")
|
||||
|
||||
# Put this tenant into a dual-attached state
|
||||
assert env.get_tenant_pageserver(env.initial_tenant) == env.pageservers[0]
|
||||
alt_pageserver_id = env.pageservers[1].id
|
||||
env.pageservers[1].tenant_attach(env.initial_tenant)
|
||||
|
||||
@@ -94,7 +72,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver):
|
||||
env.pageservers[
|
||||
0
|
||||
].stop() # Stop the old pageserver just to make sure we're reading from the new one
|
||||
env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
|
||||
|
||||
execute("SELECT count(*) FROM foo")
|
||||
assert fetchone() == (100000,)
|
||||
@@ -105,10 +82,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver):
|
||||
#
|
||||
# Since we're dual-attached, need to tip-off storage controller to treat the one we're
|
||||
# about to start as the attached pageserver
|
||||
env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
|
||||
env.pageservers[0].start()
|
||||
env.pageservers[1].stop()
|
||||
env.storage_controller.node_configure(env.pageservers[1].id, {"availability": "Offline"})
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
endpoint.reconfigure(pageserver_id=env.pageservers[0].id)
|
||||
|
||||
@@ -116,9 +92,10 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver):
|
||||
assert fetchone() == (100000,)
|
||||
|
||||
env.pageservers[0].stop()
|
||||
# Since we're dual-attached, need to tip-off storage controller to treat the one we're
|
||||
# about to start as the attached pageserver
|
||||
env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id)
|
||||
env.pageservers[1].start()
|
||||
env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
# Test a (former) bug where a child process spins without updating its connection string
|
||||
# by executing a query separately. This query will hang until we issue the reconfigure.
|
||||
|
||||
@@ -21,10 +21,6 @@ from fixtures.utils import human_bytes, wait_until
|
||||
|
||||
GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
|
||||
|
||||
# access times in the pageserver are stored at a very low resolution: to generate meaningfully different
|
||||
# values, tests must inject sleeps
|
||||
ATIME_RESOLUTION = 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("config_level_override", [None, 400])
|
||||
def test_min_resident_size_override_handling(
|
||||
@@ -71,11 +67,14 @@ def test_min_resident_size_override_handling(
|
||||
|
||||
@enum.unique
|
||||
class EvictionOrder(str, enum.Enum):
|
||||
ABSOLUTE_ORDER = "absolute"
|
||||
RELATIVE_ORDER_EQUAL = "relative_equal"
|
||||
RELATIVE_ORDER_SPARE = "relative_spare"
|
||||
|
||||
def config(self) -> Dict[str, Any]:
|
||||
if self == EvictionOrder.RELATIVE_ORDER_EQUAL:
|
||||
if self == EvictionOrder.ABSOLUTE_ORDER:
|
||||
return {"type": "AbsoluteAccessed"}
|
||||
elif self == EvictionOrder.RELATIVE_ORDER_EQUAL:
|
||||
return {
|
||||
"type": "RelativeAccessed",
|
||||
"args": {"highest_layer_count_loses_first": False},
|
||||
@@ -385,7 +384,7 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[EvictionOrder.RELATIVE_ORDER_EQUAL],
|
||||
[EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
|
||||
)
|
||||
def test_pageserver_evicts_until_pressure_is_relieved(
|
||||
eviction_env: EvictionEnv, order: EvictionOrder
|
||||
@@ -419,7 +418,7 @@ def test_pageserver_evicts_until_pressure_is_relieved(
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[EvictionOrder.RELATIVE_ORDER_EQUAL],
|
||||
[EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
|
||||
)
|
||||
def test_pageserver_respects_overridden_resident_size(
|
||||
eviction_env: EvictionEnv, order: EvictionOrder
|
||||
@@ -496,7 +495,7 @@ def test_pageserver_respects_overridden_resident_size(
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[EvictionOrder.RELATIVE_ORDER_EQUAL],
|
||||
[EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
|
||||
)
|
||||
def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder):
|
||||
"""
|
||||
@@ -527,6 +526,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[
|
||||
EvictionOrder.ABSOLUTE_ORDER,
|
||||
EvictionOrder.RELATIVE_ORDER_EQUAL,
|
||||
EvictionOrder.RELATIVE_ORDER_SPARE,
|
||||
],
|
||||
@@ -550,7 +550,6 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
|
||||
(tenant_id, timeline_id) = warm
|
||||
|
||||
# make picked tenant more recently used than the other one
|
||||
time.sleep(ATIME_RESOLUTION)
|
||||
env.warm_up_tenant(tenant_id)
|
||||
|
||||
# Build up enough pressure to require evictions from both tenants,
|
||||
@@ -573,38 +572,63 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
|
||||
later_tenant_usage < du_by_timeline[tenant]
|
||||
), "all tenants should have lost some layers"
|
||||
|
||||
# with relative order what matters is the amount of layers, with a
|
||||
# fudge factor of whether the eviction bothers tenants with highest
|
||||
# layer count the most. last accessed times between tenants does not
|
||||
# matter.
|
||||
assert order in [EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE]
|
||||
layers_now = env.count_layers_per_tenant(env.pageserver)
|
||||
warm_size = later_du_by_timeline[warm]
|
||||
cold_size = later_du_by_timeline[cold]
|
||||
|
||||
expected_ratio = later_total_on_disk / total_on_disk
|
||||
log.info(
|
||||
f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio"
|
||||
)
|
||||
if order == EvictionOrder.ABSOLUTE_ORDER:
|
||||
# bounds for warmed_size
|
||||
warm_lower = 0.5 * du_by_timeline[warm]
|
||||
|
||||
for tenant_id, original_count in tenant_layers.items():
|
||||
count_now = layers_now[tenant_id]
|
||||
ratio = count_now / original_count
|
||||
abs_diff = abs(ratio - expected_ratio)
|
||||
assert original_count > count_now
|
||||
# We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
|
||||
# So, check for up to 3 here.
|
||||
warm_upper = warm_lower + 3 * env.layer_size
|
||||
|
||||
expectation = 0.06
|
||||
cold_upper = 2 * env.layer_size
|
||||
log.info(f"tenants: warm={warm[0]}, cold={cold[0]}")
|
||||
log.info(
|
||||
f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}"
|
||||
f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
|
||||
)
|
||||
# in this test case both relative_spare and relative_equal produce
|
||||
# the same outcomes; this must be a quantization effect of similar
|
||||
# sizes (-s4 and -s6) and small (5MB) layer size.
|
||||
# for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02
|
||||
assert abs_diff < expectation
|
||||
log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
|
||||
|
||||
assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
|
||||
assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
|
||||
|
||||
assert (
|
||||
cold_size < cold_upper
|
||||
), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
|
||||
else:
|
||||
# with relative order what matters is the amount of layers, with a
|
||||
# fudge factor of whether the eviction bothers tenants with highest
|
||||
# layer count the most. last accessed times between tenants does not
|
||||
# matter.
|
||||
layers_now = env.count_layers_per_tenant(env.pageserver)
|
||||
|
||||
expected_ratio = later_total_on_disk / total_on_disk
|
||||
log.info(
|
||||
f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio"
|
||||
)
|
||||
|
||||
for tenant_id, original_count in tenant_layers.items():
|
||||
count_now = layers_now[tenant_id]
|
||||
ratio = count_now / original_count
|
||||
abs_diff = abs(ratio - expected_ratio)
|
||||
assert original_count > count_now
|
||||
|
||||
expectation = 0.06
|
||||
log.info(
|
||||
f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}"
|
||||
)
|
||||
# in this test case both relative_spare and relative_equal produce
|
||||
# the same outcomes; this must be a quantization effect of similar
|
||||
# sizes (-s4 and -s6) and small (5MB) layer size.
|
||||
# for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02
|
||||
assert abs_diff < expectation
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[
|
||||
EvictionOrder.ABSOLUTE_ORDER,
|
||||
EvictionOrder.RELATIVE_ORDER_EQUAL,
|
||||
EvictionOrder.RELATIVE_ORDER_SPARE,
|
||||
],
|
||||
@@ -627,10 +651,6 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or
|
||||
for scale in [1, 1, 1, 4]:
|
||||
timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale))
|
||||
|
||||
# Eviction times are stored at a low resolution. We must ensure that the time between
|
||||
# tenants is long enough for the pageserver to distinguish them.
|
||||
time.sleep(ATIME_RESOLUTION)
|
||||
|
||||
env.neon_cli.safekeeper_stop()
|
||||
|
||||
for (tenant_id, timeline_id), scale in timelines:
|
||||
@@ -660,7 +680,14 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or
|
||||
), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order"
|
||||
log.info(f"{ratios}")
|
||||
|
||||
if order == EvictionOrder.RELATIVE_ORDER_EQUAL:
|
||||
if order == EvictionOrder.ABSOLUTE_ORDER:
|
||||
# first tenant loses most
|
||||
assert ratios[0] <= ratios[1], "first should lose the most"
|
||||
assert ratios[1] < ratios[2], "second should lose some"
|
||||
assert ratios[1] < 1.0
|
||||
assert ratios[2] <= ratios[3], "third might not lose"
|
||||
assert ratios[3] == 1.0, "tenant created last does not lose"
|
||||
elif order == EvictionOrder.RELATIVE_ORDER_EQUAL:
|
||||
assert all([x for x in ratios if x < 1.0]), "all tenants lose layers"
|
||||
elif order == EvictionOrder.RELATIVE_ORDER_SPARE:
|
||||
# with different layer sizes and pg versions, there are different combinations
|
||||
@@ -723,7 +750,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
|
||||
"type": "Failure",
|
||||
"mocked_error": "EIO",
|
||||
},
|
||||
eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE,
|
||||
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
|
||||
)
|
||||
|
||||
env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO")
|
||||
@@ -757,7 +784,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
|
||||
# This avoids accounting for metadata files & tenant conf in the tests.
|
||||
"name_filter": ".*__.*",
|
||||
},
|
||||
eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE,
|
||||
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
|
||||
)
|
||||
|
||||
wait_until(
|
||||
@@ -810,7 +837,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
|
||||
# This avoids accounting for metadata files & tenant conf in the tests.
|
||||
"name_filter": ".*__.*",
|
||||
},
|
||||
eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE,
|
||||
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
|
||||
)
|
||||
|
||||
wait_until(
|
||||
|
||||
@@ -89,7 +89,7 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
|
||||
)
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute("create extension neon")
|
||||
cur.execute("create extension neon version '1.4'")
|
||||
cur.execute(
|
||||
"create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
|
||||
)
|
||||
|
||||
@@ -24,7 +24,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
|
||||
# IMPORTANT:
|
||||
# If the version has changed, the test should be updated.
|
||||
# Ensure that the default version is also updated in the neon.control file
|
||||
assert cur.fetchone() == ("1.4",)
|
||||
assert cur.fetchone() == ("1.3",)
|
||||
cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
|
||||
res = cur.fetchall()
|
||||
log.info(res)
|
||||
@@ -48,10 +48,10 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder):
|
||||
# IMPORTANT:
|
||||
# If the version has changed, the test should be updated.
|
||||
# Ensure that the default version is also updated in the neon.control file
|
||||
assert cur.fetchone() == ("1.4",)
|
||||
assert cur.fetchone() == ("1.3",)
|
||||
cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
|
||||
all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"]
|
||||
current_version = "1.4"
|
||||
current_version = "1.3"
|
||||
for idx, begin_version in enumerate(all_versions):
|
||||
for target_version in all_versions[idx + 1 :]:
|
||||
if current_version != begin_version:
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
|
||||
def test_oid_overflow(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute("CREATE EXTENSION neon_test_utils")
|
||||
|
||||
cur.execute("CREATE TABLE t1(x integer)")
|
||||
cur.execute("INSERT INTO t1 values (1)")
|
||||
cur.execute("CREATE TABLE t2(x integer)")
|
||||
cur.execute("INSERT INTO t2 values (2)")
|
||||
|
||||
cur.execute("SELECT x from t1")
|
||||
assert cur.fetchone() == (1,)
|
||||
cur.execute("SELECT x from t2")
|
||||
assert cur.fetchone() == (2,)
|
||||
|
||||
cur.execute("VACUUM FULL t1")
|
||||
cur.execute("VACUUM FULL t1")
|
||||
cur.execute("vacuum pg_class")
|
||||
cur.execute("SELECT relfilenode FROM pg_class where relname='t1'")
|
||||
oid = cur.fetchall()[0][0]
|
||||
log.info(f"t1.relfilenode={oid}")
|
||||
|
||||
cur.execute("set statement_timeout=0")
|
||||
cur.execute(f"select test_consume_oids({oid-1})")
|
||||
cur.execute("VACUUM FULL t2")
|
||||
|
||||
cur.execute("SELECT relfilenode FROM pg_class where relname='t2'")
|
||||
oid = cur.fetchall()[0][0]
|
||||
log.info(f"t2.relfilenode={oid}")
|
||||
|
||||
cur.execute("SELECT clear_buffer_cache()")
|
||||
|
||||
cur.execute("SELECT x from t1")
|
||||
assert cur.fetchone() == (1,)
|
||||
cur.execute("SELECT x from t2")
|
||||
assert cur.fetchone() == (2,)
|
||||
@@ -1,5 +1,8 @@
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import toml
|
||||
from fixtures.common_types import Lsn, TenantId, TimelineId
|
||||
from fixtures.neon_fixtures import (
|
||||
DEFAULT_BRANCH_NAME,
|
||||
@@ -10,6 +13,67 @@ from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
|
||||
def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path):
|
||||
"""
|
||||
NB: The neon_local doesn't use `--init` mode anymore, but our production
|
||||
deployment still does => https://github.com/neondatabase/aws/pull/1322
|
||||
"""
|
||||
workdir = neon_simple_env.pageserver.workdir
|
||||
pageserver_config = workdir / "pageserver.toml"
|
||||
pageserver_bin = neon_binpath / "pageserver"
|
||||
|
||||
def run_pageserver(args):
|
||||
return subprocess.run(
|
||||
[str(pageserver_bin), "-D", str(workdir), *args],
|
||||
check=False,
|
||||
universal_newlines=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
|
||||
neon_simple_env.pageserver.stop()
|
||||
|
||||
with open(neon_simple_env.pageserver.config_toml_path, "r") as f:
|
||||
ps_config = toml.load(f)
|
||||
|
||||
required_config_keys = [
|
||||
"pg_distrib_dir",
|
||||
"listen_pg_addr",
|
||||
"listen_http_addr",
|
||||
"pg_auth_type",
|
||||
"http_auth_type",
|
||||
# TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748
|
||||
# "tenant_config",
|
||||
]
|
||||
required_config_overrides = [
|
||||
f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys
|
||||
]
|
||||
|
||||
pageserver_config.unlink()
|
||||
|
||||
bad_init = run_pageserver(["--init", *required_config_overrides])
|
||||
assert (
|
||||
bad_init.returncode == 1
|
||||
), "pageserver should not be able to init new config without the node id"
|
||||
assert 'missing config value "id"' in bad_init.stderr
|
||||
assert not pageserver_config.exists(), "config file should not be created after init error"
|
||||
|
||||
good_init_cmd = [
|
||||
"--init",
|
||||
f"--config-override=id={ps_config['id']}",
|
||||
*required_config_overrides,
|
||||
]
|
||||
completed_init = run_pageserver(good_init_cmd)
|
||||
assert (
|
||||
completed_init.returncode == 0
|
||||
), "pageserver should be able to create a new config with the node id given"
|
||||
assert pageserver_config.exists(), "config file should be created successfully"
|
||||
|
||||
bad_reinit = run_pageserver(good_init_cmd)
|
||||
assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists"
|
||||
assert "config file already exists" in bad_reinit.stderr
|
||||
|
||||
|
||||
def check_client(env: NeonEnv, client: PageserverHttpClient):
|
||||
pg_version = env.pg_version
|
||||
initial_tenant = env.initial_tenant
|
||||
|
||||
@@ -53,6 +53,25 @@ def test_proxy_select_1(static_proxy: NeonProxy):
|
||||
assert out[0][0] == 42
|
||||
|
||||
|
||||
def test_proxy_server_params(static_proxy: NeonProxy):
|
||||
"""
|
||||
Test that server params are passing through to postgres
|
||||
"""
|
||||
|
||||
out = static_proxy.safe_psql(
|
||||
"select to_json('0 seconds'::interval)", options="-c intervalstyle=iso_8601"
|
||||
)
|
||||
assert out[0][0] == "PT0S"
|
||||
out = static_proxy.safe_psql(
|
||||
"select to_json('0 seconds'::interval)", options="-c intervalstyle=sql_standard"
|
||||
)
|
||||
assert out[0][0] == "0"
|
||||
out = static_proxy.safe_psql(
|
||||
"select to_json('0 seconds'::interval)", options="-c intervalstyle=postgres"
|
||||
)
|
||||
assert out[0][0] == "00:00:00"
|
||||
|
||||
|
||||
def test_password_hack(static_proxy: NeonProxy):
|
||||
"""
|
||||
Check the PasswordHack auth flow: an alternative to SCRAM auth for
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import os
|
||||
import pprint
|
||||
import shutil
|
||||
import threading
|
||||
import time
|
||||
@@ -209,17 +208,9 @@ def test_scrubber_physical_gc_ancestors(
|
||||
new_shard_count = 4
|
||||
assert shard_count is None or new_shard_count > shard_count
|
||||
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
|
||||
env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately
|
||||
|
||||
# Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which
|
||||
# compacts, and we only want to do tha explicitly later in the test.
|
||||
workload.write_rows(100, upload=False)
|
||||
for shard in shards:
|
||||
ps = env.get_tenant_pageserver(shard)
|
||||
log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
|
||||
ps.http_client().timeline_checkpoint(
|
||||
shard, timeline_id, compact=False, wait_until_uploaded=True
|
||||
)
|
||||
# Make sure child shards have some layers
|
||||
workload.write_rows(100)
|
||||
|
||||
# Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once
|
||||
# a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even
|
||||
@@ -382,76 +373,3 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):
|
||||
assert gc_output["ancestor_layers_deleted"] > 0
|
||||
assert gc_output["remote_storage_errors"] == 0
|
||||
assert gc_output["controller_api_errors"] == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("shard_count", [None, 4])
|
||||
def test_scrubber_scan_pageserver_metadata(
|
||||
neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]
|
||||
):
|
||||
"""
|
||||
Create some layers. Delete an object listed in index. Run scrubber and see if it detects the defect.
|
||||
"""
|
||||
|
||||
# Use s3_storage so we could test out scrubber.
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1
|
||||
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
|
||||
|
||||
# Create some layers.
|
||||
|
||||
workload = Workload(env, env.initial_tenant, env.initial_timeline)
|
||||
workload.init()
|
||||
|
||||
for _ in range(3):
|
||||
workload.write_rows(128)
|
||||
|
||||
for pageserver in env.pageservers:
|
||||
pageserver.stop()
|
||||
pageserver.start()
|
||||
|
||||
for _ in range(3):
|
||||
workload.write_rows(128)
|
||||
|
||||
# Get the latest index for a particular timeline.
|
||||
|
||||
tenant_shard_id = TenantShardId(env.initial_tenant, 0, shard_count if shard_count else 0)
|
||||
|
||||
assert isinstance(env.pageserver_remote_storage, S3Storage)
|
||||
timeline_path = env.pageserver_remote_storage.timeline_path(
|
||||
tenant_shard_id, env.initial_timeline
|
||||
)
|
||||
|
||||
client = env.pageserver_remote_storage.client
|
||||
bucket = env.pageserver_remote_storage.bucket_name
|
||||
objects = client.list_objects_v2(Bucket=bucket, Prefix=f"{timeline_path}/", Delimiter="").get(
|
||||
"Contents", []
|
||||
)
|
||||
keys = [obj["Key"] for obj in objects]
|
||||
index_keys = list(filter(lambda s: s.startswith(f"{timeline_path}/index_part"), keys))
|
||||
assert len(index_keys) > 0
|
||||
|
||||
latest_index_key = env.pageserver_remote_storage.get_latest_index_key(index_keys)
|
||||
log.info(f"{latest_index_key=}")
|
||||
|
||||
index = env.pageserver_remote_storage.download_index_part(latest_index_key)
|
||||
|
||||
assert len(index.layer_metadata) > 0
|
||||
it = iter(index.layer_metadata.items())
|
||||
|
||||
scan_summary = env.storage_scrubber.scan_metadata()
|
||||
assert not scan_summary["with_warnings"]
|
||||
assert not scan_summary["with_errors"]
|
||||
|
||||
# Delete a layer file that is listed in the index.
|
||||
layer, metadata = next(it)
|
||||
log.info(f"Deleting {timeline_path}/{layer.to_str()}")
|
||||
delete_response = client.delete_object(
|
||||
Bucket=bucket,
|
||||
Key=f"{timeline_path}/{layer.to_str()}-{metadata.generation:08x}",
|
||||
)
|
||||
log.info(f"delete response: {delete_response}")
|
||||
|
||||
# Check scan summary. Expect it to be a L0 layer so only emit warnings.
|
||||
scan_summary = env.storage_scrubber.scan_metadata()
|
||||
log.info(f"{pprint.pformat(scan_summary)}")
|
||||
assert len(scan_summary["with_warnings"]) > 0
|
||||
|
||||
@@ -52,8 +52,8 @@ def test_threshold_based_eviction(
|
||||
"kind": "NoEviction"
|
||||
}
|
||||
|
||||
eviction_threshold = 10
|
||||
eviction_period = 2
|
||||
eviction_threshold = 5
|
||||
eviction_period = 1
|
||||
ps_http.set_tenant_config(
|
||||
tenant_id,
|
||||
{
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: dbd0e6428b...ad73770c44
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: 035b73a9c5...4874c8e52e
2
vendor/postgres-v16
vendored
2
vendor/postgres-v16
vendored
Submodule vendor/postgres-v16 updated: b39f316137...b810fdfcbb
6
vendor/revisions.json
vendored
6
vendor/revisions.json
vendored
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"v16": ["16.3", "b39f316137fdd29e2da15d2af2fdd1cfd18163be"],
|
||||
"v15": ["15.7", "035b73a9c5998f9a0ef35cc8df1bae680bf770fc"],
|
||||
"v14": ["14.12", "dbd0e6428b9274d72a10ac29bd3e3162faf109d4"]
|
||||
"v16": ["16.3", "b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2"],
|
||||
"v15": ["15.7", "4874c8e52ed349a9f8290bbdcd91eb92677a5d24"],
|
||||
"v14": ["14.12", "ad73770c446ea361f43e4f0404798b7e5e7a62d8"]
|
||||
}
|
||||
|
||||
@@ -236,7 +236,6 @@ files:
|
||||
query: |
|
||||
select sum(pg_database_size(datname)) as total from pg_database;
|
||||
|
||||
# DEPRECATED
|
||||
- metric_name: lfc_approximate_working_set_size
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
@@ -245,20 +244,6 @@ files:
|
||||
query: |
|
||||
select neon.approximate_working_set_size(false) as approximate_working_set_size;
|
||||
|
||||
- metric_name: lfc_approximate_working_set_size_windows
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
key_labels: [duration]
|
||||
values: [size]
|
||||
# NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
|
||||
# of durations in a pretty-printed form.
|
||||
query: |
|
||||
select
|
||||
x as duration,
|
||||
neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
|
||||
from
|
||||
(values ('5m'),('15m'),('1h')) as t (x);
|
||||
|
||||
- metric_name: current_lsn
|
||||
type: gauge
|
||||
help: 'Current LSN of the database'
|
||||
@@ -392,19 +377,13 @@ files:
|
||||
query: |
|
||||
select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
|
||||
|
||||
- metric_name: lfc_approximate_working_set_size_windows
|
||||
- metric_name: lfc_approximate_working_set_size
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
key_labels: [duration_seconds]
|
||||
values: [size]
|
||||
# NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
|
||||
# size looking back 1..60 minutes, labeled with the number of minutes.
|
||||
key_labels:
|
||||
values: [approximate_working_set_size]
|
||||
query: |
|
||||
select
|
||||
x::text as duration_seconds,
|
||||
neon.approximate_working_set_size_seconds(x) as size
|
||||
from
|
||||
(select generate_series * 60 as x from generate_series(1, 60));
|
||||
select neon.approximate_working_set_size(false) as approximate_working_set_size;
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
|
||||
Reference in New Issue
Block a user