Compare commits


44 Commits

Author SHA1 Message Date
Christian Schwarz
6037b8e6c1 prove hypothesis (inefficient fix) 2024-09-15 16:08:11 +01:00
Christian Schwarz
c108872a36 hypothesis 2024-09-15 15:19:04 +01:00
Christian Schwarz
588e6dd1ef include lsn in error msg 2024-09-15 14:01:04 +01:00
Christian Schwarz
2fafe47e09 better error message 2024-09-15 13:45:10 +01:00
Christian Schwarz
e6e72e8b68 log by invocation 2024-09-15 13:38:07 +01:00
Christian Schwarz
2a0695e01a log always, but at debug 2024-09-15 13:28:47 +01:00
Christian Schwarz
af6c168265 debug-log the buggy key 2024-09-15 13:12:25 +01:00
Christian Schwarz
1b3209497f image_layer: don't do .rev(), it's confusing and unlike what we did before 2024-09-15 12:58:13 +01:00
Christian Schwarz
9f97523d0d dump full missing keyspace 2024-09-15 12:40:41 +01:00
Christian Schwarz
33196c90bc todo! 2024-09-15 12:32:17 +01:00
Christian Schwarz
c72c5df922 make sure get_cached_lsn returns what it returned before 2024-09-15 12:20:58 +01:00
Christian Schwarz
4c7599e8df Revert "address the "This is silly" comment"
This reverts commit c736f9d6ef.
2024-09-15 12:18:26 +01:00
Christian Schwarz
4b2b0a24da Revert "WIP better typed"
This reverts commit ef5a95010b.
2024-09-15 12:18:21 +01:00
Christian Schwarz
ef5a95010b WIP better typed 2024-09-15 12:18:15 +01:00
Christian Schwarz
c736f9d6ef address the "This is silly" comment 2024-09-14 15:38:46 +00:00
Christian Schwarz
adc798b59e explicitly pass will_init 2024-09-14 15:15:48 +00:00
Christian Schwarz
f0668a7a4d cargo fmt 2024-09-14 15:15:48 +00:00
Christian Schwarz
6d73642a93 hyptohesis of what's wrong 2024-09-14 14:26:52 +00:00
Christian Schwarz
9012a18fa1 Revert "WIP"
This reverts commit 709a6ad29b.
2024-09-14 14:18:39 +00:00
Christian Schwarz
a6a7550bb4 add some debugging that didn't lead anywhere 2024-09-14 14:18:02 +00:00
Christian Schwarz
10556f25df fix double .remove() on InMemoryLayer read error (not the bug) 2024-09-14 13:41:36 +00:00
Christian Schwarz
f54cf567ff implement io concurrency control (serial / parallel). bug still happens 2024-09-14 13:22:28 +00:00
Christian Schwarz
4303914681 Revert "rule out empty keyspace bug" (it wasn't the bug)
This reverts commit 539225d792.
2024-09-14 13:02:20 +00:00
Christian Schwarz
539225d792 rule out empty keyspace bug 2024-09-14 13:02:06 +00:00
Christian Schwarz
c118736c9c draw-timeline-dir: ability to draw line 2024-09-14 12:48:23 +00:00
Christian Schwarz
3f85246a42 4096 queuedepth in pagebench get-page-latest-lsn 2024-09-14 12:06:57 +00:00
Christian Schwarz
709a6ad29b WIP 2024-09-14 12:05:42 +00:00
Vlad Lazar
3640824553 fixup: incorrect assumption on img layer need 2024-09-12 23:04:58 +01:00
Vlad Lazar
fea1f34f6a fixup: serialize again in image layer 2024-09-12 23:04:10 +01:00
Vlad Lazar
5d40d1ccdd fixup: order of waiters in delta layer 2024-09-12 23:04:01 +01:00
Vlad Lazar
b2cb10590e fixup: deserialize shenanigans 2024-09-12 20:00:06 +01:00
Vlad Lazar
2923fd2a5b fixup: remove stale import 2024-09-12 19:25:46 +01:00
Vlad Lazar
2a5336b9ab fixup image deserialization 2024-09-12 19:24:41 +01:00
Christian Schwarz
6f20726610 Merge remote-tracking branch 'origin/hackaneon/lisbon24/superscalar-page_service--problame/evaluate-debouncer' into hackaneon/lisbon24/superscalar-page_service 2024-09-12 17:47:16 +00:00
Christian Schwarz
29f741e1e9 debounce: actually issue vectored get 2024-09-12 17:46:30 +00:00
Vlad Lazar
2b37a40079 Materialize future ios 2024-09-12 18:25:17 +01:00
Vlad Lazar
af2b65a2fb Rework issuing of IOs on read path 2024-09-12 16:42:35 +01:00
Christian Schwarz
5d194c7824 debounce: bounce if shard or effective request_lsn differ 2024-09-12 14:20:32 +00:00
Christian Schwarz
ac2702afd3 deboucner: move decoding into debounce loop 2024-09-12 10:58:09 +00:00
Christian Schwarz
88fd46d795 sketch interface 2024-09-12 11:35:00 +01:00
Christian Schwarz
2d6763882e pagebench: fake queue depth of 10 2024-09-12 11:35:00 +01:00
Christian Schwarz
c0c23cde72 debouncer 2024-09-12 11:35:00 +01:00
Christian Schwarz
942bc9544b fixup 2024-09-11 20:04:39 +00:00
Christian Schwarz
02b7cdb305 HACK: instrument page_service to count nonblocking consecutive getpage requests 2024-09-11 19:25:19 +01:00
229 changed files with 4109 additions and 6509 deletions
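The base and head refs of this comparison are not preserved in this capture. A minimal sketch of how an equivalent comparison could be reproduced locally; the base branch name is an assumption, only the head branch (hackaneon/lisbon24/superscalar-page_service) appears in the commit messages above:

# Hypothetical refs: 'main' as the base is assumed, it is not shown on this page.
git log --oneline main..hackaneon/lisbon24/superscalar-page_service
git diff --stat main..hackaneon/lisbon24/superscalar-page_service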

View File

@@ -62,7 +62,7 @@ jobs:
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16 17; do
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
@@ -83,10 +83,6 @@ jobs:
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
- name: Set pg 17 revision for caching
id: pg_v17_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
# Set some environment variables used by all the steps.
#
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
@@ -140,13 +136,6 @@ jobs:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Cache postgres v17 build
id: cache_pg_17
uses: actions/cache@v4
with:
path: pg_install/v17
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make postgres-v14 -j$(nproc)
@@ -159,10 +148,6 @@ jobs:
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: mold -run make postgres-v16 -j$(nproc)
- name: Build postgres v17
if: steps.cache_pg_17.outputs.cache-hit != 'true'
run: mold -run make postgres-v17 -j$(nproc)
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
@@ -225,7 +210,7 @@ jobs:
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
export LD_LIBRARY_PATH
#nextest does not yet support running doctests

View File

@@ -52,5 +52,5 @@ jobs:
for image in ${images}; do
docker buildx imagetools create \
-t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
neondatabase/${image}:${{ inputs.image_tag }}
neondatabase/${image}:${{ inputs.image_tag }}
done

View File

@@ -54,8 +54,8 @@ jobs:
build-tag: ${{steps.build-tag.outputs.tag}}
steps:
# Need `fetch-depth: 0` to count the number of commits in the branch
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
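The comment above notes that `fetch-depth: 0` is needed so the build-tag step can count commits. That step is outside this hunk; a hedged sketch of how such a tag could be derived (the exact tag format used by the workflow is an assumption):

# Assumption: the build tag is based on the branch's total commit count,
# which requires the full history fetched by `fetch-depth: 0`.
echo "tag=$(git rev-list --count HEAD)" >> "$GITHUB_OUTPUT"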
@@ -159,10 +159,6 @@ jobs:
# This will catch compiler & clippy warnings in all feature combinations.
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
# NB: keep clippy args in sync with ./run_clippy.sh
#
# The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
# #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
# time just for that, so skip "clippy --release".
- run: |
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
@@ -172,6 +168,8 @@ jobs:
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Run cargo clippy (release)
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
@@ -213,7 +211,7 @@ jobs:
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }}
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
secrets: inherit
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
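The `pg-versions` line above uses the GitHub Actions `&& ... || ...` idiom as a ternary: every supported Postgres version for release builds, only the newest for debug builds. A shell analogue of the same selection, for illustration only (variable names are not from the workflow; shown for the side of the diff without v17):

if [ "$BUILD_TYPE" = "release" ]; then
  pg_versions='["v14", "v15", "v16"]'   # release: test every supported version
else
  pg_versions='["v16"]'                 # debug: only the newest version
fi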
@@ -359,7 +357,6 @@ jobs:
})
coverage-report:
if: ${{ !startsWith(github.ref_name, 'release') }}
needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
runs-on: [ self-hosted, small ]
container:
@@ -376,8 +373,8 @@ jobs:
coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
steps:
# Need `fetch-depth: 0` for differential coverage (to get diff between two commits)
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
@@ -478,9 +475,11 @@ jobs:
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
steps:
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
- uses: docker/setup-buildx-action@v3
@@ -549,15 +548,17 @@ jobs:
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16, v17 ]
version: [ v14, v15, v16 ]
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
steps:
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
- uses: docker/setup-buildx-action@v3
@@ -626,7 +627,7 @@ jobs:
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
if: matrix.version == 'v17'
if: matrix.version == 'v16'
uses: docker/build-push-action@v6
with:
target: compute-tools-image
@@ -648,7 +649,7 @@ jobs:
strategy:
matrix:
version: [ v14, v15, v16, v17 ]
version: [ v14, v15, v16 ]
steps:
- uses: docker/login-action@v3
@@ -670,7 +671,7 @@ jobs:
neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
- name: Create multi-arch compute-tools image
if: matrix.version == 'v17'
if: matrix.version == 'v16'
run: |
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
@@ -688,7 +689,7 @@ jobs:
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
- name: Push multi-arch compute-tools image to ECR
if: matrix.version == 'v17'
if: matrix.version == 'v16'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
@@ -699,12 +700,15 @@ jobs:
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16, v17 ]
version: [ v14, v15, v16 ]
env:
VM_BUILDER_VERSION: v0.29.3
steps:
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Downloading vm-builder
run: |
@@ -744,7 +748,10 @@ jobs:
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
steps:
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
- uses: docker/login-action@v3
@@ -791,7 +798,7 @@ jobs:
runs-on: ubuntu-22.04
env:
VERSIONS: v14 v15 v16 v17
VERSIONS: v14 v15 v16
steps:
- uses: docker/login-action@v3
@@ -832,7 +839,7 @@ jobs:
done
done
docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
- name: Login to prod ECR
uses: docker/login-action@v3
@@ -845,7 +852,7 @@ jobs:
- name: Copy all images to prod ECR
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
run: |
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
done
@@ -857,7 +864,7 @@ jobs:
with:
client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -869,7 +876,7 @@ jobs:
with:
client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -950,7 +957,6 @@ jobs:
deploy:
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
runs-on: [ self-hosted, small ]
@@ -965,12 +971,15 @@ jobs:
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16 17; do
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Trigger deploy workflow
env:
@@ -1049,8 +1058,7 @@ jobs:
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
promote-compatibility-data:
needs: [ deploy ]
# `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: github.ref_name == 'release' && !failure() && !cancelled()
if: github.ref_name == 'release'
runs-on: ubuntu-22.04
steps:
@@ -1109,7 +1117,6 @@ jobs:
files_to_promote+=("s3://${BUCKET}/${s3_key}")
# TODO Add v17
for pg_version in v14 v15 v16; do
# We run less tests for debug builds, so we don't need to promote them
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then

View File

@@ -72,10 +72,6 @@ jobs:
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
- name: Set pg 17 revision for caching
id: pg_v17_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
@@ -97,13 +93,6 @@ jobs:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v17 build
id: cache_pg_17
uses: actions/cache@v4
with:
path: pg_install/v17
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Set extra env for macOS
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
@@ -131,10 +120,6 @@ jobs:
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: make postgres-v16 -j$(sysctl -n hw.ncpu)
- name: Build postgres v17
if: steps.cache_pg_17.outputs.cache-hit != 'true'
run: make postgres-v17 -j$(sysctl -n hw.ncpu)
- name: Build neon extensions
run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
@@ -181,7 +166,7 @@ jobs:
run: make walproposer-lib -j$(nproc)
- name: Produce the build stats
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc)
- name: Upload the build stats
id: upload-stats

View File

@@ -34,8 +34,8 @@ jobs:
build-tag: ${{ steps.build-tag.outputs.tag }}
steps:
# Need `fetch-depth: 0` to count the number of commits in the branch
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

.gitmodules (vendored, 4 lines changed)
View File

@@ -10,7 +10,3 @@
path = vendor/postgres-v16
url = https://github.com/neondatabase/postgres.git
branch = REL_16_STABLE_neon
[submodule "vendor/postgres-v17"]
path = vendor/postgres-v17
url = https://github.com/neondatabase/postgres.git
branch = REL_17_STABLE_neon

Cargo.lock (generated, 255 lines changed)
View File

@@ -1189,9 +1189,9 @@ dependencies = [
[[package]]
name = "comfy-table"
version = "7.1.1"
version = "6.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d"
dependencies = [
"crossterm",
"strum",
@@ -1209,6 +1209,7 @@ dependencies = [
"remote_storage",
"serde",
"serde_json",
"serde_with",
"utils",
]
@@ -1217,6 +1218,7 @@ name = "compute_tools"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"bytes",
"cfg-if",
"chrono",
@@ -1235,6 +1237,7 @@ dependencies = [
"reqwest 0.12.4",
"rlimit",
"rust-ini",
"serde",
"serde_json",
"signal-hook",
"tar",
@@ -1243,6 +1246,7 @@ dependencies = [
"tokio-postgres",
"tokio-stream",
"tokio-util",
"toml_edit 0.19.10",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
@@ -1313,9 +1317,12 @@ dependencies = [
name = "consumption_metrics"
version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"rand 0.8.5",
"serde",
"serde_with",
"utils",
]
[[package]]
@@ -1327,7 +1334,9 @@ dependencies = [
"clap",
"comfy-table",
"compute_api",
"futures",
"git-version",
"hex",
"humantime",
"humantime-serde",
"hyper 0.14.26",
@@ -1335,6 +1344,7 @@ dependencies = [
"once_cell",
"pageserver_api",
"pageserver_client",
"postgres",
"postgres_backend",
"postgres_connection",
"regex",
@@ -1343,13 +1353,15 @@ dependencies = [
"scopeguard",
"serde",
"serde_json",
"serde_with",
"storage_broker",
"tar",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-util",
"toml",
"toml_edit",
"toml 0.7.4",
"toml_edit 0.19.10",
"tracing",
"url",
"utils",
@@ -1473,22 +1485,25 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
[[package]]
name = "crossterm"
version = "0.27.0"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67"
dependencies = [
"bitflags 2.4.1",
"bitflags 1.3.2",
"crossterm_winapi",
"libc",
"mio",
"parking_lot 0.12.1",
"signal-hook",
"signal-hook-mio",
"winapi",
]
[[package]]
name = "crossterm_winapi"
version = "0.9.1"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b"
checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c"
dependencies = [
"winapi",
]
@@ -1651,6 +1666,7 @@ dependencies = [
"hex",
"parking_lot 0.12.1",
"rand 0.8.5",
"scopeguard",
"smallvec",
"tracing",
"utils",
@@ -2220,22 +2236,24 @@ checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
[[package]]
name = "git-version"
version = "0.3.9"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19"
checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899"
dependencies = [
"git-version-macro",
"proc-macro-hack",
]
[[package]]
name = "git-version-macro"
version = "0.3.9"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0"
checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f"
dependencies = [
"proc-macro-hack",
"proc-macro2",
"quote",
"syn 2.0.52",
"syn 1.0.109",
]
[[package]]
@@ -2729,6 +2747,19 @@ dependencies = [
"libc",
]
[[package]]
name = "inotify"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc"
dependencies = [
"bitflags 1.3.2",
"futures-core",
"inotify-sys",
"libc",
"tokio",
]
[[package]]
name = "inotify-sys"
version = "0.1.5"
@@ -3113,7 +3144,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff"
dependencies = [
"serde",
"toml",
"toml 0.8.14",
]
[[package]]
@@ -3223,7 +3254,7 @@ dependencies = [
"crossbeam-channel",
"filetime",
"fsevent-sys",
"inotify",
"inotify 0.9.6",
"kqueue",
"libc",
"log",
@@ -3614,6 +3645,7 @@ name = "pagectl"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"camino",
"clap",
"git-version",
@@ -3622,12 +3654,13 @@ dependencies = [
"pageserver_api",
"postgres_ffi",
"remote_storage",
"serde",
"serde_json",
"svg_fmt",
"thiserror",
"tokio",
"tokio-util",
"toml_edit",
"toml_edit 0.19.10",
"utils",
"workspace_hack",
]
@@ -3640,6 +3673,7 @@ dependencies = [
"arc-swap",
"async-compression",
"async-stream",
"async-trait",
"bit_field",
"byteorder",
"bytes",
@@ -3647,13 +3681,16 @@ dependencies = [
"camino-tempfile",
"chrono",
"clap",
"const_format",
"consumption_metrics",
"crc32c",
"criterion",
"crossbeam-utils",
"either",
"enum-map",
"enumset",
"fail",
"flate2",
"futures",
"git-version",
"hex",
@@ -3692,9 +3729,13 @@ dependencies = [
"serde_json",
"serde_path_to_error",
"serde_with",
"signal-hook",
"smallvec",
"storage_broker",
"strum",
"strum_macros",
"svg_fmt",
"sync_wrapper",
"sysinfo",
"tenant_size_model",
"thiserror",
@@ -3706,8 +3747,9 @@ dependencies = [
"tokio-stream",
"tokio-tar",
"tokio-util",
"toml_edit",
"toml_edit 0.19.10",
"tracing",
"twox-hash",
"url",
"utils",
"walkdir",
@@ -3771,22 +3813,44 @@ name = "pageserver_compaction"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"async-stream",
"byteorder",
"bytes",
"chrono",
"clap",
"const_format",
"consumption_metrics",
"criterion",
"crossbeam-utils",
"either",
"fail",
"flate2",
"futures",
"git-version",
"hex",
"hex-literal",
"humantime",
"humantime-serde",
"itertools 0.10.5",
"metrics",
"once_cell",
"pageserver_api",
"pin-project-lite",
"rand 0.8.5",
"smallvec",
"svg_fmt",
"sync_wrapper",
"thiserror",
"tokio",
"tokio-io-timeout",
"tokio-util",
"tracing",
"tracing-error",
"tracing-subscriber",
"url",
"utils",
"walkdir",
"workspace_hack",
]
@@ -3846,9 +3910,8 @@ dependencies = [
[[package]]
name = "parquet"
version = "53.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8"
version = "51.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
dependencies = [
"ahash",
"bytes",
@@ -3867,9 +3930,8 @@ dependencies = [
[[package]]
name = "parquet_derive"
version = "53.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86e9fcfae007533a06b580429a3f7e07cb833ec8aa37c041c16563e7918f057e"
version = "51.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
dependencies = [
"parquet",
"proc-macro2",
@@ -4059,7 +4121,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4072,7 +4134,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -4091,7 +4153,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4103,7 +4165,9 @@ name = "postgres_backend"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"bytes",
"futures",
"once_cell",
"pq_proto",
"rustls 0.22.4",
@@ -4136,13 +4200,16 @@ version = "0.1.0"
dependencies = [
"anyhow",
"bindgen",
"byteorder",
"bytes",
"crc32c",
"env_logger",
"hex",
"log",
"memoffset 0.8.0",
"once_cell",
"postgres",
"rand 0.8.5",
"regex",
"serde",
"thiserror",
@@ -4177,11 +4244,13 @@ dependencies = [
"byteorder",
"bytes",
"itertools 0.10.5",
"pin-project-lite",
"postgres-protocol",
"rand 0.8.5",
"serde",
"thiserror",
"tokio",
"tracing",
]
[[package]]
@@ -4213,6 +4282,12 @@ dependencies = [
"elliptic-curve 0.13.8",
]
[[package]]
name = "proc-macro-hack"
version = "0.5.20+deprecated"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
[[package]]
name = "proc-macro2"
version = "1.0.78"
@@ -4331,6 +4406,7 @@ dependencies = [
"aws-config",
"aws-sdk-iam",
"aws-sigv4",
"aws-types",
"base64 0.13.1",
"bstr",
"bytes",
@@ -4339,6 +4415,7 @@ dependencies = [
"chrono",
"clap",
"consumption_metrics",
"crossbeam-deque",
"dashmap",
"ecdsa 0.16.9",
"env_logger",
@@ -4364,9 +4441,11 @@ dependencies = [
"jose-jwa",
"jose-jwk",
"lasso",
"md5",
"measured",
"metrics",
"once_cell",
"opentelemetry",
"p256 0.13.2",
"parking_lot 0.12.1",
"parquet",
@@ -4387,6 +4466,7 @@ dependencies = [
"reqwest-middleware",
"reqwest-retry",
"reqwest-tracing",
"routerify",
"rsa",
"rstest",
"rustc-hash",
@@ -4402,6 +4482,7 @@ dependencies = [
"smol_str",
"socket2 0.5.5",
"subtle",
"task-local-extensions",
"thiserror",
"tikv-jemalloc-ctl",
"tikv-jemallocator",
@@ -4411,6 +4492,7 @@ dependencies = [
"tokio-rustls 0.25.0",
"tokio-tungstenite",
"tokio-util",
"tower-service",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
@@ -4700,6 +4782,7 @@ dependencies = [
"async-stream",
"async-trait",
"aws-config",
"aws-credential-types",
"aws-sdk-s3",
"aws-smithy-async",
"aws-smithy-types",
@@ -4713,6 +4796,7 @@ dependencies = [
"futures",
"futures-util",
"http-types",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"itertools 0.10.5",
@@ -4728,7 +4812,7 @@ dependencies = [
"tokio",
"tokio-stream",
"tokio-util",
"toml_edit",
"toml_edit 0.19.10",
"tracing",
"utils",
]
@@ -5192,12 +5276,14 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-stream",
"async-trait",
"byteorder",
"bytes",
"camino",
"camino-tempfile",
"chrono",
"clap",
"const_format",
"crc32c",
"desim",
"fail",
@@ -5223,7 +5309,9 @@ dependencies = [
"sd-notify",
"serde",
"serde_json",
"serde_with",
"sha2",
"signal-hook",
"storage_broker",
"strum",
"strum_macros",
@@ -5234,6 +5322,7 @@ dependencies = [
"tokio-stream",
"tokio-tar",
"tokio-util",
"toml_edit 0.19.10",
"tracing",
"tracing-subscriber",
"url",
@@ -5248,6 +5337,7 @@ version = "0.1.0"
dependencies = [
"const_format",
"serde",
"serde_with",
"utils",
]
@@ -5641,6 +5731,17 @@ dependencies = [
"signal-hook-registry",
]
[[package]]
name = "signal-hook-mio"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
dependencies = [
"libc",
"mio",
"signal-hook",
]
[[package]]
name = "signal-hook-registry"
version = "1.4.1"
@@ -5776,6 +5877,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-stream",
"bytes",
"clap",
"const_format",
"futures",
@@ -5789,6 +5891,7 @@ dependencies = [
"parking_lot 0.12.1",
"prost",
"tokio",
"tokio-stream",
"tonic",
"tonic-build",
"tracing",
@@ -5801,7 +5904,9 @@ name = "storage_controller"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"bytes",
"camino",
"chrono",
"clap",
"control_plane",
@@ -5842,9 +5947,20 @@ dependencies = [
name = "storage_controller_client"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"futures",
"pageserver_api",
"pageserver_client",
"postgres",
"reqwest 0.12.4",
"serde",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-stream",
"tokio-util",
"utils",
"workspace_hack",
]
@@ -5856,9 +5972,13 @@ dependencies = [
"async-stream",
"aws-config",
"aws-sdk-s3",
"aws-smithy-async",
"bincode",
"bytes",
"camino",
"chrono",
"clap",
"crc32c",
"either",
"futures",
"futures-util",
@@ -5870,16 +5990,20 @@ dependencies = [
"pageserver",
"pageserver_api",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"serde",
"serde_json",
"serde_with",
"storage_controller_client",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
"tracing",
@@ -5898,11 +6022,14 @@ dependencies = [
"comfy-table",
"futures",
"humantime",
"hyper 0.14.26",
"pageserver_api",
"pageserver_client",
"reqwest 0.12.4",
"serde",
"serde_json",
"storage_controller_client",
"thiserror",
"tokio",
"tracing",
"utils",
@@ -5927,21 +6054,21 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "strum"
version = "0.26.3"
version = "0.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f"
[[package]]
name = "strum_macros"
version = "0.26.4"
version = "0.24.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
dependencies = [
"heck 0.5.0",
"heck 0.4.1",
"proc-macro2",
"quote",
"rustversion",
"syn 2.0.52",
"syn 1.0.109",
]
[[package]]
@@ -6025,6 +6152,15 @@ dependencies = [
"xattr",
]
[[package]]
name = "task-local-extensions"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8"
dependencies = [
"pin-utils",
]
[[package]]
name = "tempfile"
version = "3.9.0"
@@ -6273,7 +6409,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"async-trait",
"byteorder",
@@ -6384,6 +6520,18 @@ dependencies = [
"tracing",
]
[[package]]
name = "toml"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec"
dependencies = [
"serde",
"serde_spanned",
"toml_datetime",
"toml_edit 0.19.10",
]
[[package]]
name = "toml"
version = "0.8.14"
@@ -6393,7 +6541,7 @@ dependencies = [
"serde",
"serde_spanned",
"toml_datetime",
"toml_edit",
"toml_edit 0.22.14",
]
[[package]]
@@ -6405,6 +6553,19 @@ dependencies = [
"serde",
]
[[package]]
name = "toml_edit"
version = "0.19.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
dependencies = [
"indexmap 1.9.3",
"serde",
"serde_spanned",
"toml_datetime",
"winnow 0.4.6",
]
[[package]]
name = "toml_edit"
version = "0.22.14"
@@ -6415,7 +6576,7 @@ dependencies = [
"serde",
"serde_spanned",
"toml_datetime",
"winnow",
"winnow 0.6.13",
]
[[package]]
@@ -6615,6 +6776,7 @@ dependencies = [
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry-semantic-conventions",
"reqwest 0.12.4",
"tokio",
"tracing",
"tracing-opentelemetry",
@@ -6818,6 +6980,7 @@ dependencies = [
"serde_assert",
"serde_json",
"serde_path_to_error",
"serde_with",
"signal-hook",
"strum",
"strum_macros",
@@ -6826,7 +6989,7 @@ dependencies = [
"tokio-stream",
"tokio-tar",
"tokio-util",
"toml_edit",
"toml_edit 0.19.10",
"tracing",
"tracing-error",
"tracing-subscriber",
@@ -6873,11 +7036,13 @@ dependencies = [
"cgroups-rs",
"clap",
"futures",
"inotify 0.10.2",
"serde",
"serde_json",
"sysinfo",
"tokio",
"tokio-postgres",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
@@ -6904,6 +7069,7 @@ dependencies = [
"clap",
"env_logger",
"log",
"once_cell",
"postgres",
"postgres_ffi",
"regex",
@@ -7369,6 +7535,15 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
[[package]]
name = "winnow"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699"
dependencies = [
"memchr",
]
[[package]]
name = "winnow"
version = "0.6.13"
@@ -7426,7 +7601,6 @@ dependencies = [
"digest",
"either",
"fail",
"futures",
"futures-channel",
"futures-executor",
"futures-io",
@@ -7477,13 +7651,10 @@ dependencies = [
"tokio",
"tokio-rustls 0.24.0",
"tokio-util",
"toml_edit",
"tonic",
"tower",
"tracing",
"tracing-core",
"tracing-log",
"tracing-subscriber",
"url",
"uuid",
"zeroize",

View File

@@ -73,7 +73,7 @@ camino = "1.1.6"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
comfy-table = "7.1"
comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-deque = "0.8.5"
@@ -123,8 +123,8 @@ opentelemetry = "0.20.0"
opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.12.0"
parking_lot = "0.12"
parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "51.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
procfs = "0.16"
@@ -158,8 +158,8 @@ signal-hook = "0.3"
smallvec = "1.11"
smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
strum = "0.26"
strum_macros = "0.26"
strum = "0.24"
strum_macros = "0.24"
"subtle" = "2.5.0"
svg_fmt = "0.4.3"
sync_wrapper = "0.1.2"
@@ -177,8 +177,8 @@ tokio-rustls = "0.25"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
toml = "0.8"
toml_edit = "0.22"
toml = "0.7"
toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tower-service = "0.3.2"
tracing = "0.1"
@@ -201,21 +201,10 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
# We want to use the 'neon' branch for these, but there's currently one
# incompatible change on the branch. See:
#
# - PR #8076 which contained changes that depended on the new changes in
# the rust-postgres crate, and
# - PR #8654 which reverted those changes and made the code in proxy incompatible
# with the tip of the 'neon' branch again.
#
# When those proxy changes are re-applied (see PR #8747), we can switch using
# the tip of the 'neon' branch again.
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
## Local libraries
compute_api = { version = "0.1", path = "./libs/compute_api/" }
@@ -252,7 +241,11 @@ tonic-build = "0.9"
[patch.crates-io]
# Needed to get `tokio-postgres-rustls` to depend on our fork.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
# bug fixes for UUID
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
################# Binary contents sections

View File

@@ -5,8 +5,6 @@
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG DEFAULT_PG_VERSION=17
ARG STABLE_PG_VERSION=16
# Build Postgres
FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
@@ -15,7 +13,6 @@ WORKDIR /home/nonroot
COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
COPY --chown=nonroot pgxn pgxn
COPY --chown=nonroot Makefile Makefile
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
@@ -31,19 +28,16 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build
WORKDIR /home/nonroot
ARG GIT_VERSION=local
ARG BUILD_TAG
ARG STABLE_PG_VERSION
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib
COPY --chown=nonroot . .
ARG ADDITIONAL_RUSTFLAGS
RUN set -e \
&& PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \
@@ -58,7 +52,6 @@ RUN set -e \
# Build final image
#
FROM debian:bullseye-slim
ARG DEFAULT_PG_VERSION
WORKDIR /data
RUN set -e \
@@ -84,7 +77,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubbe
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
@@ -101,7 +93,7 @@ RUN mkdir -p /data/.neon/ && \
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
# that want a particular postgres version will select it explicitly: this is just a default.
ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib
ENV LD_LIBRARY_PATH=/usr/local/v16/lib
VOLUME ["/data"]
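The comment above explains that binaries linking libpq get a default library path via LD_LIBRARY_PATH. A small, hedged illustration of the build-time vs run-time split (the binary name and flag are placeholders, not taken from this diff):

# Build time: cargo locates libpq through PQ_LIB_DIR (as in the `cargo build` stage above).
# Run time: the dynamic linker resolves libpq through LD_LIBRARY_PATH; a specific
# Postgres version can be selected per invocation, e.g.:
LD_LIBRARY_PATH=/usr/local/v14/lib /usr/local/bin/some-libpq-binary --help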

View File

@@ -55,27 +55,22 @@ RUN cd postgres && \
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
# so we do it here.
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
# the first loop is for pg_stat_statement extension version <= 1.6
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
filename=$(basename "$file"); \
# Note that there are no downgrade scripts for pg_stat_statements, so we \
# don't have to modify any downgrade paths or (much) older versions: we only \
# have to make sure every creation of the pg_stat_statements_reset function \
# also adds execute permissions to the neon_superuser.
case $filename in \
pg_stat_statements--1.4.sql) \
# pg_stat_statements_reset is first created with 1.4
if echo "$old_list" | grep -q -F "$filename"; then \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
;; \
pg_stat_statements--1.6--1.7.sql) \
# Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back
fi; \
done; \
# the second loop is for pg_stat_statement extension versions >= 1.7,
# where pg_stat_statement_reset() got 3 additional arguments
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
filename=$(basename "$file"); \
if ! echo "$old_list" | grep -q -F "$filename"; then \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
;; \
pg_stat_statements--1.10--1.11.sql) \
# Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \
;; \
esac; \
done;
fi; \
done
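Because this view drops the +/- markers, the old (case-based) and new (two-loop) variants of the grant logic are interleaved above and hard to read. A best-effort reconstruction of the two-loop variant described by the inline comments; this is a reading of one side of the diff, not authoritative Dockerfile content:

old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"
# First loop: extension versions <= 1.6, where pg_stat_statements_reset() takes no arguments.
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do
    filename=$(basename "$file")
    if echo "$old_list" | grep -q -F "$filename"; then
        echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> "$file"
    fi
done
# Second loop: versions >= 1.7, where pg_stat_statements_reset() gained three arguments.
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do
    filename=$(basename "$file")
    if ! echo "$old_list" | grep -q -F "$filename"; then
        echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> "$file"
    fi
done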
#########################################################################################
#
@@ -84,7 +79,6 @@ RUN cd postgres && \
#
#########################################################################################
FROM build-deps AS postgis-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
@@ -93,11 +87,7 @@ RUN apt update && \
protobuf-c-compiler xsltproc
# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
RUN case "${PG_VERSION}" in "v17") \
mkdir -p /sfcgal && \
echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \
esac && \
wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -106,10 +96,7 @@ RUN case "${PG_VERSION}" in "v17") \
ENV PATH="/usr/local/pgsql/bin:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \
esac && \
wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
@@ -135,10 +122,7 @@ RUN case "${PG_VERSION}" in "v17") \
cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir build && cd build && \
@@ -158,19 +142,12 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS plv8-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
apt update && \
RUN apt update && \
apt install -y ninja-build python3-dev libncurses5 binutils clang
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \
# generate and copy upgrade scripts
@@ -195,13 +172,9 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS h3-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
case "$(uname -m)" in \
RUN case "$(uname -m)" in \
"x86_64") \
export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
;; \
@@ -219,11 +192,7 @@ RUN case "${PG_VERSION}" in "v17") \
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
&& rm /tmp/cmake-install.sh
RUN case "${PG_VERSION}" in "v17") \
mkdir -p /h3/usr/ && \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \
mkdir build && cd build && \
@@ -233,10 +202,7 @@ RUN case "${PG_VERSION}" in "v17") \
cp -R /h3/usr / && \
rm -rf build
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
@@ -252,13 +218,9 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS unit-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -277,7 +239,6 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS vector-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/pgvector.patch /pgvector.patch
@@ -285,10 +246,7 @@ COPY patches/pgvector.patch /pgvector.patch
# By default, pgvector Makefile uses `-march=native`. We don't want that,
# because we build the images on different machines than where we run them.
# Pass OPTFLAGS="" to remove it.
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
patch -p1 < /pgvector.patch && \
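The hunk above is cut off before the actual build command, so the effect of the OPTFLAGS comment is not visible here. A hedged sketch of the make invocation implied by that comment (exact flags are an assumption, not copied from the Dockerfile):

# Assumption: build and install pgvector with -march=native disabled via an empty OPTFLAGS.
make -j "$(getconf _NPROCESSORS_ONLN)" OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config
make -j "$(getconf _NPROCESSORS_ONLN)" OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config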
@@ -303,14 +261,10 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS pgjwt-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -323,13 +277,9 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS hypopg-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -343,13 +293,9 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS pg-hashids-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -363,15 +309,11 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS rum-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/rum.patch /rum.patch
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
patch -p1 < /rum.patch && \
@@ -386,13 +328,9 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS pgtap-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -406,13 +344,9 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS ip4r-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -426,13 +360,9 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS prefix-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -446,13 +376,9 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS hll-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -466,13 +392,9 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS plpgsql-check-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -491,10 +413,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
ENV PATH="/usr/local/pgsql/bin:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
case "${PG_VERSION}" in \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export TIMESCALEDB_VERSION=2.10.1 \
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
@@ -527,10 +446,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
ENV PATH="/usr/local/pgsql/bin:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
case "${PG_VERSION}" in \
RUN case "${PG_VERSION}" in \
"v14") \
export PG_HINT_PLAN_VERSION=14_1_4_1 \
export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \
@@ -543,9 +459,6 @@ RUN case "${PG_VERSION}" in "v17") \
export PG_HINT_PLAN_VERSION=16_1_6_0 \
export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
;; \
"v17") \
echo "TODO: PG17 pg_hint_plan support" && exit 0 \
;; \
*) \
echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
;; \
@@ -565,14 +478,10 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS pg-cron-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -586,13 +495,9 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS rdkit-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
apt-get update && \
RUN apt-get update && \
apt-get install -y \
cmake \
libboost-iostreams1.74-dev \
@@ -602,10 +507,7 @@ RUN case "${PG_VERSION}" in "v17") \
libeigen3-dev
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
cmake \
@@ -642,14 +544,10 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS pg-uuidv7-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -663,14 +561,10 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS pg-roaringbitmap-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet by pg_roaringbitmap. Quit" && exit 0;; \
esac && \
wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -684,14 +578,10 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS pg-semver-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "v17 is not supported yet by pg_semver. Quit" && exit 0;; \
esac && \
wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -730,14 +620,10 @@ RUN case "${PG_VERSION}" in \
#
#########################################################################################
FROM build-deps AS pg-anon-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
esac && \
wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
@@ -755,7 +641,6 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS rust-extensions-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt-get update && \
@@ -766,11 +651,9 @@ ENV HOME=/home/nonroot
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
USER nonroot
WORKDIR /home/nonroot
ARG PG_VERSION
RUN case "${PG_VERSION}" in "v17") \
echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \
esac && \
curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
@@ -789,10 +672,7 @@ USER root
FROM rust-extensions-build AS pg-jsonschema-pg-build
ARG PG_VERSION
RUN case "${PG_VERSION}" in "v17") \
echo "pg_jsonschema does not yet have a release that supports pg17" && exit 0;; \
esac && \
wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
# see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
@@ -814,10 +694,7 @@ RUN case "${PG_VERSION}" in "v17") \
FROM rust-extensions-build AS pg-graphql-pg-build
ARG PG_VERSION
RUN case "${PG_VERSION}" in "v17") \
echo "pg_graphql does not yet have a release that supports pg17 as of now" && exit 0;; \
esac && \
wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -837,10 +714,7 @@ FROM rust-extensions-build AS pg-tiktoken-pg-build
ARG PG_VERSION
# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
RUN case "${PG_VERSION}" in "v17") \
echo "pg_tiktoken does not have versions, nor support for pg17" && exit 0;; \
esac && \
wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
# TODO update pgrx version in the pg_tiktoken repo and remove this line
@@ -859,10 +733,7 @@ RUN case "${PG_VERSION}" in "v17") \
FROM rust-extensions-build AS pg-pgx-ulid-build
ARG PG_VERSION
RUN case "${PG_VERSION}" in "v17") \
echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \
esac && \
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -877,14 +748,10 @@ RUN case "${PG_VERSION}" in "v17") \
#########################################################################################
FROM build-deps AS wal2json-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \
esac && \
wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -897,14 +764,10 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS pg-ivm-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \
esac && \
wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -918,14 +781,10 @@ RUN case "${PG_VERSION}" in "v17") \
#
#########################################################################################
FROM build-deps AS pg-partman-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "pg_partman doesn't support PG17 yet" && exit 0;; \
esac && \
wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -995,8 +854,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16" | "v17") \
echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \
"v16") \
echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
@@ -1040,7 +899,7 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
# Remove headers that we won't need anymore - we've completed installation of all extensions
RUN rm -r /usr/local/pgsql/include
@@ -1059,10 +918,7 @@ RUN rm /usr/local/pgsql/lib/lib*.a
FROM neon-pg-ext-build AS neon-pg-ext-test
ARG PG_VERSION
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
mkdir /ext-src
RUN mkdir /ext-src
#COPY --from=postgis-build /postgis.tar.gz /ext-src/
#COPY --from=postgis-build /sfcgal/* /usr
@@ -1100,39 +956,18 @@ COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
COPY patches/pg_anon.patch /ext-src
COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
cd /ext-src/ && for f in *.tar.gz; \
RUN cd /ext-src/ && for f in *.tar.gz; \
do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|| exit 1; rm -f $f; done
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
cd /ext-src/rum-src && patch -p1 <../rum.patch
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
# cmake is required for the h3 test
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
apt-get update && apt-get install -y cmake
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
RUN apt-get update && apt-get install -y cmake
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
patch -p1 </ext-src/pg_anon.patch
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
patch -p1 </ext-src/pg_cron.patch
RUN patch -p1 </ext-src/pg_anon.patch
RUN patch -p1 </ext-src/pg_cron.patch
ENV PATH=/usr/local/pgsql/bin:$PATH
ENV PGHOST=compute
ENV PGPORT=55433


@@ -119,8 +119,6 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
# I'm not sure why it wouldn't work, but this is the only place (apart from
# the "build-all-versions" entry points) where direct mention of PostgreSQL
# versions is used.
.PHONY: postgres-configure-v17
postgres-configure-v17: $(POSTGRES_INSTALL_DIR)/build/v17/config.status
.PHONY: postgres-configure-v16
postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
.PHONY: postgres-configure-v15
@@ -217,31 +215,29 @@ neon-pg-clean-ext-%:
# they depend on openssl and other libraries that are not included in our
# Rust build.
.PHONY: walproposer-lib
walproposer-lib: neon-pg-ext-v17
walproposer-lib: neon-pg-ext-v16
+@echo "Compiling walproposer-lib"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
ifeq ($(UNAME_S),Linux)
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
pg_strong_random.o
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
checksum_helper.o \
cryptohash_openssl.o \
pg_crc32c.o \
hmac_openssl.o \
cryptohash_openssl.o \
scram-common.o \
md5_common.o \
parse_manifest.o \
scram-common.o
ifeq ($(UNAME_S),Linux)
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
pg_crc32c.o
checksum_helper.o
endif
.PHONY: walproposer-lib-clean
walproposer-lib-clean:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
@@ -249,44 +245,38 @@ walproposer-lib-clean:
neon-pg-ext: \
neon-pg-ext-v14 \
neon-pg-ext-v15 \
neon-pg-ext-v16 \
neon-pg-ext-v17
neon-pg-ext-v16
.PHONY: neon-pg-clean-ext
neon-pg-clean-ext: \
neon-pg-clean-ext-v14 \
neon-pg-clean-ext-v15 \
neon-pg-clean-ext-v16 \
neon-pg-clean-ext-v17
neon-pg-clean-ext-v16
# shorthand to build all Postgres versions
.PHONY: postgres
postgres: \
postgres-v14 \
postgres-v15 \
postgres-v16 \
postgres-v17
postgres-v16
.PHONY: postgres-headers
postgres-headers: \
postgres-headers-v14 \
postgres-headers-v15 \
postgres-headers-v16 \
postgres-headers-v17
postgres-headers-v16
.PHONY: postgres-clean
postgres-clean: \
postgres-clean-v14 \
postgres-clean-v15 \
postgres-clean-v16 \
postgres-clean-v17
postgres-clean-v16
.PHONY: postgres-check
postgres-check: \
postgres-check-v14 \
postgres-check-v15 \
postgres-check-v16 \
postgres-check-v17
postgres-check-v16
# This doesn't remove the effects of 'configure'.
.PHONY: clean
@@ -331,13 +321,13 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
rm -f pg*.BAK
# Indent pxgn/neon.
.PHONY: neon-pgindent
neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
INDENT=$(POSTGRES_INSTALL_DIR)/build/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
-C $(POSTGRES_INSTALL_DIR)/build/neon-v17 \
.PHONY: pgindent
neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent


@@ -11,6 +11,7 @@ testing = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true
chrono.workspace = true
cfg-if.workspace = true
clap.workspace = true
@@ -23,6 +24,7 @@ num_cpus.workspace = true
opentelemetry.workspace = true
postgres.workspace = true
regex.workspace = true
serde.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
tar.workspace = true
@@ -41,6 +43,7 @@ url.workspace = true
compute_api.workspace = true
utils.workspace = true
workspace_hack.workspace = true
toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.13"


@@ -1052,19 +1052,26 @@ impl ComputeNode {
let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
let config_time = Utc::now();
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
let pgdata_path = Path::new(&self.pgdata);
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are applying config:
// creating new extensions, roles, etc...
config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
if pspec.spec.mode == ComputeMode::Primary {
if !pspec.spec.skip_pg_catalog_updates {
let pgdata_path = Path::new(&self.pgdata);
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are applying config:
// creating new extensions, roles, etc...
config::with_compute_ctl_tmp_override(
pgdata_path,
"neon.max_cluster_size=-1",
|| {
self.pg_reload_conf()?;
self.apply_config(&compute_state)?;
Ok(())
},
)?;
self.pg_reload_conf()?;
self.apply_config(&compute_state)?;
Ok(())
})?;
self.pg_reload_conf()?;
}
self.post_apply_config()?;
}
let startup_end_time = Utc::now();
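Both sides of this hunk rely on the same closure-based override pattern: write a temporary config override, run a closure (reload config, apply catalog changes), then restore. A minimal sketch of that shape, assuming a purely hypothetical override file name (the real helper is config::with_compute_ctl_tmp_override in compute_tools):

use std::path::Path;

// Hedged sketch of the pattern used by the call sites above.
fn with_tmp_override<F>(pgdata: &Path, setting: &str, f: F) -> anyhow::Result<()>
where
    F: FnOnce() -> anyhow::Result<()>,
{
    // Hypothetical override file name, for illustration only.
    let override_path = pgdata.join("compute_ctl_temp_override.conf");
    std::fs::write(&override_path, setting)?;
    let result = f();
    // Clear the override even if the closure failed.
    std::fs::write(&override_path, "")?;
    result
}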


@@ -124,7 +124,6 @@ fn parse_pg_version(human_version: &str) -> &str {
"14" => return "v14",
"15" => return "v15",
"16" => return "v16",
"17" => return "v17",
_ => {}
},
_ => {}


@@ -1 +0,0 @@
GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;


@@ -793,9 +793,6 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
include_str!(
"./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
),
include_str!(
"./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
),
];
MigrationRunner::new(client, &migrations).run_migrations()?;


@@ -9,10 +9,13 @@ anyhow.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
git-version.workspace = true
humantime.workspace = true
nix.workspace = true
once_cell.workspace = true
postgres.workspace = true
hex.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
regex.workspace = true
@@ -20,6 +23,8 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true
tar.workspace = true
thiserror.workspace = true
toml.workspace = true
toml_edit.workspace = true


@@ -151,7 +151,7 @@ where
print!(".");
io::stdout().flush().unwrap();
}
tokio::time::sleep(RETRY_INTERVAL).await;
thread::sleep(RETRY_INTERVAL);
}
Err(e) => {
println!("error starting process {process_name:?}: {e:#}");


@@ -34,14 +34,12 @@ use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
};
use std::borrow::Cow;
use std::collections::{BTreeSet, HashMap};
use std::path::PathBuf;
use std::process::exit;
use std::str::FromStr;
use std::time::Duration;
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use tokio::task::JoinSet;
use url::Host;
use utils::{
auth::{Claims, Scope},
@@ -89,35 +87,34 @@ fn main() -> Result<()> {
// Check for 'neon init' command first.
let subcommand_result = if sub_name == "init" {
handle_init(sub_args).map(|env| Some(Cow::Owned(env)))
handle_init(sub_args).map(Some)
} else {
// all other commands need an existing config
let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
let mut env =
LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
let original_env = env.clone();
let env = Box::leak(Box::new(env));
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let subcommand_result = match sub_name {
"tenant" => rt.block_on(handle_tenant(sub_args, env)),
"timeline" => rt.block_on(handle_timeline(sub_args, env)),
"start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))),
"stop" => rt.block_on(handle_stop_all(sub_args, env)),
"pageserver" => rt.block_on(handle_pageserver(sub_args, env)),
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)),
"storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)),
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)),
"endpoint" => rt.block_on(handle_endpoint(sub_args, env)),
"mappings" => handle_mappings(sub_args, env),
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
"mappings" => handle_mappings(sub_args, &mut env),
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
_ => bail!("unexpected subcommand {sub_name}"),
};
if &original_env != env {
subcommand_result.map(|()| Some(Cow::Borrowed(env)))
if original_env != env {
subcommand_result.map(|()| Some(env))
} else {
subcommand_result.map(|()| None)
}
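The Box::leak(Box::new(env)) on one side of this hunk is what gives the subcommand handlers a &'static LocalEnv to borrow from spawned tasks. In isolation, the idiom is just this (Config is a stand-in type):

struct Config {
    name: String,
}

// Leaking the allocation yields a &'static reference; the memory is only
// reclaimed at process exit, which is acceptable for a short-lived CLI.
fn leak_for_static(cfg: Config) -> &'static Config {
    Box::leak(Box::new(cfg))
}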
@@ -643,8 +640,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
}
Some(("branch", branch_match)) => {
let tenant_id = get_tenant_id(branch_match, env)?;
let new_timeline_id =
parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate());
let new_branch_name = branch_match
.get_one::<String>("branch-name")
.ok_or_else(|| anyhow!("No branch name provided"))?;
@@ -663,6 +658,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let new_timeline_id = TimelineId::generate();
let storage_controller = StorageController::from_env(env);
let create_req = TimelineCreateRequest {
new_timeline_id,
@@ -1248,122 +1244,49 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = match sub_match.subcommand() {
Some(broker_command_data) => broker_command_data,
None => bail!("no broker subcommand provided"),
};
match sub_name {
"start" => {
if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await {
eprintln!("broker start failed: {e}");
exit(1);
}
}
"stop" => {
if let Err(e) = broker::stop_broker_process(env) {
eprintln!("broker stop failed: {e}");
exit(1);
}
}
_ => bail!("Unexpected broker subcommand '{}'", sub_name),
}
Ok(())
}
async fn handle_start_all(
env: &'static local_env::LocalEnv,
env: &local_env::LocalEnv,
retry_timeout: &Duration,
) -> anyhow::Result<()> {
let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else {
neon_start_status_check(env, retry_timeout)
.await
.context("status check after successful startup of all services")?;
return Ok(());
};
eprintln!("startup failed because one or more services could not be started");
for e in errors {
eprintln!("{e}");
let debug_repr = format!("{e:?}");
for line in debug_repr.lines() {
eprintln!(" {line}");
}
}
try_stop_all(env, true).await;
exit(2);
}
/// Returns Ok() if and only if all services could be started successfully.
/// Otherwise, returns the list of errors that occurred during startup.
async fn handle_start_all_impl(
env: &'static local_env::LocalEnv,
retry_timeout: Duration,
) -> Result<(), Vec<anyhow::Error>> {
// Endpoints are not started automatically
let mut js = JoinSet::new();
broker::start_broker_process(env, retry_timeout).await?;
// force infallibility through closure
#[allow(clippy::redundant_closure_call)]
(|| {
js.spawn(async move {
let retry_timeout = retry_timeout;
broker::start_broker_process(env, &retry_timeout).await
});
// Only start the storage controller if the pageserver is configured to need it
if env.control_plane_api.is_some() {
js.spawn(async move {
let storage_controller = StorageController::from_env(env);
storage_controller
.start(NeonStorageControllerStartArgs::with_default_instance_id(
retry_timeout.into(),
))
.await
.map_err(|e| e.context("start storage_controller"))
});
}
for ps_conf in &env.pageservers {
js.spawn(async move {
let pageserver = PageServerNode::from_env(env, ps_conf);
pageserver
.start(&retry_timeout)
.await
.map_err(|e| e.context(format!("start pageserver {}", ps_conf.id)))
});
}
for node in env.safekeepers.iter() {
js.spawn(async move {
let safekeeper = SafekeeperNode::from_env(env, node);
safekeeper
.start(vec![], &retry_timeout)
.await
.map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
});
}
})();
let mut errors = Vec::new();
while let Some(result) = js.join_next().await {
let result = result.expect("we don't panic or cancel the tasks");
if let Err(e) = result {
errors.push(e);
// Only start the storage controller if the pageserver is configured to need it
if env.control_plane_api.is_some() {
let storage_controller = StorageController::from_env(env);
if let Err(e) = storage_controller
.start(NeonStorageControllerStartArgs::with_default_instance_id(
(*retry_timeout).into(),
))
.await
{
eprintln!("storage_controller start failed: {:#}", e);
try_stop_all(env, true).await;
exit(1);
}
}
if !errors.is_empty() {
return Err(errors);
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.start(retry_timeout).await {
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true).await;
exit(1);
}
}
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false).await;
exit(1);
}
}
neon_start_status_check(env, retry_timeout).await?;
Ok(())
}
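Stripped of the service-specific details, the JoinSet-based variant of this startup loop spawns every start task and collects all failures instead of exiting on the first one. A hedged sketch, with start_service standing in for the broker/storage controller/pageserver/safekeeper start routines above:

use tokio::task::JoinSet;

// Stand-in for the per-service start routines; illustrative only.
async fn start_service(name: &str) -> anyhow::Result<()> {
    println!("starting {name}");
    Ok(())
}

async fn start_all(names: Vec<String>) -> Result<(), Vec<anyhow::Error>> {
    let mut js = JoinSet::new();
    for name in names {
        js.spawn(async move {
            start_service(&name)
                .await
                .map_err(|e| e.context(format!("start {name}")))
        });
    }
    // Collect every failure instead of aborting on the first one.
    let mut errors = Vec::new();
    while let Some(result) = js.join_next().await {
        let result = result.expect("tasks neither panic nor get cancelled");
        if let Err(e) = result {
            errors.push(e);
        }
    }
    if errors.is_empty() {
        Ok(())
    } else {
        Err(errors)
    }
}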
@@ -1647,6 +1570,7 @@ fn cli() -> Command {
.value_parser(value_parser!(PathBuf))
.value_name("config")
)
.arg(pg_version_arg.clone())
.arg(force_arg)
)
.subcommand(
@@ -1659,7 +1583,6 @@ fn cli() -> Command {
.subcommand(Command::new("branch")
.about("Create a new timeline, using another timeline as a base, copying its data")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
@@ -1748,19 +1671,6 @@ fn cli() -> Command {
.arg(stop_mode_arg.clone())
.arg(instance_id))
)
.subcommand(
Command::new("storage_broker")
.arg_required_else_help(true)
.about("Manage broker")
.subcommand(Command::new("start")
.about("Start broker")
.arg(timeout_arg.clone())
)
.subcommand(Command::new("stop")
.about("Stop broker")
.arg(stop_mode_arg.clone())
)
)
.subcommand(
Command::new("safekeeper")
.arg_required_else_help(true)


@@ -702,7 +702,7 @@ impl Endpoint {
}
}
}
tokio::time::sleep(ATTEMPT_INTERVAL).await;
std::thread::sleep(ATTEMPT_INTERVAL);
}
// disarm the scopeguard, let the child outlive this function (and the neon_local invocation)


@@ -342,7 +342,7 @@ impl LocalEnv {
#[allow(clippy::manual_range_patterns)]
match pg_version {
14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}


@@ -17,7 +17,9 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
use pageserver_api::models::{
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
@@ -73,14 +75,14 @@ impl PageServerNode {
}
}
fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut {
toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap()
fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
}
fn pageserver_init_make_toml(
&self,
conf: NeonLocalInitPageserverConf,
) -> anyhow::Result<toml_edit::DocumentMut> {
) -> anyhow::Result<toml_edit::Document> {
assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
// TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
@@ -135,9 +137,9 @@ impl PageServerNode {
// Turn `overrides` into a toml document.
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
let mut config_toml = toml_edit::DocumentMut::new();
let mut config_toml = toml_edit::Document::new();
for fragment_str in overrides {
let fragment = toml_edit::DocumentMut::from_str(&fragment_str)
let fragment = toml_edit::Document::from_str(&fragment_str)
.expect("all fragments in `overrides` are valid toml documents, this function controls that");
for (key, item) in fragment.iter() {
config_toml.insert(key, item.clone());
@@ -322,6 +324,22 @@ impl PageServerNode {
background_process::stop_process(immediate, "pageserver", &self.pid_file())
}
pub async fn page_server_psql_client(
&self,
) -> anyhow::Result<(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
)> {
let mut config = self.pg_connection_config.clone();
if self.conf.pg_auth_type == AuthType::NeonJWT {
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
config = config.set_password(Some(token));
}
Ok(config.connect_no_tls().await?)
}
pub async fn check_status(&self) -> mgmt_api::Result<()> {
self.http_client.status().await
}
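The page_server_psql_client helper added above returns the usual tokio_postgres pair; the connection half has to be driven on its own task before the client is usable. A hedged usage sketch:

let (client, connection) = pageserver.page_server_psql_client().await?;
tokio::spawn(async move {
    if let Err(e) = connection.await {
        eprintln!("pageserver psql connection error: {e:#}");
    }
});
// `client` can now issue commands against the pageserver's libpq endpoint.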
@@ -522,6 +540,19 @@ impl PageServerNode {
Ok(())
}
pub async fn location_config(
&self,
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<Duration>,
lazy: bool,
) -> anyhow::Result<()> {
Ok(self
.http_client
.location_config(tenant_shard_id, config, flush_ms, lazy)
.await?)
}
pub async fn timeline_list(
&self,
tenant_shard_id: &TenantShardId,
@@ -605,4 +636,14 @@ impl PageServerNode {
Ok(())
}
pub async fn tenant_synthetic_size(
&self,
tenant_shard_id: TenantShardId,
) -> anyhow::Result<TenantHistorySize> {
Ok(self
.http_client
.tenant_synthetic_size(tenant_shard_id)
.await?)
}
}


@@ -4,10 +4,13 @@
/// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
/// enough to extract a few settings we need in Neon, assuming you don't do
/// funny stuff like include-directives or funny escaping.
use anyhow::{bail, Context, Result};
use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::HashMap;
use std::fmt;
use std::io::BufRead;
use std::str::FromStr;
/// In-memory representation of a postgresql.conf file
#[derive(Default, Debug)]
@@ -16,16 +19,84 @@ pub struct PostgresConf {
hash: HashMap<String, String>,
}
static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
impl PostgresConf {
pub fn new() -> PostgresConf {
PostgresConf::default()
}
/// Read file into memory
pub fn read(read: impl std::io::Read) -> Result<PostgresConf> {
let mut result = Self::new();
for line in std::io::BufReader::new(read).lines() {
let line = line?;
// Store each line in a vector, in original format
result.lines.push(line.clone());
// Also parse each line and insert key=value lines into a hash map.
//
// FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL.
// But it's close enough for our usage.
let line = line.trim();
if line.starts_with('#') {
// comment, ignore
continue;
} else if let Some(caps) = CONF_LINE_RE.captures(line) {
let name = caps.get(1).unwrap().as_str();
let raw_val = caps.get(2).unwrap().as_str();
if let Ok(val) = deescape_str(raw_val) {
// Note: if there's already an entry in the hash map for
// this key, this will replace it. That's the behavior
// we want; when PostgreSQL reads the file, each line
// overrides any previous value for the same setting.
result.hash.insert(name.to_string(), val.to_string());
}
}
}
Ok(result)
}
/// Return the current value of 'option'
pub fn get(&self, option: &str) -> Option<&str> {
self.hash.get(option).map(|x| x.as_ref())
}
/// Return the current value of a field, parsed to the right datatype.
///
/// This calls the FromStr::parse() function on the value of the field. If
/// the field does not exist, or parsing fails, returns an error.
///
pub fn parse_field<T>(&self, field_name: &str, context: &str) -> Result<T>
where
T: FromStr,
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
{
self.get(field_name)
.with_context(|| format!("could not find '{}' option {}", field_name, context))?
.parse::<T>()
.with_context(|| format!("could not parse '{}' option {}", field_name, context))
}
pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
where
T: FromStr,
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
{
if let Some(val) = self.get(field_name) {
let result = val
.parse::<T>()
.with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
Ok(Some(result))
} else {
Ok(None)
}
}
///
/// Note: if you call this multiple times for the same option, the config
/// file will have a line for each call. It would be nice to have a function
@@ -83,8 +154,48 @@ fn escape_str(s: &str) -> String {
}
}
/// De-escape a possibly-quoted value.
///
/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL
/// does this.
fn deescape_str(s: &str) -> Result<String> {
// If the string has a quote at the beginning and end, strip them out.
if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
let mut result = String::new();
let mut iter = s[1..(s.len() - 1)].chars().peekable();
while let Some(c) = iter.next() {
let newc = if c == '\\' {
match iter.next() {
Some('b') => '\x08',
Some('f') => '\x0c',
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('0'..='7') => {
// TODO
bail!("octal escapes not supported");
}
Some(n) => n,
None => break,
}
} else if c == '\'' && iter.peek() == Some(&'\'') {
// doubled quote becomes just one quote
iter.next().unwrap()
} else {
c
};
result.push(newc);
}
Ok(result)
} else {
Ok(s.to_string())
}
}
#[test]
fn test_postgresql_conf_escapes() -> anyhow::Result<()> {
fn test_postgresql_conf_escapes() -> Result<()> {
assert_eq!(escape_str("foo bar"), "'foo bar'");
// these don't need to be quoted
assert_eq!(escape_str("foo"), "foo");
@@ -103,5 +214,13 @@ fn test_postgresql_conf_escapes() -> anyhow::Result<()> {
assert_eq!(escape_str("fo\\o"), "'fo\\\\o'");
assert_eq!(escape_str("10 cats"), "'10 cats'");
// Test de-escaping
assert_eq!(deescape_str(&escape_str("foo"))?, "foo");
assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r");
assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t");
// octal-escapes are currently not supported
assert!(deescape_str("'foo\\7\\07\\007'").is_err());
Ok(())
}
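A hedged usage sketch of the new read/parse helpers; the option names are made up for illustration:

fn example() -> anyhow::Result<()> {
    let conf = PostgresConf::read(&b"port = 5432\nshared_buffers = '128MB'\n"[..])?;
    assert_eq!(conf.get("shared_buffers"), Some("128MB"));
    let port: u16 = conf.parse_field("port", "in postgresql.conf")?;
    assert_eq!(port, 5432);
    let missing: Option<u16> = conf.parse_field_optional("neon.not_set", "in postgresql.conf")?;
    assert!(missing.is_none());
    Ok(())
}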


@@ -28,7 +28,6 @@ use utils::{
auth::{encode_from_key_file, Claims, Scope},
id::{NodeId, TenantId},
};
use whoami::username;
pub struct StorageController {
env: LocalEnv,
@@ -184,7 +183,7 @@ impl StorageController {
/// to other versions if that one isn't found. Some automated tests create circumstances
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14];
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
for v in prefer_versions {
let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
@@ -212,16 +211,7 @@ impl StorageController {
/// Readiness check for our postgres process
async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
let bin_path = pg_bin_dir.join("pg_isready");
let args = [
"-h",
"localhost",
"-U",
&username(),
"-d",
DB_NAME,
"-p",
&format!("{}", postgres_port),
];
let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
Ok(exitcode.success())
@@ -235,11 +225,7 @@ impl StorageController {
///
/// Returns the database url
pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
let database_url = format!(
"postgresql://{}@localhost:{}/{DB_NAME}",
&username(),
postgres_port
);
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
let pg_bin_dir = self.get_pg_bin_dir().await?;
let createdb_path = pg_bin_dir.join("createdb");
@@ -249,10 +235,6 @@ impl StorageController {
"localhost",
"-p",
&format!("{}", postgres_port),
"-U",
&username(),
"-O",
&username(),
DB_NAME,
])
.output()
@@ -289,7 +271,7 @@ impl StorageController {
// But tokio-postgres fork doesn't have this upstream commit:
// https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
// => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
.user(&username())
.user(&whoami::username())
.dbname(DB_NAME)
.connect(tokio_postgres::NoTls)
.await
@@ -346,12 +328,6 @@ impl StorageController {
let pg_log_path = pg_data_path.join("postgres.log");
if !tokio::fs::try_exists(&pg_data_path).await? {
let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()];
tracing::info!(
"Initializing storage controller database with args: {:?}",
initdb_args
);
// Initialize empty database
let initdb_path = pg_bin_dir.join("initdb");
let mut child = Command::new(&initdb_path)
@@ -359,7 +335,7 @@ impl StorageController {
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
])
.args(initdb_args)
.args(["-D", pg_data_path.as_ref()])
.spawn()
.expect("Failed to spawn initdb");
let status = child.wait().await?;
@@ -388,14 +364,8 @@ impl StorageController {
pg_data_path.as_ref(),
"-l",
pg_log_path.as_ref(),
"-U",
&username(),
"start",
];
tracing::info!(
"Starting storage controller database with args: {:?}",
db_start_args
);
background_process::start_process(
"storage_controller_db",


@@ -11,11 +11,14 @@ clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
humantime.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
storage_controller_client.workspace = true
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true
utils.workspace = true


@@ -8,6 +8,7 @@ license.workspace = true
anyhow.workspace = true
chrono.workspace = true
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
regex.workspace = true


@@ -5,6 +5,9 @@ edition = "2021"
license = "Apache-2.0"
[dependencies]
anyhow.workspace = true
chrono = { workspace = true, features = ["serde"] }
rand.workspace = true
serde.workspace = true
serde_with.workspace = true
utils.workspace = true


@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
use rand::Rng;
use serde::{Deserialize, Serialize};
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[serde(tag = "type")]
pub enum EventType {
#[serde(rename = "absolute")]
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;
// Just a wrapper around a slice of events
// to serialize it as `{"events" : [ ] }
#[derive(serde::Serialize, Deserialize)]
#[derive(serde::Serialize, serde::Deserialize)]
pub struct EventChunk<'a, T: Clone> {
pub events: std::borrow::Cow<'a, [T]>,
}


@@ -12,4 +12,5 @@ bytes.workspace = true
utils.workspace = true
parking_lot.workspace = true
hex.workspace = true
scopeguard.workspace = true
smallvec = { workspace = true, features = ["write"] }


@@ -64,7 +64,6 @@ pub struct ConfigToml {
#[serde(with = "humantime_serde")]
pub wal_redo_timeout: Duration,
pub superuser: String,
pub initdb_cache_dir: Option<Utf8PathBuf>,
pub page_cache_size: usize,
pub max_file_descriptors: usize,
pub pg_distrib_dir: Option<Utf8PathBuf>,
@@ -174,6 +173,40 @@ impl Default for EvictionOrder {
}
}
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum GetVectoredImpl {
Sequential,
Vectored,
}
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum GetImpl {
Legacy,
Vectored,
}
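With the kebab-case strum attributes above, the config string forms round-trip straightforwardly; a small sketch:

use std::str::FromStr;

fn example() {
    assert_eq!(
        GetVectoredImpl::from_str("sequential").unwrap(),
        GetVectoredImpl::Sequential
    );
    assert_eq!(GetImpl::Vectored.to_string(), "vectored");
}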
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
@@ -305,6 +338,8 @@ pub mod defaults {
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
ImageCompressionAlgorithm::Zstd { level: Some(1) };
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
@@ -323,7 +358,6 @@ impl Default for ConfigToml {
wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
.expect("cannot parse default wal redo timeout")),
superuser: (DEFAULT_SUPERUSER.to_string()),
initdb_cache_dir: None,
page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
@@ -342,10 +376,7 @@ impl Default for ConfigToml {
concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
.expect("Invalid default constant")),
concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(
DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES,
)
.unwrap(),
concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
metric_collection_interval: (humantime::parse_duration(
DEFAULT_METRIC_COLLECTION_INTERVAL,
)
@@ -436,6 +467,8 @@ pub mod tenant_conf_defaults {
// By default ingest enough WAL for two new L0 layers before checking if new image
// image layers should be created.
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
}
impl Default for TenantConfigToml {


@@ -1,8 +1,8 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::Oid;
use postgres_ffi::RepOriginId;
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
use std::{fmt, ops::Range};
@@ -263,6 +263,15 @@ impl Key {
field5: u8::MAX,
field6: u32::MAX,
};
/// A key slightly smaller than [`Key::MAX`], for use in layer key ranges so they are not confused with L0 layers
pub const NON_L0_MAX: Key = Key {
field1: u8::MAX,
field2: u32::MAX,
field3: u32::MAX,
field4: u32::MAX,
field5: u8::MAX,
field6: u32::MAX - 1,
};
pub fn from_hex(s: &str) -> Result<Self> {
if s.len() != 36 {
@@ -350,17 +359,7 @@ impl Key {
// 02 00000000 00000000 00000000 00 00000000
//
// TwoPhaseFile:
//
// 02 00000000 00000000 00XXXXXX XX XXXXXXXX
//
// \______XID_________/
//
// The 64-bit XID is stored a little awkwardly in field6, field5 and
// field4. PostgreSQL v16 and below only stored a 32-bit XID, which
// fit completely in field6, but starting with PostgreSQL v17, a full
// 64-bit XID is used. Most pageserver code that accesses
// TwoPhaseFiles now deals with 64-bit XIDs even on v16, the high bits
// are just unused.
// 02 00000000 00000000 00000000 00 XID
//
// ControlFile:
// 03 00000000 00000000 00000000 00 00000000
@@ -592,36 +591,35 @@ pub const TWOPHASEDIR_KEY: Key = Key {
};
#[inline(always)]
pub fn twophase_file_key(xid: u64) -> Key {
pub fn twophase_file_key(xid: TransactionId) -> Key {
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32,
field5: ((xid & 0x000000FF00000000) >> 32) as u8,
field6: (xid & 0x00000000FFFFFFFF) as u32,
field4: 0,
field5: 0,
field6: xid,
}
}
#[inline(always)]
pub fn twophase_key_range(xid: u64) -> Range<Key> {
// 64-bit XIDs really should not overflow
pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
let (next_xid, overflowed) = xid.overflowing_add(1);
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32,
field5: ((xid & 0x000000FF00000000) >> 32) as u8,
field6: (xid & 0x00000000FFFFFFFF) as u32,
field4: 0,
field5: 0,
field6: xid,
}..Key {
field1: 0x02,
field2: 0,
field3: u32::from(overflowed),
field4: ((next_xid & 0xFFFFFF0000000000) >> 40) as u32,
field5: ((next_xid & 0x000000FF00000000) >> 32) as u8,
field6: (next_xid & 0x00000000FFFFFFFF) as u32,
field3: 0,
field4: 0,
field5: u8::from(overflowed),
field6: next_xid,
}
}
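For reference, the bit layout used by the u64 side of this hunk, as a standalone sketch: the top 24 bits of the XID land in field4, the next 8 in field5, and the low 32 in field6.

fn pack_xid(xid: u64) -> (u32, u8, u32) {
    (
        ((xid & 0xFFFF_FF00_0000_0000) >> 40) as u32, // field4: top 24 bits
        ((xid & 0x0000_00FF_0000_0000) >> 32) as u8,  // field5: next 8 bits
        (xid & 0x0000_0000_FFFF_FFFF) as u32,         // field6: low 32 bits
    )
}
// e.g. pack_xid(0x0123_4567_89AB_CDEF) == (0x01_2345, 0x67, 0x89AB_CDEF)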


@@ -62,7 +62,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
serde::Serialize,
serde::Deserialize,
strum_macros::Display,
strum_macros::VariantNames,
strum_macros::EnumVariantNames,
strum_macros::AsRefStr,
strum_macros::IntoStaticStr,
)]
@@ -495,7 +495,7 @@ pub struct CompactionAlgorithmSettings {
pub kind: CompactionAlgorithm,
}
#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
#[serde(rename_all = "snake_case")]


@@ -89,19 +89,8 @@ impl PageserverUtilization {
/// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
/// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
///
/// When a node is overloaded, we may override soft affinity preferences and do things like scheduling
/// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded.
pub fn is_overloaded(score: RawScore) -> bool {
// Why the factor of two? This is unscientific but reflects behavior of real systems:
// - In terms of shard counts, a node's preferred max count is a soft limit intended to keep
// startup and housekeeping jobs nice and responsive. We can go to double this limit if needed
// until some more nodes are deployed.
// - In terms of disk space, the node's utilization heuristic assumes every tenant needs to
// hold its biggest timeline fully on disk, which tends to be an overestimate when
// some tenants are very idle and have dropped layers from disk. In practice going up to
// double is generally better than giving up and scheduling in a sub-optimal AZ.
score >= 2 * Self::UTILIZATION_FULL
score >= Self::UTILIZATION_FULL
}
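// Worked instance of the factor-of-two threshold described above, on a
// hypothetical scale where UTILIZATION_FULL == 100: a score of 150 counts
// as full but not overloaded (150 < 2 * 100); 200 and above is overloaded.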
pub fn adjust_shard_count_max(&mut self, shard_count: u32) {


@@ -5,8 +5,10 @@ edition.workspace = true
license.workspace = true
[dependencies]
async-trait.workspace = true
anyhow.workspace = true
bytes.workspace = true
futures.workspace = true
rustls.workspace = true
serde.workspace = true
thiserror.workspace = true


@@ -81,16 +81,17 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
)
}
#[async_trait::async_trait]
pub trait Handler<IO> {
/// Handle single query.
/// postgres_backend will issue ReadyForQuery after calling this (this
/// might be not what we want after CopyData streaming, but currently we don't
/// care). It will also flush out the output buffer.
fn process_query(
async fn process_query(
&mut self,
pgb: &mut PostgresBackend<IO>,
query_string: &str,
) -> impl Future<Output = Result<(), QueryError>>;
) -> Result<(), QueryError>;
/// Called on startup packet receival, allows to process params.
///
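Under the async_trait form of the trait (the side with the explicit #[async_trait::async_trait] attribute), an implementation looks roughly like this, assuming the remaining trait methods keep their defaults; EchoHandler is illustrative only:

struct EchoHandler;

#[async_trait::async_trait]
impl<IO: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin + Send> Handler<IO> for EchoHandler {
    async fn process_query(
        &mut self,
        _pgb: &mut PostgresBackend<IO>,
        query_string: &str,
    ) -> Result<(), QueryError> {
        // postgres_backend issues ReadyForQuery after this returns.
        println!("got query: {query_string}");
        Ok(())
    }
}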
@@ -280,6 +281,16 @@ pub struct PostgresBackend<IO> {
pub type PostgresBackendTCP = PostgresBackend<tokio::net::TcpStream>;
pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
let mut query_string = query_string.to_vec();
if let Some(ch) = query_string.last() {
if *ch == 0 {
query_string.pop();
}
}
query_string
}
/// Cast a byte slice to a string slice, dropping null terminator if there's one.
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);


@@ -23,6 +23,7 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) {
struct TestHandler {}
#[async_trait::async_trait]
impl<IO: AsyncRead + AsyncWrite + Unpin + Send> Handler<IO> for TestHandler {
// return single col 'hey' for any query
async fn process_query(


@@ -5,10 +5,13 @@ edition.workspace = true
license.workspace = true
[dependencies]
rand.workspace = true
regex.workspace = true
bytes.workspace = true
byteorder.workspace = true
anyhow.workspace = true
crc32c.workspace = true
hex.workspace = true
once_cell.workspace = true
log.workspace = true
memoffset.workspace = true


@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
PathBuf::from("pg_install")
};
for pg_version in &["v14", "v15", "v16", "v17"] {
for pg_version in &["v14", "v15", "v16"] {
let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
if pg_install_dir_versioned.is_relative() {
let cwd = env::current_dir().context("Failed to get current_dir")?;
@@ -121,7 +121,6 @@ fn main() -> anyhow::Result<()> {
.allowlist_type("XLogPageHeaderData")
.allowlist_type("XLogLongPageHeaderData")
.allowlist_var("XLOG_PAGE_MAGIC")
.allowlist_var("PG_MAJORVERSION_NUM")
.allowlist_var("PG_CONTROL_FILE_SIZE")
.allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
.allowlist_type("PageHeaderData")

View File

@@ -44,9 +44,6 @@ macro_rules! postgres_ffi {
// Re-export some symbols from bindings
pub use bindings::DBState_DB_SHUTDOWNED;
pub use bindings::{CheckPoint, ControlFileData, XLogRecord};
pub const ZERO_CHECKPOINT: bytes::Bytes =
bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]);
}
};
}
@@ -57,7 +54,6 @@ macro_rules! for_all_postgres_versions {
$macro!(v14);
$macro!(v15);
$macro!(v16);
$macro!(v17);
};
}
@@ -92,7 +88,6 @@ macro_rules! dispatch_pgversion {
14 : v14,
15 : v15,
16 : v16,
17 : v17,
]
)
};
@@ -111,110 +106,6 @@ macro_rules! dispatch_pgversion {
};
}
#[macro_export]
macro_rules! enum_pgversion_dispatch {
($name:expr, $typ:ident, $bind:ident, $code:block) => {
enum_pgversion_dispatch!(
name = $name,
bind = $bind,
typ = $typ,
code = $code,
pgversions = [
V14 : v14,
V15 : v15,
V16 : v16,
V17 : v17,
]
)
};
(name = $name:expr,
bind = $bind:ident,
typ = $typ:ident,
code = $code:block,
pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => {
match $name {
$(
self::$typ::$variant($bind) => {
use $crate::$md as pgv;
$code
}
),+,
}
};
}
#[macro_export]
macro_rules! enum_pgversion {
{$name:ident, pgv :: $t:ident} => {
enum_pgversion!{
name = $name,
typ = $t,
pgversions = [
V14 : v14,
V15 : v15,
V16 : v16,
V17 : v17,
]
}
};
{$name:ident, pgv :: $p:ident :: $t:ident} => {
enum_pgversion!{
name = $name,
path = $p,
typ = $t,
pgversions = [
V14 : v14,
V15 : v15,
V16 : v16,
V17 : v17,
]
}
};
{name = $name:ident,
typ = $t:ident,
pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
pub enum $name {
$($variant ( $crate::$md::$t )),+
}
impl self::$name {
pub fn pg_version(&self) -> u32 {
enum_pgversion_dispatch!(self, $name, _ign, {
pgv::bindings::PG_MAJORVERSION_NUM
})
}
}
$(
impl Into<self::$name> for $crate::$md::$t {
fn into(self) -> self::$name {
self::$name::$variant (self)
}
}
)+
};
{name = $name:ident,
path = $p:ident,
typ = $t:ident,
pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
pub enum $name {
$($variant ($crate::$md::$p::$t)),+
}
impl $name {
pub fn pg_version(&self) -> u32 {
enum_pgversion_dispatch!(self, $name, _ign, {
pgv::bindings::PG_MAJORVERSION_NUM
})
}
}
$(
impl Into<$name> for $crate::$md::$p::$t {
fn into(self) -> $name {
$name::$variant (self)
}
}
)+
};
}
pub mod pg_constants;
pub mod relfile_utils;

View File

@@ -9,8 +9,8 @@
//! comments on them.
//!
use crate::PageHeaderData;
use crate::BLCKSZ;
use crate::{PageHeaderData, XLogRecord};
//
// From pg_tablespace_d.h
@@ -152,9 +152,6 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
// From heapam_xlog.h
pub const XLOG_HEAP2_REWRITE: u8 = 0x00;
// From replication/message.h
pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;
@@ -194,6 +191,8 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;
//
// from xlogrecord.h
//
@@ -217,21 +216,18 @@ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
/* From transam.h */
pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
pub const INVALID_TRANSACTION_ID: u32 = 0;
pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
/* pg_control.h */
pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
pub const XLOG_PARAMETER_CHANGE: u8 = 0x60;
pub const XLOG_END_OF_RECOVERY: u8 = 0x90;
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
pub const XLP_LONG_HEADER: u16 = 0x0002;
/* From xlog.h */
pub const XLOG_REPLORIGIN_SET: u8 = 0x00;
pub const XLOG_REPLORIGIN_DROP: u8 = 0x10;
/* xlog_internal.h */
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
pub const XLP_LONG_HEADER: u16 = 0x0002;
/* From replication/slot.h */
pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */
+ 64 /* NameData */ + 4*4;
@@ -249,6 +245,33 @@ pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
/* From origin.c */
pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
pub const PGDATA_SUBDIRS: [&str; 22] = [
"global",
"pg_wal/archive_status",
"pg_commit_ts",
"pg_dynshmem",
"pg_notify",
"pg_serial",
"pg_snapshots",
"pg_subtrans",
"pg_twophase",
"pg_multixact",
"pg_multixact/members",
"pg_multixact/offsets",
"base",
"base/1",
"pg_replslot",
"pg_tblspc",
"pg_stat",
"pg_stat_tmp",
"pg_xact",
"pg_logical",
"pg_logical/snapshots",
"pg_logical/mappings",
];
// Don't include postgresql.conf as it is inconvenient on node start:
// we need postgresql.conf before basebackup to synchronize safekeepers,
// so there is no point in overwriting it during backup restore. Rest of the files

View File

@@ -5,33 +5,6 @@ pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
pub const PGDATA_SUBDIRS: [&str; 22] = [
"global",
"pg_wal/archive_status",
"pg_commit_ts",
"pg_dynshmem",
"pg_notify",
"pg_serial",
"pg_snapshots",
"pg_subtrans",
"pg_twophase",
"pg_multixact",
"pg_multixact/members",
"pg_multixact/offsets",
"base",
"base/1",
"pg_replslot",
"pg_tblspc",
"pg_stat",
"pg_stat_tmp",
"pg_xact",
"pg_logical",
"pg_logical/snapshots",
"pg_logical/mappings",
];
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
(bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
}

View File

@@ -11,8 +11,6 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
pub use super::super::v14::bindings::PGDATA_SUBDIRS;
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;

View File

@@ -11,8 +11,6 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
pub use super::super::v14::bindings::PGDATA_SUBDIRS;
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;

View File

@@ -1,55 +0,0 @@
pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
pub const XLOG_DBASE_DROP: u8 = 0x20;
pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
pub const PGDATA_SUBDIRS: [&str; 23] = [
"global",
"pg_wal/archive_status",
"pg_wal/summaries",
"pg_commit_ts",
"pg_dynshmem",
"pg_notify",
"pg_serial",
"pg_snapshots",
"pg_subtrans",
"pg_twophase",
"pg_multixact",
"pg_multixact/members",
"pg_multixact/offsets",
"base",
"base/1",
"pg_replslot",
"pg_tblspc",
"pg_stat",
"pg_stat_tmp",
"pg_xact",
"pg_logical",
"pg_logical/snapshots",
"pg_logical/mappings",
];
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
(bimg_info & ANY_COMPRESS_FLAG) != 0
}
pub const XLOG_HEAP2_PRUNE_ON_ACCESS: u8 = 0x10;
pub const XLOG_HEAP2_PRUNE_VACUUM_SCAN: u8 = 0x20;
pub const XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: u8 = 0x30;
pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0;
pub const XLOG_CHECKPOINT_REDO: u8 = 0xE0;

View File

@@ -26,12 +26,11 @@ use bytes::{Buf, Bytes};
use log::*;
use serde::Serialize;
use std::ffi::OsStr;
use std::fs::File;
use std::io::prelude::*;
use std::io::ErrorKind;
use std::io::SeekFrom;
use std::path::Path;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use utils::bin_ser::DeserializeError;
use utils::bin_ser::SerializeError;
@@ -79,34 +78,19 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize
)
}
pub fn XLogFromFileName(
fname: &OsStr,
wal_seg_size: usize,
) -> anyhow::Result<(XLogSegNo, TimeLineID)> {
if let Some(fname_str) = fname.to_str() {
let tli = u32::from_str_radix(&fname_str[0..8], 16)?;
let log = u32::from_str_radix(&fname_str[8..16], 16)? as XLogSegNo;
let seg = u32::from_str_radix(&fname_str[16..24], 16)? as XLogSegNo;
Ok((log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli))
} else {
anyhow::bail!("non-ut8 filename: {:?}", fname);
}
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
(log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
}
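// Worked example (not in the diff): for the standard 16 MiB segment size,
// XLogSegmentsPerXLogId(16 MiB) == 0x1_0000_0000 / 16 MiB == 256, so the
// segment file name "000000010000000000000001" parses as
//   tli = 0x00000001 = 1, log = 0x00000000 = 0, seg = 0x00000001 = 1,
//   segno = 0 * 256 + 1 = 1.
//
// let (segno, tli) = XLogFromFileName("000000010000000000000001", 16 * 1024 * 1024);
// assert_eq!((segno, tli), (1, 1));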
pub fn IsXLogFileName(fname: &OsStr) -> bool {
if let Some(fname) = fname.to_str() {
fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit())
} else {
false
}
pub fn IsXLogFileName(fname: &str) -> bool {
return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
}
pub fn IsPartialXLogFileName(fname: &OsStr) -> bool {
if let Some(fname) = fname.to_str() {
fname.ends_with(".partial") && IsXLogFileName(OsStr::new(&fname[0..fname.len() - 8]))
} else {
false
}
pub fn IsPartialXLogFileName(fname: &str) -> bool {
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
}
/// If LSN points to the beginning of the page, then shift it to first record,
@@ -276,6 +260,13 @@ fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result<Option<File>> {
}
}
pub fn main() {
let mut data_dir = PathBuf::new();
data_dir.push(".");
let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap();
println!("wal_end={:?}", wal_end);
}
impl XLogRecord {
pub fn from_slice(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
use utils::bin_ser::LeSer;

View File

@@ -9,6 +9,7 @@ anyhow.workspace = true
clap.workspace = true
env_logger.workspace = true
log.workspace = true
once_cell.workspace = true
postgres.workspace = true
postgres_ffi.workspace = true
camino-tempfile.workspace = true

View File

@@ -7,7 +7,6 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
use postgres_ffi::{
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
};
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration, Instant};
@@ -27,6 +26,7 @@ macro_rules! xlog_utils_test {
postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Conf {
pub pg_version: u32,
pub pg_distrib_dir: PathBuf,
@@ -53,7 +53,7 @@ impl Conf {
#[allow(clippy::manual_range_patterns)]
match self.pg_version {
14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))),
14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))),
_ => bail!("Unsupported postgres version: {}", self.pg_version),
}
}
@@ -136,8 +136,8 @@ impl Conf {
pub fn pg_waldump(
&self,
first_segment_name: &OsStr,
last_segment_name: &OsStr,
first_segment_name: &str,
last_segment_name: &str,
) -> anyhow::Result<std::process::Output> {
let first_segment_file = self.datadir.join(first_segment_name);
let last_segment_file = self.datadir.join(last_segment_name);

View File

@@ -4,7 +4,6 @@ use super::*;
use crate::{error, info};
use regex::Regex;
use std::cmp::min;
use std::ffi::OsStr;
use std::fs::{self, File};
use std::io::Write;
use std::{env, str::FromStr};
@@ -55,7 +54,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
.wal_dir()
.read_dir()
.unwrap()
.map(|f| f.unwrap().file_name())
.map(|f| f.unwrap().file_name().into_string().unwrap())
.filter(|fname| IsXLogFileName(fname))
.max()
.unwrap();
@@ -71,11 +70,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
start_lsn
);
for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
let fname = file.file_name();
let fname = file.file_name().into_string().unwrap();
if !IsXLogFileName(&fname) {
continue;
}
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE).unwrap();
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
if seg_start_lsn > u64::from(*start_lsn) {
continue;
@@ -94,10 +93,10 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
}
}
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn {
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
// Get the actual end of WAL by pg_waldump
let waldump_output = cfg
.pg_waldump(OsStr::new("000000010000000000000001"), last_segment)
.pg_waldump("000000010000000000000001", last_segment)
.unwrap()
.stderr;
let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
@@ -118,7 +117,7 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn {
fn check_end_of_wal(
cfg: &crate::Conf,
last_segment: &OsStr,
last_segment: &str,
start_lsn: Lsn,
expected_end_of_wal: Lsn,
) {
@@ -133,8 +132,7 @@ fn check_end_of_wal(
// Rename file to partial to actually find last valid lsn, then rename it back.
fs::rename(
cfg.wal_dir().join(last_segment),
cfg.wal_dir()
.join(format!("{}.partial", last_segment.to_str().unwrap())),
cfg.wal_dir().join(format!("{}.partial", last_segment)),
)
.unwrap();
let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
@@ -144,8 +142,7 @@ fn check_end_of_wal(
);
assert_eq!(wal_end, expected_end_of_wal);
fs::rename(
cfg.wal_dir()
.join(format!("{}.partial", last_segment.to_str().unwrap())),
cfg.wal_dir().join(format!("{}.partial", last_segment)),
cfg.wal_dir().join(last_segment),
)
.unwrap();

View File

@@ -8,8 +8,10 @@ license.workspace = true
bytes.workspace = true
byteorder.workspace = true
itertools.workspace = true
pin-project-lite.workspace = true
postgres-protocol.workspace = true
rand.workspace = true
tokio = { workspace = true, features = ["io-util"] }
tracing.workspace = true
thiserror.workspace = true
serde.workspace = true

View File

@@ -13,11 +13,14 @@ aws-smithy-async.workspace = true
aws-smithy-types.workspace = true
aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
bytes.workspace = true
camino = { workspace = true, features = ["serde1"] }
humantime.workspace = true
humantime-serde.workspace = true
hyper = { workspace = true, features = ["stream"] }
futures.workspace = true
rand.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }

View File

@@ -185,7 +185,7 @@ mod tests {
use super::*;
fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
let toml = input.parse::<toml_edit::DocumentMut>().unwrap();
let toml = input.parse::<toml_edit::Document>().unwrap();
RemoteStorageConfig::from_toml(toml.as_item())
}

View File

@@ -127,6 +127,10 @@ impl RemotePath {
&self.0
}
pub fn extension(&self) -> Option<&str> {
self.0.extension()
}
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
self.0.strip_prefix(&p.0)
}

View File

@@ -6,5 +6,6 @@ license.workspace = true
[dependencies]
serde.workspace = true
serde_with.workspace = true
const_format.workspace = true
utils.workspace = true

View File

@@ -9,9 +9,8 @@ hyper.workspace = true
opentelemetry = { workspace = true, features=["rt-tokio"] }
opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions.workspace = true
reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tracing.workspace = true
tracing-opentelemetry.workspace = true
[dev-dependencies]
tracing-subscriber.workspace = true # For examples in docs
tracing-subscriber.workspace = true

View File

@@ -42,6 +42,7 @@ tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
rand.workspace = true
serde_with.workspace = true
strum.workspace = true
strum_macros.workspace = true
url.workspace = true

libs/utils/src/accum.rs Normal file
View File

@@ -0,0 +1,33 @@
/// A helper to "accumulate" a value similar to `Iterator::reduce`, but lets you
/// feed the accumulated values by calling the 'accum' function, instead of having an
/// iterator.
///
/// For example, to calculate the smallest value among some integers:
///
/// ```
/// use utils::accum::Accum;
///
/// let values = [1, 2, 3];
///
/// let mut min_value: Accum<u32> = Accum(None);
/// for new_value in &values {
/// min_value.accum(std::cmp::min, *new_value);
/// }
///
/// assert_eq!(min_value.0.unwrap(), 1);
/// ```
pub struct Accum<T>(pub Option<T>);
impl<T: Copy> Accum<T> {
pub fn accum<F>(&mut self, func: F, new_value: T)
where
F: FnOnce(T, T) -> T,
{
// If there is no previous value, just store the new value.
// Otherwise call the function to decide which one to keep.
self.0 = Some(if let Some(accum) = self.0 {
func(accum, new_value)
} else {
new_value
});
}
}

View File

@@ -82,7 +82,7 @@ impl ApiError {
StatusCode::INTERNAL_SERVER_ERROR,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
format!("{err:#}"), // use alternative formatting so that we give the cause without backtrace
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
),
}

View File

@@ -88,6 +88,12 @@ impl<'de> Deserialize<'de> for Id {
}
impl Id {
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
let mut arr = [0u8; 16];
buf.copy_to_slice(&mut arr);
Id::from(arr)
}
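// Illustrative sketch (not in the diff): reading a 16-byte Id out of a wire buffer.
// A `&[u8]` implements `bytes::Buf`, and the 16 bytes are consumed from it.
//
// let mut buf: &[u8] = &[0u8; 16];
// let id = Id::get_from_buf(&mut buf);
// assert!(buf.is_empty());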
pub fn from_slice(src: &[u8]) -> Result<Id, IdError> {
if src.len() != 16 {
return Err(IdError::SliceParseError(src.len()));
@@ -173,6 +179,10 @@ impl fmt::Debug for Id {
macro_rules! id_newtype {
($t:ident) => {
impl $t {
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t {
$t(Id::get_from_buf(buf))
}
pub fn from_slice(src: &[u8]) -> Result<$t, IdError> {
Ok($t(Id::from_slice(src)?))
}

View File

@@ -21,13 +21,7 @@
//!
//! Another explanation can be found here: <https://brandur.org/rate-limiting>
use std::{
sync::{
atomic::{AtomicU64, Ordering},
Mutex,
},
time::Duration,
};
use std::{sync::Mutex, time::Duration};
use tokio::{sync::Notify, time::Instant};
@@ -134,7 +128,6 @@ impl LeakyBucketState {
pub struct RateLimiter {
pub config: LeakyBucketConfig,
pub sleep_counter: AtomicU64,
pub state: Mutex<LeakyBucketState>,
/// a queue to provide this fair ordering.
pub queue: Notify,
@@ -151,7 +144,6 @@ impl Drop for Requeue<'_> {
impl RateLimiter {
pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
RateLimiter {
sleep_counter: AtomicU64::new(0),
state: Mutex::new(LeakyBucketState::with_initial_tokens(
&config,
initial_tokens,
@@ -171,16 +163,15 @@ impl RateLimiter {
/// returns true if we did throttle
pub async fn acquire(&self, count: usize) -> bool {
let start = tokio::time::Instant::now();
let mut throttled = false;
let start_count = self.sleep_counter.load(Ordering::Acquire);
let mut end_count = start_count;
let start = tokio::time::Instant::now();
// wait until we are the first in the queue
let mut notified = std::pin::pin!(self.queue.notified());
if !notified.as_mut().enable() {
throttled = true;
notified.await;
end_count = self.sleep_counter.load(Ordering::Acquire);
}
// notify the next waiter in the queue when we are done.
@@ -193,22 +184,9 @@ impl RateLimiter {
.unwrap()
.add_tokens(&self.config, start, count as f64);
match res {
Ok(()) => return end_count > start_count,
Ok(()) => return throttled,
Err(ready_at) => {
struct Increment<'a>(&'a AtomicU64);
impl Drop for Increment<'_> {
fn drop(&mut self) {
self.0.fetch_add(1, Ordering::AcqRel);
}
}
// increment the counter after we finish sleeping (or cancel this task).
// this ensures that tasks that have already started the acquire will observe
// the new sleep count when they are allowed to resume on the notify.
let _inc = Increment(&self.sleep_counter);
end_count += 1;
throttled = true;
tokio::time::sleep_until(ready_at).await;
}
}

View File

@@ -43,9 +43,16 @@ pub mod logging;
pub mod lock_file;
pub mod pid_file;
// Misc
pub mod accum;
pub mod shutdown;
// Utility for binding TcpListeners with proper socket options.
pub mod tcp_listener;
// Utility for putting a raw file descriptor into non-blocking mode
pub mod nonblock;
// Default signal handling
pub mod sentry_init;
pub mod signals;

View File

@@ -3,9 +3,11 @@ use std::str::FromStr;
use anyhow::Context;
use metrics::{IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use strum_macros::{EnumString, VariantNames};
use strum_macros::{EnumString, EnumVariantNames};
#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
#[derive(
EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy,
)]
#[strum(serialize_all = "snake_case")]
pub enum LogFormat {
Plain,

View File

@@ -1,5 +1,6 @@
#![warn(missing_docs)]
use camino::Utf8Path;
use serde::{de::Visitor, Deserialize, Serialize};
use std::fmt;
use std::ops::{Add, AddAssign};
@@ -144,6 +145,14 @@ impl Lsn {
i128::from(self.0) - i128::from(other)
}
/// Parse an LSN from a filename in the form `0000000000000000`
pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
where
F: AsRef<Utf8Path>,
{
Lsn::from_hex(filename.as_ref().as_str())
}
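// Illustrative sketch (not in the diff): parsing an LSN from a 16-hex-digit
// file name. "0000000001234567" is 0x0123_4567 as a hex-encoded u64.
//
// let lsn = Lsn::from_filename("0000000001234567").unwrap();
// assert_eq!(lsn, Lsn(0x0123_4567));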
/// Parse an LSN from a string in the form `0000000000000000`
pub fn from_hex<S>(s: S) -> Result<Self, LsnParseError>
where

View File

@@ -0,0 +1,17 @@
use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL};
use std::os::unix::io::RawFd;
/// Put a file descriptor into non-blocking mode
pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
let bits = fcntl(fd, F_GETFL)?;
// If F_GETFL returns some unknown bits, they should be valid
// for passing back to F_SETFL, too. If we left them out, the F_SETFL
// would effectively clear them, which is not what we want.
let mut flags = OFlag::from_bits_retain(bits);
flags |= OFlag::O_NONBLOCK;
fcntl(fd, F_SETFL(flags))?;
Ok(())
}
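// Illustrative sketch (not in the diff): putting a raw socket fd into
// non-blocking mode before handing it to an async reactor. `listener` is a
// hypothetical std::net::TcpListener.
//
// use std::os::unix::io::AsRawFd;
// set_nonblock(listener.as_raw_fd())?;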

View File

@@ -0,0 +1,7 @@
/// Immediately terminate the calling process without calling
/// atexit callbacks, C runtime destructors etc. We mainly use
/// this to protect coverage data from concurrent writes.
pub fn exit_now(code: u8) -> ! {
// SAFETY: exiting is safe, the ffi is not safe
unsafe { nix::libc::_exit(code as _) };
}

View File

@@ -10,7 +10,7 @@ pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
where
T: serde::de::DeserializeOwned,
{
let document: toml_edit::DocumentMut = match item {
let document: toml_edit::Document = match item {
toml_edit::Item::Table(toml) => toml.clone().into(),
toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
toml.clone().into_table().into()

View File

@@ -120,6 +120,32 @@ impl<K: Ord, V> VecMap<K, V> {
Ok((None, delta_size))
}
/// Split the map into two.
///
/// The left map contains everything before `cutoff` (exclusive).
/// The right map contains `cutoff` and everything after (inclusive).
pub fn split_at(&self, cutoff: &K) -> (Self, Self)
where
K: Clone,
V: Clone,
{
let split_idx = self
.data
.binary_search_by_key(&cutoff, extract_key)
.unwrap_or_else(std::convert::identity);
(
VecMap {
data: self.data[..split_idx].to_vec(),
ordering: self.ordering,
},
VecMap {
data: self.data[split_idx..].to_vec(),
ordering: self.ordering,
},
)
}
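// Worked example (not in the diff): with ascending keys [1, 3, 5] and cutoff 3,
// `split_at(&3)` yields a left map holding key 1 and a right map holding keys
// 3 and 5, matching the "exclusive left / inclusive right" contract above.
//
// let (left, right) = map.split_at(&3); // `map` is a hypothetical VecMap<i32, _>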
/// Move items from `other` to the end of `self`, leaving `other` empty.
/// If the `other` ordering is different from `self` ordering
/// `ExtendOrderingError` error will be returned.

View File

@@ -15,11 +15,13 @@ anyhow.workspace = true
axum.workspace = true
clap.workspace = true
futures.workspace = true
inotify.workspace = true
serde.workspace = true
serde_json.workspace = true
sysinfo.workspace = true
tokio = { workspace = true, features = ["rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true

View File

@@ -5,8 +5,6 @@ use std::{env, path::PathBuf, process::Command};
use anyhow::{anyhow, Context};
const WALPROPOSER_PG_VERSION: &str = "v17";
fn main() -> anyhow::Result<()> {
// Tell cargo to invalidate the built crate whenever the wrapper changes
println!("cargo:rerun-if-changed=bindgen_deps.h");
@@ -38,10 +36,7 @@ fn main() -> anyhow::Result<()> {
// Rebuild crate when libwalproposer.a changes
println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a");
let pg_config_bin = pg_install_abs
.join(WALPROPOSER_PG_VERSION)
.join("bin")
.join("pg_config");
let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
let inc_server_path: String = if pg_config_bin.exists() {
let output = Command::new(pg_config_bin)
.arg("--includedir-server")
@@ -58,7 +53,7 @@ fn main() -> anyhow::Result<()> {
.into()
} else {
let server_path = pg_install_abs
.join(WALPROPOSER_PG_VERSION)
.join("v16")
.join("include")
.join("postgresql")
.join("server")

View File

@@ -15,6 +15,7 @@ anyhow.workspace = true
arc-swap.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
bit_field.workspace = true
byteorder.workspace = true
bytes.workspace = true
@@ -22,9 +23,12 @@ camino.workspace = true
camino-tempfile.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
const_format.workspace = true
consumption_metrics.workspace = true
crc32c.workspace = true
crossbeam-utils.workspace = true
either.workspace = true
flate2.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
@@ -53,6 +57,10 @@ serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
serde_path_to_error.workspace = true
serde_with.workspace = true
signal-hook.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
sysinfo.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
@@ -65,6 +73,7 @@ tokio-stream.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = [ "serde" ] }
tracing.workspace = true
twox-hash.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true

View File

@@ -1,6 +1,6 @@
use std::pin::Pin;
use std::{pin::Pin, sync::Arc};
use futures::SinkExt;
use futures::{SinkExt, StreamExt};
use pageserver_api::{
models::{
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
@@ -10,7 +10,6 @@ use pageserver_api::{
};
use tokio::task::JoinHandle;
use tokio_postgres::CopyOutStream;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;
use utils::{
id::{TenantId, TimelineId},
@@ -136,18 +135,68 @@ impl PagestreamClient {
drop(copy_both);
}
pub async fn getpage(
&mut self,
req: PagestreamGetPageRequest,
) -> anyhow::Result<PagestreamGetPageResponse> {
pub fn split(self) -> (PagestreamTx, PagestreamRx) {
let Self {
copy_both,
cancel_on_client_drop,
conn_task,
} = self;
let keep_client_alive = KeepClientAlive {
client: conn_task,
cancel_on_client_drop: cancel_on_client_drop.unwrap(),
};
let keep_client_alive = Arc::new(keep_client_alive);
let (sink, stream): (
futures::stream::SplitSink<
Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
bytes::Bytes,
>,
futures::stream::SplitStream<Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>>,
) = copy_both.split();
(
PagestreamTx {
sink,
keep_client_alive: keep_client_alive.clone(),
},
PagestreamRx {
stream,
keep_client_alive,
},
)
}
}
struct KeepClientAlive {
client: JoinHandle<()>,
cancel_on_client_drop: tokio_util::sync::DropGuard,
}
pub struct PagestreamTx {
sink: futures::stream::SplitSink<
Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
bytes::Bytes,
>,
keep_client_alive: Arc<KeepClientAlive>,
}
pub struct PagestreamRx {
stream: futures::stream::SplitStream<Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>>,
keep_client_alive: Arc<KeepClientAlive>,
}
impl PagestreamTx {
pub async fn send_getpage(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
let req = PagestreamFeMessage::GetPage(req);
let req: bytes::Bytes = req.serialize();
// let mut req = tokio_util::io::ReaderStream::new(&req);
let mut req = tokio_stream::once(Ok(req));
let mut req = tokio_stream::once(Ok(req.clone()));
self.sink.send_all(&mut req).await?;
Ok(())
}
}
self.copy_both.send_all(&mut req).await?;
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
impl PagestreamRx {
pub async fn recv_getpage(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
let next: Option<Result<bytes::Bytes, _>> = self.stream.next().await;
let next: bytes::Bytes = next.unwrap()?;
let msg = PagestreamBeMessage::deserialize(next)?;

View File

@@ -9,19 +9,41 @@ default = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
const_format.workspace = true
consumption_metrics.workspace = true
crossbeam-utils.workspace = true
either.workspace = true
flate2.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
itertools.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pin-project-lite.workspace = true
rand.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-io-timeout.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -8,6 +8,7 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
bytes.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["string"] }
git-version.workspace = true
@@ -23,4 +24,5 @@ toml_edit.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -108,6 +108,7 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
enum LineKind {
GcCutoff,
Branch,
KeyVertical,
}
impl From<LineKind> for Fill {
@@ -115,6 +116,7 @@ impl From<LineKind> for Fill {
match value {
LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)),
LineKind::Branch => Fill::Color(rgb(0, 255, 0)),
LineKind::KeyVertical => Fill::Color(rgb(0, 0, 255)),
}
}
}
@@ -126,6 +128,7 @@ impl FromStr for LineKind {
Ok(match s {
"gc_cutoff" => LineKind::GcCutoff,
"branch" => LineKind::Branch,
"key" => LineKind::KeyVertical,
_ => anyhow::bail!("unsupported linekind: {s}"),
})
}
@@ -142,25 +145,31 @@ pub fn main() -> Result<()> {
let stdin = io::stdin();
let mut lines: Vec<(Lsn, LineKind)> = vec![];
let mut vertical_lines: Vec<(Key, LineKind)> = vec![];
for (lineno, line) in stdin.lock().lines().enumerate() {
let lineno = lineno + 1;
let line = line.unwrap();
if let Some((kind, lsn)) = line.split_once(':') {
let (kind, lsn) = LineKind::from_str(kind)
.context("parse kind")
.and_then(|kind| {
if lsn.contains('/') {
Lsn::from_str(lsn)
} else {
Lsn::from_hex(lsn)
if let Some((kind, what)) = line.split_once(':') {
(|| {
match LineKind::from_str(kind).context("parse kind")? {
kind @ LineKind::Branch | kind @ LineKind::GcCutoff => {
let lsn = if what.contains('/') {
Lsn::from_str(what)?
} else {
Lsn::from_hex(what)?
};
lines.push((lsn, kind));
}
.map(|lsn| (kind, lsn))
.context("parse lsn")
})
.with_context(|| format!("parse {line:?} on {lineno}"))?;
lines.push((lsn, kind));
kind @ LineKind::KeyVertical => {
let key = Key::from_hex(what).context("parse key")?;
vertical_lines.push((key, kind));
}
}
anyhow::Ok(())
})()
.with_context(|| format!("parse {line:?} on {lineno}"))?;
continue;
}
let line = PathBuf::from_str(&line).unwrap();
@@ -175,7 +184,7 @@ pub fn main() -> Result<()> {
}
// Collect all coordinates
let mut keys: Vec<Key> = Vec::with_capacity(files.len());
let mut keys: Vec<Key> = Vec::with_capacity(files.len() + vertical_lines.len());
let mut lsns: Vec<Lsn> = Vec::with_capacity(files.len() + lines.len());
for Layer {
@@ -192,6 +201,8 @@ pub fn main() -> Result<()> {
lsns.extend(lines.iter().map(|(lsn, _)| *lsn));
keys.extend(vertical_lines.iter().map(|(key, _)| *key));
// Analyze
let key_map = build_coordinate_compression_map(keys);
let lsn_map = build_coordinate_compression_map(lsns);
@@ -283,6 +294,25 @@ pub fn main() -> Result<()> {
);
}
for (key, kind) in vertical_lines {
let key = *key_map.get(&key).unwrap();
let stretch = 2.0;
let xmargin = 0.05;
let ymargin = 0.05;
let lsn_diff = 0.3;
let lsn_offset = -lsn_diff / 2.0;
println!(
"{}",
rectangle(
5.0 + key as f32,
0.0,
(key_map.len() + 10) as f32,
lsn_map.len() as f32,
)
.fill(kind)
);
}
println!("{}", EndSvg);
eprintln!("num_images: {}", num_images);

View File

@@ -79,24 +79,16 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
return None;
}
let keys: Vec<&str> = split[0].split('-').collect();
let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect();
let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect();
let the_lsns: [&str; 2];
/*
* Generations add a -vX-XXXXXX postfix, which causes issues when we try to
* parse 'vX' as an LSN.
*/
let is_delta = if lsns.len() == 1 || lsns[1].is_empty() {
the_lsns = [lsns[0], lsns[0]];
let mut lsns: Vec<&str> = split[1].split('-').collect();
let is_delta = if lsns.len() == 1 {
lsns.push(lsns[0]);
false
} else {
the_lsns = [lsns[0], lsns[1]];
true
};
let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap();
let lsn_range = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
let holes = Vec::new();
Some(LayerFile {
key_range,

View File

@@ -174,7 +174,7 @@ async fn main() -> anyhow::Result<()> {
println!("specified prefix '{}' failed validation", cmd.prefix);
return Ok(());
};
let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?;
let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
let toml_item = toml_document
.get("remote_storage")
.expect("need remote_storage");

View File

@@ -13,7 +13,7 @@ use rand::prelude::*;
use tokio::task::JoinSet;
use tracing::info;
use std::collections::HashSet;
use std::collections::{HashSet, VecDeque};
use std::future::Future;
use std::num::NonZeroUsize;
use std::pin::Pin;
@@ -295,64 +295,58 @@ async fn main_impl(
.await
.unwrap();
let (mut pagestream_tx, mut pagestream_rx) = client.split();
start_work_barrier.wait().await;
let client_start = Instant::now();
let mut ticks_processed = 0;
while !cancel.is_cancelled() {
// Detect if a request took longer than the RPS rate
if let Some(period) = &rps_period {
let periods_passed_until_now =
usize::try_from(client_start.elapsed().as_micros() / period.as_micros())
let (rq_tx, mut rq_rx) = tokio::sync::mpsc::channel(4096);
let sender = tokio::spawn(async move {
while !cancel.is_cancelled() {
let start = Instant::now();
let req = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(key.is_rel_block_key());
let (rel_tag, block_no) = key
.to_rel_block()
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
}
};
pagestream_tx.send_getpage(req).await.unwrap();
rq_tx.send(start).await.unwrap();
}
});
let receiver = tokio::spawn(async move {
while let Some(start) = rq_rx.recv().await {
let response = pagestream_rx.recv_getpage().await.unwrap();
let end = Instant::now();
live_stats.request_done();
STATS.with(|stats| {
stats
.borrow()
.lock()
.unwrap()
.observe(end.duration_since(start))
.unwrap();
if periods_passed_until_now > ticks_processed {
live_stats.missed((periods_passed_until_now - ticks_processed) as u64);
}
ticks_processed = periods_passed_until_now;
});
}
});
let start = Instant::now();
let req = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(key.is_rel_block_key());
let (rel_tag, block_no) = key
.to_rel_block()
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
}
};
client.getpage(req).await.unwrap();
let end = Instant::now();
live_stats.request_done();
ticks_processed += 1;
STATS.with(|stats| {
stats
.borrow()
.lock()
.unwrap()
.observe(end.duration_since(start))
.unwrap();
});
if let Some(period) = &rps_period {
let next_at = client_start
+ Duration::from_micros(
(ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
);
tokio::time::sleep_until(next_at.into()).await;
}
}
sender.await.unwrap();
receiver.await.unwrap();
})
};

View File

@@ -30,8 +30,9 @@ use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::dispatch_pgversion;
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA};
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
use postgres_ffi::TransactionId;
use postgres_ffi::XLogFileName;
use postgres_ffi::PG_TLI;
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
@@ -254,11 +255,8 @@ where
let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
let pgversion = self.timeline.pg_version;
let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]);
// Create pgdata subdirs structure
for dir in subdirs.iter() {
for dir in PGDATA_SUBDIRS.iter() {
let header = new_tar_header_dir(dir)?;
self.ar
.append(&header, &mut io::empty())
@@ -608,7 +606,7 @@ where
//
// Extract twophase state files
//
async fn add_twophase_file(&mut self, xid: u64) -> Result<(), BasebackupError> {
async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
let img = self
.timeline
.get_twophase_file(xid, self.lsn, self.ctx)
@@ -619,11 +617,7 @@ where
buf.extend_from_slice(&img[..]);
let crc = crc32c::crc32c(&img[..]);
buf.put_u32_le(crc);
let path = if self.timeline.pg_version < 17 {
format!("pg_twophase/{:>08X}", xid)
} else {
format!("pg_twophase/{:>016X}", xid)
};
let path = format!("pg_twophase/{:>08X}", xid);
let header = new_tar_header(&path, buf.len() as u64)?;
self.ar
.append(&header, &buf[..])

View File

@@ -13,6 +13,7 @@ use pageserver_api::{
use remote_storage::{RemotePath, RemoteStorageConfig};
use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
use utils::logging::SecretString;
use once_cell::sync::OnceCell;
@@ -32,7 +33,7 @@ use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use crate::virtual_file;
use crate::virtual_file::io_engine;
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME};
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
/// Global state of pageserver.
///
@@ -70,8 +71,6 @@ pub struct PageServerConf {
pub superuser: String,
pub initdb_cache_dir: Option<Utf8PathBuf>,
pub page_cache_size: usize,
pub max_file_descriptors: usize,
@@ -258,6 +257,17 @@ impl PageServerConf {
.join(timeline_id.to_string())
}
pub(crate) fn timeline_delete_mark_file_path(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_shard_id, &timeline_id),
TIMELINE_DELETE_MARK_SUFFIX,
)
}
/// Turns storage remote path of a file into its local path.
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
remote_path.with_base(&self.workdir)
@@ -271,7 +281,7 @@ impl PageServerConf {
#[allow(clippy::manual_range_patterns)]
match pg_version {
14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
@@ -299,7 +309,6 @@ impl PageServerConf {
wait_lsn_timeout,
wal_redo_timeout,
superuser,
initdb_cache_dir,
page_cache_size,
max_file_descriptors,
pg_distrib_dir,
@@ -347,7 +356,6 @@ impl PageServerConf {
wait_lsn_timeout,
wal_redo_timeout,
superuser,
initdb_cache_dir,
page_cache_size,
max_file_descriptors,
http_auth_type,
@@ -483,6 +491,11 @@ pub struct ConfigurableSemaphore {
}
impl ConfigurableSemaphore {
pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
Some(x) => x,
None => panic!("const unwrap is not yet stable"),
};
/// Initialize using a non-zero amount of permits.
///
/// Require a non-zero number of initial permits, because using permits == 0 is a crude way to disable a
@@ -503,6 +516,12 @@ impl ConfigurableSemaphore {
}
}
impl Default for ConfigurableSemaphore {
fn default() -> Self {
Self::new(Self::DEFAULT_INITIAL)
}
}
impl PartialEq for ConfigurableSemaphore {
fn eq(&self, other: &Self) -> bool {
// the number of permits can be increased at runtime, so we cannot really fulfill the

View File

@@ -178,7 +178,7 @@ async fn collect_metrics(
)
.await;
if let Err(e) = res {
tracing::error!("failed to upload to remote storage: {e:#}");
tracing::error!("failed to upload to S3: {e:#}");
}
}
};

View File

@@ -580,11 +580,9 @@ async fn import_file(
import_slru(modification, slru, file_path, reader, len, ctx).await?;
debug!("imported multixact members slru");
} else if file_path.starts_with("pg_twophase") {
let bytes = read_all_bytes(reader).await?;
let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
// In PostgreSQL v17, this is a 64-bit FullTransactionId. In previous versions,
// it's a 32-bit TransactionId, which fits in u64 anyway.
let xid = u64::from_str_radix(file_name.as_ref(), 16)?;
let bytes = read_all_bytes(reader).await?;
modification
.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx)
.await?;

View File

@@ -9,7 +9,7 @@ use metrics::{
use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
use strum::{EnumCount, VariantNames};
use strum_macros::{IntoStaticStr, VariantNames};
use strum_macros::{EnumVariantNames, IntoStaticStr};
use tracing::warn;
use utils::id::TimelineId;
@@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
];
// Metrics collected on operations on the storage repository.
#[derive(Debug, VariantNames, IntoStaticStr)]
#[derive(Debug, EnumVariantNames, IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum StorageTimeOperation {
#[strum(serialize = "layer flush")]
@@ -1177,14 +1177,15 @@ pub(crate) mod virtual_file_io_engine {
}
struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
global_latency_histo: &'a Histogram,
global_metric: &'a Histogram,
// Optional because not all op types are tracked per-timeline
per_timeline_latency_histo: Option<&'a Histogram>,
timeline_metric: Option<&'a Histogram>,
ctx: &'c RequestContext,
start: std::time::Instant,
op: SmgrQueryType,
count: usize,
}
impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
@@ -1212,10 +1213,11 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
elapsed
}
};
self.global_latency_histo
.observe(ex_throttled.as_secs_f64());
if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
for _ in 0..self.count {
self.global_metric.observe(ex_throttled.as_secs_f64());
if let Some(timeline_metric) = self.timeline_metric {
timeline_metric.observe(ex_throttled.as_secs_f64());
}
}
}
}
@@ -1241,32 +1243,10 @@ pub enum SmgrQueryType {
#[derive(Debug)]
pub(crate) struct SmgrQueryTimePerTimeline {
global_started: [IntCounter; SmgrQueryType::COUNT],
global_latency: [Histogram; SmgrQueryType::COUNT],
per_timeline_getpage_started: IntCounter,
per_timeline_getpage_latency: Histogram,
global_metrics: [Histogram; SmgrQueryType::COUNT],
per_timeline_getpage: Histogram,
}
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
// it's a counter, but the name is prepared so it can be extended to a histogram of queue depth
"pageserver_smgr_query_started_global_count",
"Number of smgr queries started, aggregated by query type.",
&["smgr_query_type"],
)
.expect("failed to define a metric")
});
static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
// it's a counter, but the name is prepared so it can be extended to a histogram of queue depth
"pageserver_smgr_query_started_count",
"Number of smgr queries started, aggregated by query type and tenant/timeline.",
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds",
@@ -1342,20 +1322,14 @@ impl SmgrQueryTimePerTimeline {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
let global_started = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
SMGR_QUERY_STARTED_GLOBAL
.get_metric_with_label_values(&[op.into()])
.unwrap()
});
let global_latency = std::array::from_fn(|i| {
let global_metrics = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
SMGR_QUERY_TIME_GLOBAL
.get_metric_with_label_values(&[op.into()])
.unwrap()
});
let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
&tenant_id,
@@ -1363,20 +1337,9 @@ impl SmgrQueryTimePerTimeline {
&timeline_id,
])
.unwrap();
let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
&tenant_id,
&shard_slug,
&timeline_id,
])
.unwrap();
Self {
global_started,
global_latency,
per_timeline_getpage_latency,
per_timeline_getpage_started,
global_metrics,
per_timeline_getpage,
}
}
pub(crate) fn start_timer<'c: 'a, 'a>(
@@ -1384,11 +1347,16 @@ impl SmgrQueryTimePerTimeline {
op: SmgrQueryType,
ctx: &'c RequestContext,
) -> Option<impl Drop + '_> {
self.start_timer_many(op, 1, ctx)
}
pub(crate) fn start_timer_many<'c: 'a, 'a>(
&'a self,
op: SmgrQueryType,
count: usize,
ctx: &'c RequestContext,
) -> Option<impl Drop + '_> {
let global_metric = &self.global_metrics[op as usize];
let start = Instant::now();
self.global_started[op as usize].inc();
// We subtract time spent throttled from the observed latency.
match ctx.micros_spent_throttled.open() {
Ok(()) => (),
Err(error) => {
@@ -1407,19 +1375,19 @@ impl SmgrQueryTimePerTimeline {
}
}
let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
self.per_timeline_getpage_started.inc();
Some(&self.per_timeline_getpage_latency)
let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
Some(&self.per_timeline_getpage)
} else {
None
};
Some(GlobalAndPerTimelineHistogramTimer {
global_latency_histo: &self.global_latency[op as usize],
per_timeline_latency_histo,
global_metric,
timeline_metric,
ctx,
start,
op,
count,
})
}
}
@@ -1467,12 +1435,9 @@ mod smgr_query_time_tests {
let get_counts = || {
let global: u64 = ops
.iter()
.map(|op| metrics.global_latency[*op as usize].get_sample_count())
.map(|op| metrics.global_metrics[*op as usize].get_sample_count())
.sum();
(
global,
metrics.per_timeline_getpage_latency.get_sample_count(),
)
(global, metrics.per_timeline_getpage.get_sample_count())
};
let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -1824,7 +1789,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
.expect("failed to define a metric"),
upload_heatmap_duration: register_histogram!(
"pageserver_secondary_upload_heatmap_duration",
"Time to build and upload a heatmap, including any waiting inside the remote storage client"
"Time to build and upload a heatmap, including any waiting inside the S3 client"
)
.expect("failed to define a metric"),
download_heatmap: register_int_counter!(
@@ -2623,12 +2588,6 @@ impl TimelineMetrics {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
}
let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
tenant_id,
shard_id,
timeline_id,
]);
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
tenant_id,
@@ -2645,8 +2604,6 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
}
tenant_throttling::remove_tenant_metrics(tenant_shard_id);
// we leave the BROKEN_TENANTS_SET entry if any
}
@@ -3110,180 +3067,41 @@ pub mod tokio_epoll_uring {
pub(crate) mod tenant_throttling {
use metrics::{register_int_counter_vec, IntCounter};
use once_cell::sync::Lazy;
use utils::shard::TenantShardId;
use crate::tenant::{self, throttle::Metric};
struct GlobalAndPerTenantIntCounter {
global: IntCounter,
per_tenant: IntCounter,
}
impl GlobalAndPerTenantIntCounter {
#[inline(always)]
pub(crate) fn inc(&self) {
self.inc_by(1)
}
#[inline(always)]
pub(crate) fn inc_by(&self, n: u64) {
self.global.inc_by(n);
self.per_tenant.inc_by(n);
}
}
pub(crate) struct TimelineGet {
count_accounted_start: GlobalAndPerTenantIntCounter,
count_accounted_finish: GlobalAndPerTenantIntCounter,
wait_time: GlobalAndPerTenantIntCounter,
count_throttled: GlobalAndPerTenantIntCounter,
wait_time: IntCounter,
count: IntCounter,
}
static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_start_global",
"Count of tenant throttling starts, by kind of throttle.",
&["kind"]
)
.unwrap()
});
static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_start",
"Count of tenant throttling starts, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_finish_global",
"Count of tenant throttling finishes, by kind of throttle.",
&["kind"]
)
.unwrap()
});
static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_finish",
"Count of tenant throttling finishes, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_wait_usecs_sum_global",
"Sum of microseconds that spent waiting throttle by kind of throttle.",
"Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
&["kind"]
)
.unwrap()
});
static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_wait_usecs_sum",
"Sum of microseconds that spent waiting throttle by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
.unwrap()
});
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_global",
"Count of tenant throttlings, by kind of throttle.",
&["kind"]
)
.unwrap()
});
let kind = "timeline_get";
TimelineGet {
wait_time: WAIT_USECS.with_label_values(&[kind]),
count: WAIT_COUNT.with_label_values(&[kind]),
}
});
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_global",
"Count of tenant throttlings, by kind of throttle.",
&["kind"]
)
.unwrap()
});
static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count",
"Count of tenant throttlings, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
const KIND: &str = "timeline_get";
impl TimelineGet {
pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
TimelineGet {
count_accounted_start: {
GlobalAndPerTenantIntCounter {
global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
count_accounted_finish: {
GlobalAndPerTenantIntCounter {
global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
wait_time: {
GlobalAndPerTenantIntCounter {
global: WAIT_USECS.with_label_values(&[KIND]),
per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
count_throttled: {
GlobalAndPerTenantIntCounter {
global: WAIT_COUNT.with_label_values(&[KIND]),
per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
}
}
}
pub(crate) fn preinitialize_global_metrics() {
Lazy::force(&COUNT_ACCOUNTED_START);
Lazy::force(&COUNT_ACCOUNTED_FINISH);
Lazy::force(&WAIT_USECS);
Lazy::force(&WAIT_COUNT);
}
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
for m in &[
&COUNT_ACCOUNTED_START_PER_TENANT,
&COUNT_ACCOUNTED_FINISH_PER_TENANT,
&WAIT_USECS_PER_TENANT,
&WAIT_COUNT_PER_TENANT,
] {
let _ = m.remove_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]);
}
}
impl Metric for TimelineGet {
#[inline(always)]
fn accounting_start(&self) {
self.count_accounted_start.inc();
}
#[inline(always)]
fn accounting_finish(&self) {
self.count_accounted_finish.inc();
}
impl Metric for &'static TimelineGet {
#[inline(always)]
fn observe_throttling(
&self,
@@ -3291,7 +3109,7 @@ pub(crate) mod tenant_throttling {
) {
let val = u64::try_from(wait_time.as_micros()).unwrap();
self.wait_time.inc_by(val);
self.count_throttled.inc();
self.count.inc();
}
}
}
@@ -3364,6 +3182,16 @@ static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
.unwrap()
});
pub(crate) static CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM: Lazy<Histogram> =
Lazy::new(|| {
register_histogram!(
"pageserver_consecutive_nonblocking_getpage_requests",
"Number of consecutive nonblocking getpage requests",
(0..=256).map(|x| x as f64).collect::<Vec<f64>>(),
)
.unwrap()
});
pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
let _guard = SERIALIZE.lock().unwrap();
@@ -3421,14 +3249,11 @@ pub fn preinitialize_metrics() {
}
// countervecs
[
&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
&SMGR_QUERY_STARTED_GLOBAL,
]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
// gauges
WALRECEIVER_ACTIVE_MANAGERS.get();
@@ -3450,8 +3275,7 @@ pub fn preinitialize_metrics() {
// Custom
Lazy::force(&RECONSTRUCT_TIME);
Lazy::force(&tenant_throttling::TIMELINE_GET);
Lazy::force(&BASEBACKUP_QUERY_TIME);
Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
tenant_throttling::preinitialize_global_metrics();
}

View File

@@ -5,14 +5,14 @@ use anyhow::Context;
use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use futures::FutureExt;
use once_cell::sync::OnceCell;
use pageserver_api::models::TenantState;
use once_cell::sync::{Lazy, OnceCell};
use pageserver_api::models::{self, TenantState};
use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
PagestreamNblocksResponse, PagestreamProtocolVersion,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
PagestreamProtocolVersion,
};
use pageserver_api::shard::TenantShardId;
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
@@ -43,7 +43,7 @@ use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics;
use crate::metrics::{self, CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM};
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
use crate::pgdatadir_mapping::Version;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -58,7 +58,7 @@ use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::SlruKind;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
@@ -577,124 +577,326 @@ impl PageServerHandler {
}
}
loop {
// read request bytes (it's exactly 1 PagestreamFeMessage per CopyData)
let msg = tokio::select! {
biased;
_ = self.cancel.cancelled() => {
return Err(QueryError::Shutdown)
}
msg = pgb.read_message() => { msg }
};
let copy_data_bytes = match msg? {
Some(FeMessage::CopyData(bytes)) => bytes,
Some(FeMessage::Terminate) => break,
Some(m) => {
return Err(QueryError::Other(anyhow::anyhow!(
"unexpected message: {m:?} during COPY"
)));
}
None => break, // client disconnected
};
let mut batched = None;
'outer: loop {
enum DebouncedFeMessage {
Exists(models::PagestreamExistsRequest),
Nblocks(models::PagestreamNblocksRequest),
GetPage {
span: Span,
shard: timeline::handle::Handle<TenantManagerTypes>,
effective_request_lsn: Lsn,
pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
},
DbSize(models::PagestreamDbSizeRequest),
GetSlruSegment(models::PagestreamGetSlruSegmentRequest),
RespondError(Span, PageStreamError),
}
let mut debounce: Option<std::time::Instant> = None;
// return or `?` on protocol error
// `break EXPR` to stop batching. The EXPR will be the first message in the next batch.
let next_batched: Option<DebouncedFeMessage> = loop {
static BOUNCE_TIMEOUT: Lazy<Duration> = Lazy::new(|| {
utils::env::var::<humantime::Duration, _>("NEON_PAGESERVER_DEBOUNCE")
.unwrap()
.into()
});
let sleep_fut = if let Some(started_at) = debounce {
futures::future::Either::Left(tokio::time::sleep_until(
(started_at + *BOUNCE_TIMEOUT).into(),
))
} else {
futures::future::Either::Right(futures::future::pending())
};
let msg = tokio::select! {
biased;
_ = self.cancel.cancelled() => {
return Err(QueryError::Shutdown)
}
msg = pgb.read_message() => {
msg
}
_ = sleep_fut => {
assert!(batched.is_some());
break None;
}
};
let copy_data_bytes = match msg? {
Some(FeMessage::CopyData(bytes)) => bytes,
Some(FeMessage::Terminate) => break 'outer,
Some(m) => {
return Err(QueryError::Other(anyhow::anyhow!(
"unexpected message: {m:?} during COPY"
)));
}
None => break 'outer, // client disconnected
};
trace!("query: {copy_data_bytes:?}");
fail::fail_point!("ps::handle-pagerequest-message");
trace!("query: {copy_data_bytes:?}");
fail::fail_point!("ps::handle-pagerequest-message");
// parse request
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
// parse request
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
let this_msg = match neon_fe_msg {
PagestreamFeMessage::Exists(msg) => DebouncedFeMessage::Exists(msg),
PagestreamFeMessage::Nblocks(msg) => DebouncedFeMessage::Nblocks(msg),
PagestreamFeMessage::DbSize(msg) => DebouncedFeMessage::DbSize(msg),
PagestreamFeMessage::GetSlruSegment(msg) => {
DebouncedFeMessage::GetSlruSegment(msg)
}
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn,
not_modified_since,
rel,
blkno,
}) => {
let span = tracing::info_span!("handle_get_page_at_lsn_request_batched", %tenant_id, %timeline_id, shard_id = tracing::field::Empty, req_lsn = %request_lsn, batch_size = tracing::field::Empty, batch_id = tracing::field::Empty);
let key = rel_block_to_key(rel, blkno);
let shard = match self
.timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Page(key))
.instrument(span.clone())
.await
{
Ok(tl) => tl,
Err(GetActiveTimelineError::Tenant(
GetActiveTenantError::NotFound(_),
)) => {
// We already know this tenant exists in general, because we resolved it at
// start of connection. Getting a NotFound here indicates that the shard containing
// the requested page is not present on this node: the client's knowledge of shard->pageserver
// mapping is out of date.
//
// Closing the connection by returning `::Reconnect` has the side effect of rate-limiting the above message, via the
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
// and talk to a different pageserver.
break Some(DebouncedFeMessage::RespondError(
span,
PageStreamError::Reconnect(
"getpage@lsn request routed to wrong shard".into(),
),
));
}
Err(e) => break Some(DebouncedFeMessage::RespondError(span, e.into())),
};
let effective_request_lsn = match Self::wait_or_get_last_lsn(
&shard,
request_lsn,
not_modified_since,
&shard.get_latest_gc_cutoff_lsn(),
&ctx,
)
// TODO: if we actually need to wait for the LSN here, it delays the entire batch, even though the other requests in it may not need to wait
.await
{
Ok(lsn) => lsn,
Err(e) => {
break Some(DebouncedFeMessage::RespondError(span, e));
}
};
DebouncedFeMessage::GetPage {
span,
shard,
effective_request_lsn,
pages: smallvec::smallvec![(rel, blkno)],
}
}
};
// check if we can debounce
match (&mut batched, this_msg) {
(None, this_msg) => {
batched = Some(this_msg);
}
(
Some(DebouncedFeMessage::GetPage {
span: _,
shard: accum_shard,
pages: accum_pages,
effective_request_lsn: accum_lsn,
}),
DebouncedFeMessage::GetPage {
span: _,
shard: this_shard,
pages: this_pages,
effective_request_lsn: this_lsn,
},
) if async {
assert_eq!(this_pages.len(), 1);
if accum_pages.len() >= Timeline::MAX_GET_VECTORED_KEYS as usize {
assert_eq!(accum_pages.len(), Timeline::MAX_GET_VECTORED_KEYS as usize);
return false;
}
if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
!= (this_shard.tenant_shard_id, this_shard.timeline_id)
{
// TODO: we _could_ batch & execute each shard separately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return false;
}
// the vectored get currently only supports a single LSN, so, bounce as soon
// as the effective request_lsn changes
if *accum_lsn != this_lsn {
return false;
}
return true;
}
.await =>
{
// ok to batch
accum_pages.extend(this_pages);
}
(Some(_), this_msg) => {
// by default, don't continue batching
break Some(this_msg);
}
}
// debounce bookkeeping: remember when batching started and stop once the timeout elapses
let started_at = debounce.get_or_insert_with(Instant::now);
if started_at.elapsed() > *BOUNCE_TIMEOUT {
break None;
}
};
// invoke handler function
let (handler_result, span) = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let (handler_results, span): (
smallvec::SmallVec<[Result<PagestreamBeMessage, PageStreamError>; 1]>,
_,
) = match batched.take().expect("loop above ensures this") {
DebouncedFeMessage::Exists(req) => {
fail::fail_point!("ps::handle-pagerequest-message::exists");
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
(
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
smallvec::smallvec![
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await
],
span,
)
}
PagestreamFeMessage::Nblocks(req) => {
DebouncedFeMessage::Nblocks(req) => {
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
(
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
smallvec::smallvec![
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
],
span,
)
}
PagestreamFeMessage::GetPage(req) => {
DebouncedFeMessage::GetPage {
span,
shard,
effective_request_lsn,
pages,
} => {
CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM.observe(pages.len() as f64);
span.record("batch_size", pages.len() as u64);
static BATCH_ID: Lazy<std::sync::atomic::AtomicUsize> =
Lazy::new(|| std::sync::atomic::AtomicUsize::new(0));
span.record(
"batch_id",
BATCH_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed) as u64,
);
fail::fail_point!("ps::handle-pagerequest-message::getpage");
// shard_id is filled in by the handler
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
(
self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
{
let npages = pages.len();
let res = self
.handle_get_page_at_lsn_request_batched(
&shard,
effective_request_lsn,
pages,
&ctx,
)
.instrument(span.clone())
.await;
assert_eq!(res.len(), npages);
res
},
span,
)
}
PagestreamFeMessage::DbSize(req) => {
DebouncedFeMessage::DbSize(req) => {
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
(
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
smallvec::smallvec![
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await
],
span,
)
}
PagestreamFeMessage::GetSlruSegment(req) => {
DebouncedFeMessage::GetSlruSegment(req) => {
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
(
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
smallvec::smallvec![
self.handle_get_slru_segment_request(
tenant_id,
timeline_id,
&req,
&ctx
)
.instrument(span.clone())
.await,
.await
],
span,
)
}
DebouncedFeMessage::RespondError(span, e) => {
// We've already decided to respond with an error, so we don't need to
// call the handler.
(smallvec::smallvec![Err(e)], span)
}
};
// Map handler result to protocol behavior.
// Some handler errors cause exit from pagestream protocol.
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
let response_msg = match handler_result {
Err(e) => match &e {
PageStreamError::Shutdown => {
// If we fail to fulfil a request during shutdown, which may be _because_ of
// shutdown, then do not send the error to the client. Instead just drop the
// connection.
span.in_scope(|| info!("dropping connection due to shutdown"));
return Err(QueryError::Shutdown);
}
PageStreamError::Reconnect(reason) => {
span.in_scope(|| info!("handler requested reconnect: {reason}"));
return Err(QueryError::Reconnect);
}
PageStreamError::Read(_)
| PageStreamError::LsnTimeout(_)
| PageStreamError::NotFound(_)
| PageStreamError::BadRequest(_) => {
// print all the details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error
// here includes cancellation which is not an error.
let full = utils::error::report_compact_sources(&e);
span.in_scope(|| {
error!("error reading relation or page version: {full:#}")
});
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
}
},
Ok(response_msg) => response_msg,
};
for handler_result in handler_results {
let response_msg = match handler_result {
Err(e) => match &e {
PageStreamError::Shutdown => {
// If we fail to fulfil a request during shutdown, which may be _because_ of
// shutdown, then do not send the error to the client. Instead just drop the
// connection.
span.in_scope(|| info!("dropping connection due to shutdown"));
return Err(QueryError::Shutdown);
}
PageStreamError::Reconnect(reason) => {
span.in_scope(|| info!("handler requested reconnect: {reason}"));
return Err(QueryError::Reconnect);
}
PageStreamError::Read(_)
| PageStreamError::LsnTimeout(_)
| PageStreamError::NotFound(_)
| PageStreamError::BadRequest(_) => {
// print all the details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error
// here includes cancellation which is not an error.
let full = utils::error::report_compact_sources(&e);
span.in_scope(|| {
error!("error reading relation or page version: {full:#}")
});
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
}
},
Ok(response_msg) => response_msg,
};
// marshal & transmit response message
pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
// marshal & transmit response message
pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
}
tokio::select! {
biased;
_ = self.cancel.cancelled() => {
@@ -706,6 +908,9 @@ impl PageServerHandler {
res?;
}
}
assert!(batched.is_none(), "we take() earlier");
batched = next_batched;
}
Ok(())
}
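
The batching above is controlled by environment variables read lazily at first use: `NEON_PAGESERVER_DEBOUNCE` (a humantime duration parsed via `utils::env::var`) here, and `NEON_PAGESERVER_VALUE_RECONSTRUCT_IO_CONCURRENCY` ("serial" or "parallel") further down on the read path. A sketch of how a test harness might set them before starting the pageserver; the concrete values are illustrative only:

std::env::set_var("NEON_PAGESERVER_DEBOUNCE", "10ms"); // humantime format, e.g. "10ms", "1s"
std::env::set_var("NEON_PAGESERVER_VALUE_RECONSTRUCT_IO_CONCURRENCY", "serial"); // or "parallel"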
@@ -949,60 +1154,30 @@ impl PageServerHandler {
}))
}
#[instrument(skip_all, fields(shard_id))]
async fn handle_get_page_at_lsn_request(
#[instrument(skip_all)]
async fn handle_get_page_at_lsn_request_batched(
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
req: &PagestreamGetPageRequest,
timeline: &Timeline,
effective_lsn: Lsn,
pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = match self
.timeline_handles
.get(
tenant_id,
timeline_id,
ShardSelector::Page(rel_block_to_key(req.rel, req.blkno)),
)
.await
{
Ok(tl) => tl,
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
// We already know this tenant exists in general, because we resolved it at
// start of connection. Getting a NotFound here indicates that the shard containing
// the requested page is not present on this node: the client's knowledge of shard->pageserver
// mapping is out of date.
//
// Closing the connection by returning `::Reconnect` has the side effect of rate-limiting the above message, via the
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
// and talk to a different pageserver.
return Err(PageStreamError::Reconnect(
"getpage@lsn request routed to wrong shard".into(),
));
}
Err(e) => return Err(e.into()),
};
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
&timeline,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
) -> smallvec::SmallVec<[Result<PagestreamBeMessage, PageStreamError>; 1]> {
debug_assert_current_span_has_tenant_and_timeline_id();
let _timer = timeline.query_metrics.start_timer_many(
metrics::SmgrQueryType::GetPageAtLsn,
pages.len(),
ctx,
)
.await?;
);
let page = timeline
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
.await?;
let pages = timeline
.get_rel_page_at_lsn_batched(pages, Version::Lsn(effective_lsn), ctx)
.await;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
smallvec::SmallVec::from_iter(pages.into_iter().map(|page| {
page.map(|page| {
PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page })
})
.map_err(PageStreamError::Read)
}))
}
@@ -1199,6 +1374,7 @@ impl PageServerHandler {
}
}
#[async_trait::async_trait]
impl<IO> postgres_backend::Handler<IO> for PageServerHandler
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -1498,3 +1674,10 @@ fn set_tracing_field_shard_id(timeline: &Timeline) {
);
debug_assert_current_span_has_tenant_and_timeline_id();
}
struct WaitedForLsn(Lsn);
impl From<WaitedForLsn> for Lsn {
fn from(WaitedForLsn(lsn): WaitedForLsn) -> Self {
lsn
}
}
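
The `WaitedForLsn` newtype added above only exposes the inner `Lsn` through `From`, which suggests its purpose: making it explicit in the type system that the LSN has already gone through `wait_or_get_last_lsn`. A hypothetical usage sketch (these call sites are not shown in this comparison):

// Hypothetical helper that can only be reached with an LSN we already waited for.
fn read_after_wait(lsn: WaitedForLsn) {
    let lsn: Lsn = lsn.into(); // unwrapping the newtype is explicit at the boundary
    let _ = lsn;
}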

View File

@@ -9,12 +9,17 @@
use super::tenant::{PageReconstructError, Timeline};
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id,
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
};
use crate::tenant::timeline::GetVectoredError;
use crate::walrecord::NeonWalRecord;
use crate::{aux_file, repository::*};
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::{
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
@@ -28,7 +33,7 @@ use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet};
use std::collections::{hash_map, BTreeMap, HashMap, HashSet};
use std::ops::ControlFlow;
use std::ops::Range;
use strum::IntoEnumIterator;
@@ -191,26 +196,184 @@ impl Timeline {
version: Version<'_>,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
if tag.relnode == 0 {
return Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
));
}
let pages = smallvec::smallvec![(tag, blknum)];
let res = self.get_rel_page_at_lsn_batched(pages, version, ctx).await;
assert_eq!(res.len(), 1);
res.into_iter().next().unwrap()
}
let nblocks = self.get_rel_size(tag, version, ctx).await?;
if blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
tag,
blknum,
version.get_lsn(),
nblocks
);
return Ok(ZERO_PAGE.clone());
/// Like [`Self::get_rel_page_at_lsn`], but returns a batch of pages.
pub(crate) async fn get_rel_page_at_lsn_batched(
&self,
pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
version: Version<'_>,
ctx: &RequestContext,
) -> smallvec::SmallVec<[Result<Bytes, PageReconstructError>; 1]> {
debug_assert_current_span_has_tenant_and_timeline_id();
let request_lsn = match version {
Version::Lsn(lsn) => lsn,
Version::Modified(_) => panic!("unsupported"),
};
enum KeyState {
NeedsVectoredGet,
Done(Result<Bytes, PageReconstructError>),
}
let mut key_states = BTreeMap::new();
let mut vectored_gets: smallvec::SmallVec<[_; 1]> =
smallvec::SmallVec::with_capacity(pages.len());
for (response_order, (tag, blknum)) in pages.into_iter().enumerate() {
let key = rel_block_to_key(tag, blknum);
use std::collections::btree_map::Entry;
let key_state_slot = match key_states.entry((key, response_order)) {
Entry::Occupied(_entry) => unreachable!(
"enumerate makes keys unique, even if batch contains same key twice"
),
Entry::Vacant(entry) => entry,
};
let key = rel_block_to_key(tag, blknum);
version.get(self, key, ctx).await
if tag.relnode == 0 {
key_state_slot.insert(KeyState::Done(Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
))));
continue;
}
let nblocks = match self.get_rel_size(tag, version, ctx).await {
Ok(nblocks) => nblocks,
Err(err) => {
key_state_slot.insert(KeyState::Done(Err(err)));
continue;
}
};
if blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
tag,
blknum,
version.get_lsn(),
nblocks
);
key_state_slot.insert(KeyState::Done(Ok(ZERO_PAGE.clone())));
continue;
}
vectored_gets.push(key);
key_state_slot.insert(KeyState::NeedsVectoredGet);
}
// turn vectored_gets into a keyspace
let keyspace = {
// add_key requires monotonicity
vectored_gets.sort_unstable();
let mut acc = KeySpaceAccum::new();
for key in vectored_gets
.into_iter()
// in fact it requires strong monotonicity
.dedup()
{
acc.add_key(key);
}
acc.to_keyspace()
};
match self.get_vectored(keyspace, request_lsn, ctx).await {
Ok(results) => {
for (key, res) in results {
if let Err(err) = &res {
warn!(%key, ?err, "a key inside get_vectored failed with a per-key error");
}
let mut interests = key_states.range_mut((key, 0)..(key.next(), 0)).peekable();
let first_interest = interests.next().unwrap();
let next_interest = interests.peek().is_some();
if !next_interest {
match first_interest.1 {
KeyState::NeedsVectoredGet => {
*first_interest.1 = KeyState::Done(res);
}
KeyState::Done(_) => unreachable!(),
}
continue;
} else {
for ((_, _), state) in [first_interest].into_iter().chain(interests) {
match state {
KeyState::NeedsVectoredGet => {
*state = KeyState::Done(match &res {
Ok(buf) => Ok(buf.clone()),
// this `match` is working around the fact that we cannot Clone the PageReconstructError
Err(err) => Err(match err {
PageReconstructError::Cancelled => {
PageReconstructError::Cancelled
}
x @ PageReconstructError::Other(_) |
x @ PageReconstructError::AncestorLsnTimeout(_) |
x @ PageReconstructError::WalRedo(_) |
x @ PageReconstructError::MissingKey(_) => {
PageReconstructError::Other(anyhow::anyhow!("there was more than one request for this key in the batch, error logged once: {x:?}"))
},
}),
});
}
KeyState::Done(_) => unreachable!(),
}
}
}
}
}
Err(err) => {
warn!(?err, "get_vectored failed with a global error, mapping that error to per-key failure");
// this cannot really happen because get_vectored only errors globally on an invalid LSN or an oversized batch
for ((_, _), state) in key_states.iter_mut() {
// this whole `match` is a lot like `From<GetVectoredError> for PageReconstructError`
// but without taking ownership of the GetVectoredError
match &err {
GetVectoredError::Cancelled => {
*state = KeyState::Done(Err(PageReconstructError::Cancelled));
}
// TODO: restructure get_vectored API to make this error per-key
GetVectoredError::MissingKey(err) => {
*state = KeyState::Done(Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more of the requested keys were missing: {err:?}"))));
}
// TODO: restructure get_vectored API to make this error per-key
GetVectoredError::GetReadyAncestorError(err) => {
*state = KeyState::Done(Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"))));
}
// TODO: restructure get_vectored API to make this error per-key
GetVectoredError::Other(err) => {
*state = KeyState::Done(Err(PageReconstructError::Other(
anyhow::anyhow!("whole vectored get request failed: {err:?}"),
)));
}
// TODO: we can prevent this error class by moving this check into the type system
GetVectoredError::InvalidLsn(e) => {
*state =
KeyState::Done(Err(anyhow::anyhow!("invalid LSN: {e:?}").into()));
}
// NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS
// TODO: we can prevent this error class by moving this check into the type system
GetVectoredError::Oversized(err) => {
*state = KeyState::Done(Err(anyhow::anyhow!(
"batching oversized: {err:?}"
)
.into()));
}
}
}
}
};
// get the results into the order in which they were requested
let mut return_order: smallvec::SmallVec<[_; Timeline::MAX_GET_VECTORED_KEYS as usize]> =
smallvec::SmallVec::with_capacity(key_states.len());
return_order.extend(key_states.keys().map(|(key, idx)| (*key, *idx)));
return_order.sort_unstable_by_key(|(_, idx)| *idx);
let mut res = smallvec::SmallVec::with_capacity(key_states.len());
res.extend(return_order.into_iter().map(|key_states_key| {
match key_states.remove(&key_states_key).unwrap() {
KeyState::Done(res) => res,
KeyState::NeedsVectoredGet => unreachable!(),
}
}));
res
}
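
The bookkeeping above hinges on the `(Key, response_order)` composite map key: duplicate keys within one batch stay distinct, all interests for a key can be range-scanned after the vectored get, and the caller's request order is restored at the end. A standalone sketch of that final step, using illustrative generic types:

use std::collections::BTreeMap;

// Restore caller order from a map keyed by (key, position-in-request).
fn restore_request_order<K: Ord + Copy, V>(mut states: BTreeMap<(K, usize), V>) -> Vec<V> {
    let mut order: Vec<(K, usize)> = states.keys().copied().collect();
    order.sort_unstable_by_key(|(_, idx)| *idx);
    order
        .into_iter()
        .map(|k| states.remove(&k).expect("key was just read from the map"))
        .collect()
}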
// Get size of a database in blocks
@@ -633,7 +796,7 @@ impl Timeline {
pub(crate) async fn get_twophase_file(
&self,
xid: u64,
xid: TransactionId,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
@@ -646,19 +809,11 @@ impl Timeline {
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashSet<u64>, PageReconstructError> {
) -> Result<HashSet<TransactionId>, PageReconstructError> {
// fetch directory entry
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
if self.pg_version >= 17 {
Ok(TwoPhaseDirectoryV17::des(&buf)?.xids)
} else {
Ok(TwoPhaseDirectory::des(&buf)?
.xids
.iter()
.map(|x| u64::from(*x))
.collect())
}
Ok(TwoPhaseDirectory::des(&buf)?.xids)
}
pub(crate) async fn get_control_file(
@@ -910,13 +1065,9 @@ impl Timeline {
// Then pg_twophase
result.add_key(TWOPHASEDIR_KEY);
let mut xids: Vec<u64> = self
.list_twophase_files(lsn, ctx)
.await?
.iter()
.cloned()
.collect();
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
let twophase_dir = TwoPhaseDirectory::des(&buf)?;
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
xids.sort_unstable();
for xid in xids {
result.add_key(twophase_file_key(xid));
@@ -1139,15 +1290,9 @@ impl<'a> DatadirModification<'a> {
// Create AuxFilesDirectory
self.init_aux_dir()?;
let buf = if self.tline.pg_version >= 17 {
TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
xids: HashSet::new(),
})
} else {
TwoPhaseDirectory::ser(&TwoPhaseDirectory {
xids: HashSet::new(),
})
}?;
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
xids: HashSet::new(),
})?;
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, 0));
self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
@@ -1223,13 +1368,6 @@ impl<'a> DatadirModification<'a> {
img: Bytes,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
let key = rel_block_to_key(rel, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver at {}",
key
);
}
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
Ok(())
}
@@ -1241,34 +1379,14 @@ impl<'a> DatadirModification<'a> {
blknum: BlockNumber,
img: Bytes,
) -> anyhow::Result<()> {
let key = slru_block_to_key(kind, segno, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver at {}",
key
);
}
self.put(key, Value::Image(img));
self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
Ok(())
}
pub(crate) fn put_rel_page_image_zero(
&mut self,
rel: RelTag,
blknum: BlockNumber,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
let key = rel_block_to_key(rel, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver: {} @ {}",
key,
self.lsn
);
}
self.pending_zero_data_pages.insert(key.to_compact());
pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) {
self.pending_zero_data_pages
.insert(rel_block_to_key(rel, blknum).to_compact());
self.pending_bytes += ZERO_PAGE.len();
Ok(())
}
pub(crate) fn put_slru_page_image_zero(
@@ -1276,18 +1394,10 @@ impl<'a> DatadirModification<'a> {
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
) -> anyhow::Result<()> {
let key = slru_block_to_key(kind, segno, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver: {} @ {}",
key,
self.lsn
);
}
self.pending_zero_data_pages.insert(key.to_compact());
) {
self.pending_zero_data_pages
.insert(slru_block_to_key(kind, segno, blknum).to_compact());
self.pending_bytes += ZERO_PAGE.len();
Ok(())
}
/// Call this at the end of each WAL record.
@@ -1339,31 +1449,22 @@ impl<'a> DatadirModification<'a> {
pub async fn put_twophase_file(
&mut self,
xid: u64,
xid: TransactionId,
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// Add it to the directory entry
let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let newdirbuf = if self.tline.pg_version >= 17 {
let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
}
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
} else {
let xid = xid as u32;
let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
}
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
Bytes::from(TwoPhaseDirectory::ser(&dir)?)
};
self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let mut dir = TwoPhaseDirectory::des(&buf)?;
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
}
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
self.put(
TWOPHASEDIR_KEY,
Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
);
self.put(twophase_file_key(xid), Value::Image(img));
Ok(())
@@ -1666,32 +1767,22 @@ impl<'a> DatadirModification<'a> {
/// This method is used for marking truncated SLRU files
pub async fn drop_twophase_file(
&mut self,
xid: u64,
xid: TransactionId,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// Remove it from the directory entry
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let newdirbuf = if self.tline.pg_version >= 17 {
let mut dir = TwoPhaseDirectoryV17::des(&buf)?;
let mut dir = TwoPhaseDirectory::des(&buf)?;
if !dir.xids.remove(&xid) {
warn!("twophase file for xid {} does not exist", xid);
}
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
} else {
let xid: u32 = u32::try_from(xid)?;
let mut dir = TwoPhaseDirectory::des(&buf)?;
if !dir.xids.remove(&xid) {
warn!("twophase file for xid {} does not exist", xid);
}
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
Bytes::from(TwoPhaseDirectory::ser(&dir)?)
};
self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
if !dir.xids.remove(&xid) {
warn!("twophase file for xid {} does not exist", xid);
}
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
self.put(
TWOPHASEDIR_KEY,
Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
);
// Delete it
self.delete(twophase_key_range(xid));
@@ -2161,21 +2252,11 @@ struct DbDirectory {
dbdirs: HashMap<(Oid, Oid), bool>,
}
// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of
// pg_twophase files were expanded from 32-bit XIDs to 64-bit XIDs. Previously, the files
// were named like "pg_twophase/000002E5"; now they're like
// "pg_twophase/0000000A000002E4".
#[derive(Debug, Serialize, Deserialize)]
struct TwoPhaseDirectory {
xids: HashSet<TransactionId>,
}
#[derive(Debug, Serialize, Deserialize)]
struct TwoPhaseDirectoryV17 {
xids: HashSet<u64>,
}
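
The comment above gives an example filename for each format. A small sketch of the width difference it describes, assuming the filename is just the XID rendered as zero-padded upper-case hex, as the examples suggest:

fn twophase_filename_pre_v17(xid: u32) -> String {
    format!("{xid:08X}") // e.g. "000002E5"
}

fn twophase_filename_v17(xid: u64) -> String {
    format!("{xid:016X}") // e.g. "0000000A000002E4"
}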
#[derive(Debug, Serialize, Deserialize, Default)]
struct RelDirectory {
// Set of relations that exist. (relfilenode, forknum)

View File

@@ -73,6 +73,21 @@ impl ValueBytes {
Ok(raw[8] == 1)
}
pub(crate) fn is_image(raw: &[u8]) -> Result<bool, InvalidInput> {
if raw.len() < 12 {
return Err(InvalidInput::TooShortValue);
}
let value_discriminator = &raw[0..4];
if value_discriminator == [0, 0, 0, 0] {
// Value::Image always initializes
return Ok(true);
}
Ok(false)
}
}
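
A usage sketch for the new `is_image` check, assuming the big-endian `bin_ser` serialization used elsewhere on the read path; under that encoding `Value::Image` is the first enum variant, hence the `[0, 0, 0, 0]` discriminator in bytes 0..4:

let raw = Value::ser(&Value::Image(Bytes::from_static(b"fake page image"))).unwrap();
assert!(ValueBytes::is_image(&raw).unwrap());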
#[cfg(test)]

View File

@@ -140,7 +140,6 @@ pub mod metadata;
pub mod remote_timeline_client;
pub mod storage_layer;
pub mod checks;
pub mod config;
pub mod mgr;
pub mod secondary;
@@ -302,7 +301,7 @@ pub struct Tenant {
/// Throttle applied at the top of [`Timeline::get`].
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
pub(crate) timeline_get_throttle:
Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
/// An ongoing timeline detach concurrency limiter.
///
@@ -1574,9 +1573,6 @@ impl Tenant {
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
use checks::check_valid_layermap;
use itertools::Itertools;
let tline = self
.create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
.await?;
@@ -1591,18 +1587,6 @@ impl Tenant {
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
.await?;
}
let layer_names = tline
.layers
.read()
.await
.layer_map()
.unwrap()
.iter_historic_layers()
.map(|layer| layer.layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
bail!("invalid layermap: {err}");
}
Ok(tline)
}
@@ -2831,7 +2815,7 @@ impl Tenant {
gate: Gate::default(),
timeline_get_throttle: Arc::new(throttle::Throttle::new(
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
&crate::metrics::tenant_throttling::TIMELINE_GET,
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
@@ -3213,9 +3197,6 @@ impl Tenant {
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
use checks::check_valid_layermap;
use itertools::Itertools;
let tline = self
.branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
.await?;
@@ -3236,18 +3217,6 @@ impl Tenant {
.force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
.await?;
}
let layer_names = tline
.layers
.read()
.await
.layer_map()
.unwrap()
.iter_historic_layers()
.map(|layer| layer.layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
bail!("invalid layermap: {err}");
}
Ok(tline)
}
@@ -3522,7 +3491,7 @@ impl Tenant {
.context("extract initdb tar")?;
} else {
// Init a temporary repo to get bootstrap data; this creates a directory at `pgdata_path`
run_initdb_with_cache(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
// Upload the created data dir to S3
if self.tenant_shard_id().is_shard_zero() {
@@ -3868,118 +3837,6 @@ impl Tenant {
}
}
fn cached_initdb_dirname(initial_superuser_name: &str, pg_version: u32) -> String
{
use std::hash::Hash;
use std::hash::Hasher;
use std::collections::hash_map::DefaultHasher;
let mut hasher = DefaultHasher::new();
initial_superuser_name.hash(&mut hasher);
let hash = hasher.finish();
format!("cached_initial_pgdata_{pg_version}_{:016}", hash)
}
fn copy_dir_all(src: impl AsRef<std::path::Path>, dst: impl AsRef<std::path::Path>) -> std::io::Result<()> {
for entry in fs::read_dir(src.as_ref())? {
let entry = entry?;
let subsrc = entry.path();
let subdst = dst.as_ref().join(&entry.file_name());
if entry.file_type()?.is_dir() {
std::fs::create_dir(&subdst)?;
copy_dir_all(&subsrc, &subdst)?;
} else {
std::fs::copy(&subsrc, &subdst)?;
}
}
Ok(())
}
fn restore_cached_initdb_dir(
cached_path: &Utf8Path,
target_path: &Utf8Path,
) -> anyhow::Result<bool> {
if !cached_path.exists() {
info!("cached initdb dir \"{cached_path}\" does not exist yet");
return Ok(false);
}
std::fs::create_dir(target_path)?;
copy_dir_all(cached_path, target_path)?;
info!("restored initdb result from cache dir \"{cached_path}\"");
Ok(true)
}
fn save_cached_initdb_dir(
src_path: &Utf8Path,
cache_path: &Utf8Path,
) -> anyhow::Result<()> {
match std::fs::create_dir(cache_path) {
Ok(()) => {
info!("saving initdb result to cache dir \"{cache_path}\"");
},
Err(err) if err.kind() == std::io::ErrorKind::AlreadyExists => {
info!("cache initdb dir \"{cache_path}\" already exists, not saving");
return Ok(())
},
Err(err) => { return Err(anyhow::Error::from(err))},
};
let cache_dir_guard = scopeguard::guard(cache_path, |cp| {
if let Err(err) = std::fs::remove_dir_all(&cp) {
error!("could not remove cached initdb directory {cp}: {err}");
}
});
let cache_parent_path = cache_path.parent().ok_or(anyhow::Error::msg("no cache parent path"))?;
let tmp_dirpath = camino_tempfile::tempdir_in(cache_parent_path)?;
copy_dir_all(src_path, &tmp_dirpath)?;
std::fs::rename(tmp_dirpath, &*cache_dir_guard)?;
// disarm the guard
scopeguard::ScopeGuard::into_inner(cache_dir_guard);
Ok(())
}
async fn run_initdb_with_cache(
conf: &'static PageServerConf,
initdb_target_dir: &Utf8Path,
pg_version: u32,
cancel: &CancellationToken,
) -> Result<(), InitdbError> {
let cache_dir = conf.initdb_cache_dir.as_ref().map(|initdb_cache_dir| {
initdb_cache_dir.join(cached_initdb_dirname(&conf.superuser, pg_version))
});
if let Some(cache_dir) = &cache_dir {
match restore_cached_initdb_dir(&cache_dir, initdb_target_dir) {
Ok(true) => return Ok(()),
Ok(false) => {},
Err(err) => {
warn!("Error restoring from cached initdb directory \"{cache_dir}\": {err}");
if initdb_target_dir.exists() {
if let Err(err) = std::fs::remove_dir_all(&initdb_target_dir) {
error!("could not remove temporary initdb target directory {initdb_target_dir}: {err}");
}
}
},
}
}
run_initdb(conf, initdb_target_dir, pg_version, cancel).await?;
if let Some(cache_dir) = &cache_dir {
if let Err(err) = save_cached_initdb_dir(initdb_target_dir, &cache_dir) {
warn!("error saving initdb result to cache directory \"{cache_dir}\": {err}");
}
}
Ok(())
}
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
/// to get bootstrap data for timeline initialization.
async fn run_initdb(
@@ -4307,18 +4164,9 @@ pub(crate) mod harness {
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
if records_neon {
// For Neon wal records, we can decode without spawning postgres, so do so.
let mut page = match (base_img, records.first()) {
(Some((_lsn, img)), _) => {
let mut page = BytesMut::new();
page.extend_from_slice(&img);
page
}
(_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(),
_ => {
panic!("Neon WAL redo requires base image or will init record");
}
};
let base_img = base_img.expect("Neon WAL redo requires base image").1;
let mut page = BytesMut::new();
page.extend_from_slice(&base_img);
for (record_lsn, record) in records {
apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
}
@@ -7243,13 +7091,13 @@ mod tests {
vec![
// Image layer at GC horizon
PersistentLayerKey {
key_range: Key::MIN..Key::MAX,
key_range: Key::MIN..Key::NON_L0_MAX,
lsn_range: Lsn(0x30)..Lsn(0x31),
is_delta: false
},
// The delta layer below the horizon
// The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
PersistentLayerKey {
key_range: get_key(3)..get_key(4),
key_range: Key::MIN..Key::NON_L0_MAX,
lsn_range: Lsn(0x30)..Lsn(0x48),
is_delta: true
},
@@ -8622,135 +8470,4 @@ mod tests {
Ok(())
}
// Regression test for https://github.com/neondatabase/neon/issues/9012
// Create an image arrangement where we have to read at different LSN ranges
// from a delta layer. This is achieved by overlapping an image layer on top of
// a delta layer. Like so:
//
// A B
// +----------------+ -> delta_layer
// | | ^ lsn
// | =========|-> nested_image_layer |
// | C | |
// +----------------+ |
// ======== -> baseline_image_layer +-------> key
//
//
// When querying the key range [A, B) we need to read at different LSN ranges
// for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
#[tokio::test]
async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
let (tenant, ctx) = harness.load().await;
let will_init_keys = [2, 6];
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let mut expected_key_values = HashMap::new();
let baseline_image_layer_lsn = Lsn(0x10);
let mut baseline_img_layer = Vec::new();
for i in 0..5 {
let key = get_key(i);
let value = format!("value {i}@{baseline_image_layer_lsn}");
let removed = expected_key_values.insert(key, value.clone());
assert!(removed.is_none());
baseline_img_layer.push((key, Bytes::from(value)));
}
let nested_image_layer_lsn = Lsn(0x50);
let mut nested_img_layer = Vec::new();
for i in 5..10 {
let key = get_key(i);
let value = format!("value {i}@{nested_image_layer_lsn}");
let removed = expected_key_values.insert(key, value.clone());
assert!(removed.is_none());
nested_img_layer.push((key, Bytes::from(value)));
}
let mut delta_layer_spec = Vec::default();
let delta_layer_start_lsn = Lsn(0x20);
let mut delta_layer_end_lsn = delta_layer_start_lsn;
for i in 0..10 {
let key = get_key(i);
let key_in_nested = nested_img_layer
.iter()
.any(|(key_with_img, _)| *key_with_img == key);
let lsn = {
if key_in_nested {
Lsn(nested_image_layer_lsn.0 + 0x10)
} else {
delta_layer_start_lsn
}
};
let will_init = will_init_keys.contains(&i);
if will_init {
delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
expected_key_values.insert(key, "".to_string());
} else {
let delta = format!("@{lsn}");
delta_layer_spec.push((
key,
lsn,
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
));
expected_key_values
.get_mut(&key)
.expect("An image exists for each key")
.push_str(delta.as_str());
}
delta_layer_end_lsn = std::cmp::max(delta_layer_start_lsn, lsn);
}
delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1);
assert!(
nested_image_layer_lsn > delta_layer_start_lsn
&& nested_image_layer_lsn < delta_layer_end_lsn
);
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
baseline_image_layer_lsn,
DEFAULT_PG_VERSION,
&ctx,
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
delta_layer_start_lsn..delta_layer_end_lsn,
delta_layer_spec,
)], // delta layers
vec![
(baseline_image_layer_lsn, baseline_img_layer),
(nested_image_layer_lsn, nested_img_layer),
], // image layers
delta_layer_end_lsn,
)
.await?;
let keyspace = KeySpace::single(get_key(0)..get_key(10));
let results = tline
.get_vectored(keyspace, delta_layer_end_lsn, &ctx)
.await
.expect("No vectored errors");
for (key, res) in results {
let value = res.expect("No key errors");
let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
assert_eq!(value, Bytes::from(expected_value));
}
Ok(())
}
}

View File

@@ -1,55 +0,0 @@
use std::collections::BTreeSet;
use itertools::Itertools;
use super::storage_layer::LayerName;
/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
///
/// ```plain
/// | | | |
/// | 1 | | 2 | | 3 |
/// | | | | | |
/// ```
///
/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
/// the same LSN range.
///
/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
///
/// ```plain
/// | | | 2 | | |
/// | 1 | |-------| | 3 |
/// | | | 4 | | |
/// ```
///
/// If layers 2 and 4 contain the same single key, this is also a valid layer map.
pub fn check_valid_layermap(metadata: &[LayerName]) -> Option<String> {
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
let mut all_delta_layers = Vec::new();
for name in metadata {
if let LayerName::Delta(layer) = name {
if layer.key_range.start.next() != layer.key_range.end {
all_delta_layers.push(layer.clone());
}
}
}
for layer in &all_delta_layers {
let lsn_range = &layer.lsn_range;
lsn_split_point.insert(lsn_range.start);
lsn_split_point.insert(lsn_range.end);
}
for layer in &all_delta_layers {
let lsn_range = layer.lsn_range.clone();
let intersects = lsn_split_point.range(lsn_range).collect_vec();
if intersects.len() > 1 {
let err = format!(
"layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
layer,
intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
);
return Some(err);
}
}
None
}

View File

@@ -452,8 +452,7 @@ impl TryFrom<toml_edit::Item> for TenantConfOpt {
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
}
toml_edit::Item::Table(table) => {
let deserializer =
toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
let deserializer = toml_edit::de::Deserializer::new(table.into());
return serde_path_to_error::deserialize(deserializer)
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
}

View File

@@ -1,29 +1,11 @@
use std::{collections::HashMap, time::Duration};
use std::collections::HashMap;
use super::remote_timeline_client::index::GcBlockingReason;
use tokio::time::Instant;
use utils::id::TimelineId;
type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
use super::remote_timeline_client::index::GcBlockingReason;
#[derive(Default)]
struct Storage {
timelines_blocked: TimelinesBlocked,
/// The deadline before which we are blocked from GC so that
/// leases have a chance to be renewed.
lsn_lease_deadline: Option<Instant>,
}
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
impl Storage {
fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
self.lsn_lease_deadline
.map(|d| Instant::now() < d)
.unwrap_or(false)
}
}
/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc
/// blocking.
#[derive(Default)]
pub(crate) struct GcBlock {
/// The timelines which have current reasons to block gc.
@@ -31,12 +13,6 @@ pub(crate) struct GcBlock {
/// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
/// to keep this field up to date with RemoteTimelineClient `upload_queue.dirty`.
reasons: std::sync::Mutex<Storage>,
/// GC background task or manually run `Tenant::gc_iteration` holds a lock on this.
///
/// Do not add any more features that take and forbid taking this lock. It should be
/// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
/// synchronizes with gc attempts by locking and unlocking this mutex.
blocking: tokio::sync::Mutex<()>,
}
@@ -66,20 +42,6 @@ impl GcBlock {
}
}
/// Sets a deadline before which we cannot proceed to GC due to lsn lease.
///
/// We do this as the lease mapping is not persisted to disk. By delaying GC by the lease
/// length, we guarantee that all the leases we granted before will have a chance to renew
/// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
let deadline = Instant::now() + lsn_lease_length;
let mut g = self.reasons.lock().unwrap();
g.lsn_lease_deadline = Some(deadline);
}
/// Describe the current gc blocking reasons.
///
/// TODO: make this json serializable.
pub(crate) fn summary(&self) -> Option<BlockingReasons> {
let g = self.reasons.lock().unwrap();
@@ -102,7 +64,7 @@ impl GcBlock {
) -> anyhow::Result<bool> {
let (added, uploaded) = {
let mut g = self.reasons.lock().unwrap();
let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
let set = g.entry(timeline.timeline_id).or_default();
let added = set.insert(reason);
// LOCK ORDER: intentionally hold the lock, see self.reasons.
@@ -133,7 +95,7 @@ impl GcBlock {
let (remaining_blocks, uploaded) = {
let mut g = self.reasons.lock().unwrap();
match g.timelines_blocked.entry(timeline.timeline_id) {
match g.entry(timeline.timeline_id) {
Entry::Occupied(mut oe) => {
let set = oe.get_mut();
set.remove(reason);
@@ -147,7 +109,7 @@ impl GcBlock {
}
}
let remaining_blocks = g.timelines_blocked.len();
let remaining_blocks = g.len();
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
let uploaded = timeline
@@ -172,11 +134,11 @@ impl GcBlock {
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
let unblocked = {
let mut g = self.reasons.lock().unwrap();
if g.timelines_blocked.is_empty() {
if g.is_empty() {
return;
}
g.timelines_blocked.remove(&timeline.timeline_id);
g.remove(&timeline.timeline_id);
BlockingReasons::clean_and_summarize(g).is_none()
};
@@ -187,11 +149,10 @@ impl GcBlock {
}
/// Initialize with the non-deleted timelines of this tenant.
pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
pub(crate) fn set_scanned(&self, scanned: Storage) {
let mut g = self.reasons.lock().unwrap();
assert!(g.timelines_blocked.is_empty());
g.timelines_blocked
.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
assert!(g.is_empty());
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
tracing::info!(summary=?reasons, "initialized with gc blocked");
@@ -205,7 +166,6 @@ pub(super) struct Guard<'a> {
#[derive(Debug)]
pub(crate) struct BlockingReasons {
tenant_blocked_by_lsn_lease_deadline: bool,
timelines: usize,
reasons: enumset::EnumSet<GcBlockingReason>,
}
@@ -214,8 +174,8 @@ impl std::fmt::Display for BlockingReasons {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
"{} timelines block for {:?}",
self.timelines, self.reasons
)
}
}
@@ -223,15 +183,13 @@ impl std::fmt::Display for BlockingReasons {
impl BlockingReasons {
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
let mut reasons = enumset::EnumSet::empty();
g.timelines_blocked.retain(|_key, value| {
g.retain(|_key, value| {
reasons = reasons.union(*value);
!value.is_empty()
});
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
if !g.is_empty() {
Some(BlockingReasons {
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
timelines: g.timelines_blocked.len(),
timelines: g.len(),
reasons,
})
} else {
@@ -240,17 +198,14 @@ impl BlockingReasons {
}
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
if g.is_empty() {
None
} else {
let reasons = g
.timelines_blocked
.values()
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
Some(BlockingReasons {
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
timelines: g.timelines_blocked.len(),
timelines: g.len(),
reasons,
})
}

View File

@@ -949,12 +949,6 @@ impl TenantManager {
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
match attach_conf.generation.cmp(&tenant.generation) {
Ordering::Equal => {
if attach_conf.attach_mode == AttachmentMode::Single {
tenant
.gc_block
.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
}
// A transition from Attached to Attached in the same generation, we may
// take our fast path and just provide the updated configuration
// to the tenant.

View File

@@ -8,15 +8,18 @@ mod layer_desc;
mod layer_name;
pub mod merge_iterator;
use tokio::sync::{self};
use tracing::{debug, Instrument};
use utils::bin_ser::BeSer;
pub mod split_writer;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
use crate::repository::{Value, ValueBytes};
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
use pageserver_api::key::Key;
use pageserver_api::key::{Key, DBDIR_KEY};
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use std::cmp::{Ordering, Reverse};
use std::cmp::Ordering;
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
use std::ops::Range;
@@ -79,30 +82,57 @@ pub(crate) enum ValueReconstructSituation {
}
/// Reconstruct data accumulated for a single key during a vectored get
#[derive(Debug, Default, Clone)]
#[derive(Debug, Default)]
pub(crate) struct VectoredValueReconstructState {
pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
pub(crate) img: Option<(Lsn, Bytes)>,
pub(crate) records: Vec<(
Lsn,
tokio::sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
)>,
pub(crate) will_init_lsn: Option<Lsn>,
situation: ValueReconstructSituation,
pub(crate) situation: ValueReconstructSituation,
}
impl VectoredValueReconstructState {
fn get_cached_lsn(&self) -> Option<Lsn> {
self.img.as_ref().map(|img| img.0)
self.will_init_lsn
}
}
impl From<VectoredValueReconstructState> for ValueReconstructState {
fn from(mut state: VectoredValueReconstructState) -> Self {
// walredo expects the records to be descending in terms of Lsn
state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
pub(crate) async fn convert(
_key: Key,
from: VectoredValueReconstructState,
) -> Result<ValueReconstructState, PageReconstructError> {
let mut to = ValueReconstructState::default();
ValueReconstructState {
records: state.records,
img: state.img,
for (lsn, fut) in from.records {
match fut.await {
Ok(res) => match res {
Ok(bytes) => {
let value = Value::des(&bytes)
.map_err(|err| PageReconstructError::Other(err.into()))?;
match value {
Value::WalRecord(rec) => {
to.records.push((lsn, rec));
}
Value::Image(img) => {
assert!(to.img.is_none());
to.img = Some((lsn, img));
}
}
}
Err(err) => {
return Err(PageReconstructError::Other(err.into()));
}
},
Err(err) => {
return Err(PageReconstructError::Other(err.into()));
}
}
}
Ok(to)
}
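
`convert` only sees the receiving half of each record: it awaits the oneshot, deserializes the `Value`, and folds WAL records and the base image into a `ValueReconstructState`. The sending half is fulfilled by IOs spawned on the read path; a minimal sketch of that handshake, with `read_value_bytes_from_layer` as an illustrative stand-in for the real layer read:

fn queue_one_record(
    reconstruct_state: &mut ValuesReconstructState,
    key: Key,
    lsn: Lsn,
    will_init: bool,
) {
    let (tx, rx) = tokio::sync::oneshot::channel::<Result<Bytes, std::io::Error>>();
    // Register the pending record with the reconstruct state (see `update_key` below) ...
    reconstruct_state.update_key(&key, lsn, will_init, rx);
    // ... and let a spawned IO complete the sender once the layer read finishes.
    reconstruct_state.spawn_io(async move {
        let _ = tx.send(read_value_bytes_from_layer().await);
    });
}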
/// Bag of data accumulated during a vectored get.
@@ -119,6 +149,47 @@ pub(crate) struct ValuesReconstructState {
// Statistics that are still accessible to callers of `get_vectored_impl`.
layers_visited: u32,
delta_layers_visited: u32,
io_concurrency: IoConcurrency,
}
enum IoConcurrency {
Serial {
prev_io: Option<(usize, tokio::task::JoinHandle<()>)>,
},
Parallel,
}
impl IoConcurrency {
pub(crate) fn spawn_io<F>(&mut self, fut: F)
where
F: std::future::Future<Output = ()> + Send + 'static,
{
static IO_COUNTER: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0);
let io_id = IO_COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let span = tracing::debug_span!("spawned_io", io_id,);
match self {
IoConcurrency::Serial { prev_io } => {
let prev = prev_io.take();
*prev_io = Some((
io_id,
tokio::spawn(
async move {
if let Some((prev_id, prev_task)) = prev {
debug!(prev_io = prev_id, "Waiting for previous IO to complete");
prev_task.await.unwrap();
}
fut.await;
}
.instrument(span),
),
));
}
IoConcurrency::Parallel => {
tokio::spawn(fut);
}
}
}
}
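
In the `Serial` mode above, each spawned IO first awaits the `JoinHandle` of its predecessor, so value-read IOs execute strictly one after another; `Parallel` just spawns them and lets them run concurrently. A small usage sketch:

let mut io = IoConcurrency::Serial { prev_io: None };
io.spawn_io(async {
    // first value-read IO
});
io.spawn_io(async {
    // starts only after the first spawned IO has completed
});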
impl ValuesReconstructState {
@@ -129,9 +200,30 @@ impl ValuesReconstructState {
keys_with_image_coverage: None,
layers_visited: 0,
delta_layers_visited: 0,
io_concurrency: {
static IO_CONCURRENCY: once_cell::sync::Lazy<String> =
once_cell::sync::Lazy::new(|| {
std::env::var("NEON_PAGESERVER_VALUE_RECONSTRUCT_IO_CONCURRENCY").unwrap()
});
match IO_CONCURRENCY.as_str() {
"parallel" => IoConcurrency::Parallel,
"serial" => IoConcurrency::Serial { prev_io: None },
x => panic!(
"Invalid value for NEON_PAGESERVER_VALUE_RECONSTRUCT_IO_CONCURRENCY: {}",
x
),
}
},
}
}
pub(crate) fn spawn_io<F>(&mut self, fut: F)
where
F: std::future::Future<Output = ()> + Send + 'static,
{
self.io_concurrency.spawn_io(fut);
}
/// Associate a key with the error which it encountered and mark it as done
pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
let previous = self.keys.insert(key, Err(err));
@@ -200,7 +292,8 @@ impl ValuesReconstructState {
&mut self,
key: &Key,
lsn: Lsn,
value: Value,
completes: bool,
value: sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
) -> ValueReconstructSituation {
let state = self
.keys
@@ -208,31 +301,16 @@ impl ValuesReconstructState {
.or_insert(Ok(VectoredValueReconstructState::default()));
if let Ok(state) = state {
let key_done = match state.situation {
match state.situation {
ValueReconstructSituation::Complete => unreachable!(),
ValueReconstructSituation::Continue => match value {
Value::Image(img) => {
state.img = Some((lsn, img));
true
}
Value::WalRecord(rec) => {
debug_assert!(
Some(lsn) > state.get_cached_lsn(),
"Attempt to collect a record below cached LSN for walredo: {} < {}",
lsn,
state
.get_cached_lsn()
.expect("Assertion can only fire if a cached lsn is present")
);
ValueReconstructSituation::Continue => {
state.records.push((lsn, value));
}
}
let will_init = rec.will_init();
state.records.push((lsn, rec));
will_init
}
},
};
if key_done && state.situation == ValueReconstructSituation::Continue {
if completes && state.situation == ValueReconstructSituation::Continue {
assert_eq!(state.will_init_lsn, None);
state.will_init_lsn = Some(lsn);
state.situation = ValueReconstructSituation::Complete;
self.keys_done.add_key(*key);
}
@@ -276,20 +354,10 @@ pub(crate) enum LayerId {
InMemoryLayerId(InMemoryLayerFileId),
}
/// Uniquely identify a layer visit by the layer
/// and LSN floor (or start LSN) of the reads.
/// The layer itself is not enough since we may
/// have different LSN lower bounds for delta layer reads.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
struct LayerToVisitId {
layer_id: LayerId,
lsn_floor: Lsn,
}
/// Layer wrapper for the read path. Note that it is valid
/// to use these layers even after external operations have
/// been performed on them (compaction, freeze, etc.).
#[derive(Debug)]
#[derive(Debug, Clone)]
pub(crate) enum ReadableLayer {
PersistentLayer(Layer),
InMemoryLayer(Arc<InMemoryLayer>),
@@ -297,11 +365,13 @@ pub(crate) enum ReadableLayer {
/// A partial description of a read to be done.
#[derive(Debug, Clone)]
struct LayerVisit {
struct ReadDesc {
/// An id used to resolve the readable layer within the fringe
layer_to_visit_id: LayerToVisitId,
layer_id: LayerId,
/// Lsn range for the read, used for selecting the next read
lsn_range: Range<Lsn>,
/// This read's id within [`LayerKeyspace::reads`].
read_id: LayerKeyspaceReadId,
}
/// Data structure which maintains a fringe of layers for the
@@ -313,46 +383,52 @@ struct LayerVisit {
/// a two layer indexing scheme.
#[derive(Debug)]
pub(crate) struct LayerFringe {
planned_visits_by_lsn: BinaryHeap<LayerVisit>,
visit_reads: HashMap<LayerToVisitId, LayerVisitReads>,
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
layers: HashMap<LayerId, LayerKeyspace>,
}
#[derive(Debug)]
struct LayerVisitReads {
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: KeySpaceRandomAccum,
next_read_id: LayerKeyspaceReadId,
reads: HashMap<LayerKeyspaceReadId, (Range<Lsn>, KeySpace)>,
}
#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
struct LayerKeyspaceReadId(usize);
impl LayerFringe {
pub(crate) fn new() -> Self {
LayerFringe {
planned_visits_by_lsn: BinaryHeap::new(),
visit_reads: HashMap::new(),
planned_reads_by_lsn: BinaryHeap::new(),
layers: HashMap::new(),
}
}
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
let read_desc = match self.planned_visits_by_lsn.pop() {
let read_desc = match self.planned_reads_by_lsn.pop() {
Some(desc) => desc,
None => return None,
};
let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id);
let mut entry = match self.layers.entry(read_desc.layer_id) {
Entry::Occupied(o) => o,
Entry::Vacant(_) => unreachable!("fringe internals are always consistent"),
};
match removed {
Some((
_,
LayerVisitReads {
layer,
mut target_keyspace,
},
)) => Some((
layer,
target_keyspace.consume_keyspace(),
read_desc.lsn_range,
)),
None => unreachable!("fringe internals are always consistent"),
let (lsn_range, keyspace) = entry
.get_mut()
.reads
.remove(&read_desc.read_id)
.expect("fringe internals are always consistent");
let layer = entry.get().layer.clone();
if entry.get().reads.is_empty() {
entry.remove();
}
Some((layer, keyspace, lsn_range))
}
pub(crate) fn update(
@@ -361,26 +437,35 @@ impl LayerFringe {
keyspace: KeySpace,
lsn_range: Range<Lsn>,
) {
let layer_to_visit_id = LayerToVisitId {
layer_id: layer.id(),
lsn_floor: lsn_range.start,
};
let entry = self.visit_reads.entry(layer_to_visit_id.clone());
let layer_id = layer.id();
let entry = self.layers.entry(layer_id.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().target_keyspace.add_keyspace(keyspace);
let read_id = {
let r = &mut entry.get_mut().next_read_id;
let read_id = *r;
*r = LayerKeyspaceReadId(r.0 + 1);
read_id
};
self.planned_reads_by_lsn.push(ReadDesc {
lsn_range: lsn_range.clone(),
layer_id: layer_id.clone(),
read_id,
});
let replaced = entry.get_mut().reads.insert(read_id, (lsn_range, keyspace));
assert!(replaced.is_none());
}
Entry::Vacant(entry) => {
self.planned_visits_by_lsn.push(LayerVisit {
lsn_range,
layer_to_visit_id: layer_to_visit_id.clone(),
let read_id = LayerKeyspaceReadId(0);
self.planned_reads_by_lsn.push(ReadDesc {
lsn_range: lsn_range.clone(),
layer_id: layer_id.clone(),
read_id,
});
let mut accum = KeySpaceRandomAccum::new();
accum.add_keyspace(keyspace);
entry.insert(LayerVisitReads {
entry.insert(LayerKeyspace {
layer,
target_keyspace: accum,
next_read_id: LayerKeyspaceReadId(1),
reads: [(read_id, (lsn_range, keyspace))].into(),
});
}
}
@@ -393,7 +478,7 @@ impl Default for LayerFringe {
}
}
impl Ord for LayerVisit {
impl Ord for ReadDesc {
fn cmp(&self, other: &Self) -> Ordering {
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
if ord == std::cmp::Ordering::Equal {
@@ -404,19 +489,19 @@ impl Ord for LayerVisit {
}
}
impl PartialOrd for LayerVisit {
impl PartialOrd for ReadDesc {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for LayerVisit {
impl PartialEq for ReadDesc {
fn eq(&self, other: &Self) -> bool {
self.lsn_range == other.lsn_range
}
}
impl Eq for LayerVisit {}
impl Eq for ReadDesc {}
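Since `Ord` for `ReadDesc` compares `lsn_range.end`, the `BinaryHeap` in `LayerFringe` is a max-heap on the end LSN, so `next_layer` always pops the read with the highest end LSN first. A tiny illustration of that ordering with a stripped-down struct (names here are hypothetical, not the real `ReadDesc`):

use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::ops::Range;

#[derive(Debug, Eq, PartialEq)]
struct ByEndLsn {
    lsn_range: Range<u64>,
}

impl Ord for ByEndLsn {
    fn cmp(&self, other: &Self) -> Ordering {
        // Max-heap keyed on the end of the LSN range, like ReadDesc.
        self.lsn_range.end.cmp(&other.lsn_range.end)
    }
}

impl PartialOrd for ByEndLsn {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(ByEndLsn { lsn_range: 10..20 });
    heap.push(ByEndLsn { lsn_range: 10..50 });
    heap.push(ByEndLsn { lsn_range: 10..30 });
    // Pops 10..50 first: reads are visited from the highest end LSN downwards.
    assert_eq!(heap.pop().unwrap().lsn_range, 10..50);
}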
impl ReadableLayer {
pub(crate) fn id(&self) -> LayerId {

View File

@@ -42,13 +42,12 @@ use crate::tenant::vectored_blob_io::{
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadCoalesceMode, VectoredReadPlanner,
};
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
use crate::virtual_file::{self, VirtualFile};
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::BytesMut;
use anyhow::{bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use camino::{Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
@@ -58,14 +57,14 @@ use pageserver_api::models::ImageCompressionAlgorithm;
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::collections::{HashMap, VecDeque};
use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::OnceCell;
use tokio::sync::{self, OnceCell};
use tokio_epoll_uring::IoBuf;
use tracing::*;
@@ -224,7 +223,7 @@ pub struct DeltaLayerInner {
index_start_blk: u32,
index_root_blk: u32,
file: VirtualFile,
file: Arc<VirtualFile>,
file_id: FileId,
layer_key_range: Range<Key>,
@@ -788,9 +787,11 @@ impl DeltaLayerInner {
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path, ctx)
.await
.context("open layer file")?;
let file = Arc::new(
VirtualFile::open(path, ctx)
.await
.context("open layer file")?,
);
let file_id = page_cache::next_file_id();
@@ -841,6 +842,7 @@ impl DeltaLayerInner {
// can be further optimised to visit the index only once.
pub(super) async fn get_values_reconstruct_data(
&self,
self_desc: PersistentLayerDesc,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValuesReconstructState,
@@ -863,6 +865,7 @@ impl DeltaLayerInner {
let data_end_offset = self.index_start_offset();
let reads = Self::plan_reads(
self_desc,
&keyspace,
lsn_range.clone(),
data_end_offset,
@@ -883,6 +886,7 @@ impl DeltaLayerInner {
}
async fn plan_reads<Reader>(
self_desc: PersistentLayerDesc,
keyspace: &KeySpace,
lsn_range: Range<Lsn>,
data_end_offset: u64,
@@ -911,6 +915,8 @@ impl DeltaLayerInner {
let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
let blob_ref = BlobRef(value);
debug!(file = %self_desc.layer_name(), %key, %lsn, will_init = blob_ref.will_init(), "delta layer found key");
// Lsns are not monotonically increasing across keys, so we don't assert on them.
assert!(key >= range.start);
@@ -933,7 +939,7 @@ impl DeltaLayerInner {
range_end_handled = true;
break;
} else {
planner.handle(key, lsn, blob_ref.pos(), flag);
planner.handle(key, lsn, blob_ref.pos(), flag, blob_ref.will_init());
}
}
@@ -980,77 +986,59 @@ impl DeltaLayerInner {
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) {
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
let mut ignore_key_with_err = None;
let max_vectored_read_bytes = self
.max_vectored_read_bytes
.expect("Layer is loaded with max vectored bytes config")
.0
.into();
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
let mut buf = Some(BytesMut::with_capacity(buf_size));
// Note that reads are processed in reverse order (from highest key+lsn).
// This is the order that `ReconstructState` requires such that it can
// track when a key is done.
for read in reads.into_iter().rev() {
let res = vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"), ctx)
.await;
let blobs_buf = match res {
Ok(blobs_buf) => blobs_buf,
Err(err) => {
let kind = err.kind();
for (_, blob_meta) in read.blobs_at.as_slice() {
reconstruct_state.on_key_error(
blob_meta.key,
PageReconstructError::Other(anyhow!(
"Failed to read blobs from virtual file {}: {}",
self.file.path,
kind
)),
);
}
// We have "lost" the buffer since the lower level IO api
// doesn't return the buffer on error. Allocate a new one.
buf = Some(BytesMut::with_capacity(buf_size));
continue;
}
};
for meta in blobs_buf.blobs.iter().rev() {
if Some(meta.meta.key) == ignore_key_with_err {
continue;
}
let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
let value = match value {
Ok(v) => v,
Err(e) => {
reconstruct_state.on_key_error(
meta.meta.key,
PageReconstructError::Other(anyhow!(e).context(format!(
"Failed to deserialize blob from virtual file {}",
self.file.path,
))),
);
ignore_key_with_err = Some(meta.meta.key);
continue;
}
};
// Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
// state, no further updates shall be made to it. The call below will
// panic if the invariant is violated.
reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
let mut senders: HashMap<
(Key, Lsn),
sync::oneshot::Sender<Result<Bytes, std::io::Error>>,
> = Default::default();
for (_, blob_meta) in read.blobs_at.as_slice().iter().rev() {
let (tx, rx) = sync::oneshot::channel();
senders.insert((blob_meta.key, blob_meta.lsn), tx);
reconstruct_state.update_key(
&blob_meta.key,
blob_meta.lsn,
blob_meta.will_init,
rx,
);
}
buf = Some(blobs_buf.buf);
let read_from = self.file.clone();
let read_ctx = ctx.attached_child();
reconstruct_state.spawn_io(async move {
let vectored_blob_reader = VectoredBlobReader::new(&read_from);
let buf = BytesMut::with_capacity(buf_size);
let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
match res {
Ok(blobs_buf) => {
for meta in blobs_buf.blobs.iter().rev() {
let buf = &blobs_buf.buf[meta.start..meta.end];
let sender = senders
.remove(&(meta.meta.key, meta.meta.lsn))
.expect("sender must exist");
let _ = sender.send(Ok(Bytes::copy_from_slice(buf)));
}
assert!(senders.is_empty());
}
Err(err) => {
for (_, sender) in senders {
let _ = sender
.send(Err(std::io::Error::new(err.kind(), "vec read failed")));
}
}
}
});
}
}
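The rewritten read path above decouples planning from IO: for every planned blob a `oneshot` channel is registered, the receiver goes into the reconstruct state via `update_key`, and the spawned IO later fulfils each sender with the read bytes or an error. A self-contained sketch of that handoff with dummy integer keys (illustrative only, assuming the tokio crate):

use std::collections::HashMap;
use tokio::sync::oneshot;

#[tokio::main]
async fn main() {
    let mut senders: HashMap<u32, oneshot::Sender<Result<Vec<u8>, std::io::Error>>> =
        HashMap::new();
    let mut receivers = Vec::new();

    // Planning phase: one channel per blob we intend to read.
    for key in [1u32, 2, 3] {
        let (tx, rx) = oneshot::channel();
        senders.insert(key, tx);
        receivers.push((key, rx));
    }

    // IO phase: a spawned task resolves every pending read.
    tokio::spawn(async move {
        for (key, tx) in senders {
            let _ = tx.send(Ok(vec![key as u8]));
        }
    });

    // Consumption phase: awaiting the receivers mirrors what `convert` does per record.
    for (key, rx) in receivers {
        let bytes = rx.await.expect("sender not dropped").expect("io ok");
        println!("key {key} -> {bytes:?}");
    }
}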
@@ -1190,7 +1178,14 @@ impl DeltaLayerInner {
let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
let end_offset = offset;
Some((BlobMeta { key, lsn }, start_offset..end_offset))
Some((
BlobMeta {
key,
lsn,
will_init: false,
},
start_offset..end_offset,
))
} else {
None
};

View File

@@ -21,7 +21,7 @@
//!
//! Every image layer file consists of three parts: "summary",
//! "index", and "values". The summary is a fixed size header at the
//! beginning of the file, and it contains basic information about the
//! beginning of the file, and it contains basic information about the
//! layer, and offsets to the other parts. The "index" is a B-tree,
//! mapping from Key to an offset in the "values" part. The
//! actual page images are stored in the "values" part.
@@ -38,11 +38,11 @@ use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
};
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::virtual_file::{self, VirtualFile};
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{anyhow, bail, ensure, Context, Result};
use anyhow::{bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use camino::{Utf8Path, Utf8PathBuf};
use hex;
@@ -52,12 +52,14 @@ use pageserver_api::keyspace::KeySpace;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::collections::{HashMap, VecDeque};
use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::oneshot;
use tokio::sync::OnceCell;
use tokio_stream::StreamExt;
use tracing::*;
@@ -69,7 +71,9 @@ use utils::{
};
use super::layer_name::ImageLayerName;
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
use super::{
AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
};
///
/// Header stored in the beginning of the file
@@ -160,7 +164,7 @@ pub struct ImageLayerInner {
key_range: Range<Key>,
lsn: Lsn,
file: VirtualFile,
file: Arc<VirtualFile>,
file_id: FileId,
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
@@ -387,9 +391,11 @@ impl ImageLayerInner {
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path, ctx)
.await
.context("open layer file")?;
let file = Arc::new(
VirtualFile::open(path, ctx)
.await
.context("open layer file")?,
);
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader
@@ -435,12 +441,13 @@ impl ImageLayerInner {
// the reconstruct state with whatever is found.
pub(super) async fn get_values_reconstruct_data(
&self,
self_desc: PersistentLayerDesc,
keyspace: KeySpace,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
let reads = self
.plan_reads(keyspace, None, ctx)
.plan_reads(self_desc, keyspace, None, ctx)
.await
.map_err(GetVectoredError::Other)?;
@@ -459,6 +466,7 @@ impl ImageLayerInner {
/// this shard.
async fn plan_reads(
&self,
self_desc: PersistentLayerDesc,
keyspace: KeySpace,
shard_identity: Option<&ShardIdentity>,
ctx: &RequestContext,
@@ -502,12 +510,14 @@ impl ImageLayerInner {
BlobFlag::None
};
debug!(file = %self_desc.layer_name(), %key, %self.lsn, will_init=true, "image layer found key");
if key >= range.end {
planner.handle_range_end(offset);
range_end_handled = true;
break;
} else {
planner.handle(key, self.lsn, offset, flag);
planner.handle(key, self.lsn, offset, flag, true);
}
}
@@ -531,6 +541,7 @@ impl ImageLayerInner {
// Fragment the range into the regions owned by this ShardIdentity
let plan = self
.plan_reads(
todo!(),
KeySpace {
// If asked for the total key space, plan_reads will give us all the keys in the layer
ranges: vec![Key::MIN..Key::MAX],
@@ -576,8 +587,16 @@ impl ImageLayerInner {
.0
.into();
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
for read in reads.into_iter() {
let mut senders: HashMap<(Key, Lsn), oneshot::Sender<Result<Bytes, std::io::Error>>> =
Default::default();
for (_, blob_meta) in read.blobs_at.as_slice() {
let (tx, rx) = oneshot::channel();
senders.insert((blob_meta.key, blob_meta.lsn), tx);
reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true, rx);
}
let buf_size = read.size();
if buf_size > max_vectored_read_bytes {
@@ -596,36 +615,36 @@ impl ImageLayerInner {
);
}
let buf = BytesMut::with_capacity(buf_size);
let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
let read_from = self.file.clone();
let read_ctx = ctx.attached_child();
reconstruct_state.spawn_io(async move {
let buf = BytesMut::with_capacity(buf_size);
let vectored_blob_reader = VectoredBlobReader::new(&read_from);
let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
match res {
Ok(blobs_buf) => {
let frozen_buf = blobs_buf.buf.freeze();
match res {
Ok(blobs_buf) => {
for meta in blobs_buf.blobs.iter() {
let buf = &blobs_buf.buf[meta.start..meta.end];
let sender = senders
.remove(&(meta.meta.key, meta.meta.lsn))
.expect("sender must exist");
// TODO: this is silly - sort it out
let bytes = Value::ser(&Value::Image(Bytes::copy_from_slice(buf)))
.expect("stupid but correct");
let _ = sender.send(Ok(bytes.into()));
}
for meta in blobs_buf.blobs.iter() {
let img_buf = frozen_buf.slice(meta.start..meta.end);
reconstruct_state.update_key(
&meta.meta.key,
self.lsn,
Value::Image(img_buf),
);
assert!(senders.is_empty());
}
Err(err) => {
for (_, sender) in senders {
let _ = sender
.send(Err(std::io::Error::new(err.kind(), "vec read failed")));
}
}
}
Err(err) => {
let kind = err.kind();
for (_, blob_meta) in read.blobs_at.as_slice() {
reconstruct_state.on_key_error(
blob_meta.key,
PageReconstructError::from(anyhow!(
"Failed to read blobs from virtual file {}: {}",
self.file.path,
kind
)),
);
}
}
};
});
}
}
@@ -797,9 +816,10 @@ impl ImageLayerWriterInner {
///
async fn finish(
self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Option<Key>,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
) -> anyhow::Result<ResidentLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -875,9 +895,12 @@ impl ImageLayerWriterInner {
// fsync the file
file.sync_all().await?;
trace!("created image layer {}", self.path);
// FIXME: why not carry the virtualfile here, it supports renaming?
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
Ok((desc, self.path))
info!("created image layer {}", layer.local_path());
Ok(layer)
}
}
@@ -956,18 +979,24 @@ impl ImageLayerWriter {
///
pub(crate) async fn finish(
mut self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
self.inner.take().unwrap().finish(ctx, None).await
) -> anyhow::Result<super::ResidentLayer> {
self.inner.take().unwrap().finish(timeline, ctx, None).await
}
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
pub(super) async fn finish_with_end_key(
mut self,
timeline: &Arc<Timeline>,
end_key: Key,
ctx: &RequestContext,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
self.inner.take().unwrap().finish(ctx, Some(end_key)).await
) -> anyhow::Result<super::ResidentLayer> {
self.inner
.take()
.unwrap()
.finish(timeline, ctx, Some(end_key))
.await
}
}
@@ -1071,7 +1100,7 @@ mod test {
tenant::{
config::TenantConf,
harness::{TenantHarness, TIMELINE_ID},
storage_layer::{Layer, ResidentLayer},
storage_layer::ResidentLayer,
vectored_blob_io::StreamingVectoredReadPlanner,
Tenant, Timeline,
},
@@ -1142,8 +1171,7 @@ mod test {
key = key.next();
}
let (desc, path) = writer.finish(&ctx).await.unwrap();
Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap()
writer.finish(&timeline, &ctx).await.unwrap()
};
let original_size = resident.metadata().file_size;
@@ -1205,9 +1233,7 @@ mod test {
.await
.unwrap();
let replacement = if wrote_keys > 0 {
let (desc, path) = filtered_writer.finish(&ctx).await.unwrap();
let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap();
Some(resident)
Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
} else {
None
};
@@ -1280,8 +1306,7 @@ mod test {
for (key, img) in images {
writer.put_image(key, img, ctx).await?;
}
let (desc, path) = writer.finish(ctx).await?;
let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
let img_layer = writer.finish(tline, ctx).await?;
Ok::<_, anyhow::Error>(img_layer)
}

View File

@@ -10,10 +10,9 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::repository::{Key, Value};
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::{l0_flush, page_cache};
use anyhow::{anyhow, Context, Result};
use anyhow::{Context, Result};
use bytes::Bytes;
use camino::Utf8PathBuf;
use pageserver_api::key::CompactKey;
@@ -35,9 +34,7 @@ use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::atomic::{AtomicU64, AtomicUsize};
use tokio::sync::RwLock;
use super::{
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
};
use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
pub(crate) mod vectored_dio_read;
@@ -87,7 +84,7 @@ pub struct InMemoryLayerInner {
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
/// PerSeg::page_versions map stores offsets into this file.
file: EphemeralFile,
file: Arc<tokio::sync::RwLock<EphemeralFile>>,
resource_units: GlobalResourceUnits,
}
@@ -381,7 +378,11 @@ impl InMemoryLayer {
}
pub(crate) fn try_len(&self) -> Option<u64> {
self.inner.try_read().map(|i| i.file.len()).ok()
self.inner
.try_read()
.map(|i| i.file.try_read().map(|i| i.len()).ok())
.ok()
.flatten()
}
pub(crate) fn assert_writable(&self) {
@@ -432,6 +433,10 @@ impl InMemoryLayer {
read: vectored_dio_read::LogicalRead<Vec<u8>>,
}
let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
let mut senders: HashMap<
(Key, Lsn),
tokio::sync::oneshot::Sender<Result<Bytes, std::io::Error>>,
> = Default::default();
for range in keyspace.ranges.iter() {
for (key, vec_map) in inner
@@ -459,6 +464,13 @@ impl InMemoryLayer {
Vec::with_capacity(len as usize),
),
});
let (tx, rx) = tokio::sync::oneshot::channel();
senders.insert((key, *entry_lsn), tx);
reconstruct_state.update_key(&key, *entry_lsn, will_init, rx);
debug!(%key, %entry_lsn, will_init, "inmemory layer found key");
if will_init {
break;
}
@@ -466,46 +478,39 @@ impl InMemoryLayer {
}
}
// Execute the reads.
let read_from = inner.file.clone();
let read_ctx = ctx.attached_child();
reconstruct_state.spawn_io(async move {
let locked = read_from.read().await;
let f = vectored_dio_read::execute(
&*locked,
reads
.iter()
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
&read_ctx,
);
send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
.await;
let f = vectored_dio_read::execute(
&inner.file,
reads
.iter()
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
&ctx,
);
send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
.await;
// Process results into the reconstruct state
'next_key: for (key, value_reads) in reads {
for ValueRead { entry_lsn, read } in value_reads {
match read.into_result().expect("we run execute() above") {
Err(e) => {
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
continue 'next_key;
}
Ok(value_buf) => {
let value = Value::des(&value_buf);
if let Err(e) = value {
reconstruct_state
.on_key_error(key, PageReconstructError::from(anyhow!(e)));
continue 'next_key;
for (key, value_reads) in reads {
for ValueRead { entry_lsn, read } in value_reads {
let sender = senders
.remove(&(key, entry_lsn))
.expect("sender must exist");
match read.into_result().expect("we run execute() above") {
Err(e) => {
let _ = sender
.send(Err(std::io::Error::new(e.kind(), "dio vec read failed")));
}
let key_situation =
reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
if key_situation == ValueReconstructSituation::Complete {
// TODO: metric to see if we fetched more values than necessary
continue 'next_key;
Ok(value_buf) => {
let _ = sender.send(Ok(value_buf.into()));
}
// process the next value in the next iteration of the loop
}
}
}
}
assert!(senders.is_empty());
});
reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);
@@ -600,7 +605,8 @@ impl InMemoryLayer {
/// Get layer size.
pub async fn size(&self) -> Result<u64> {
let inner = self.inner.read().await;
Ok(inner.file.len())
let locked = inner.file.try_read().expect("no contention");
Ok(locked.len())
}
/// Create a new, empty, in-memory layer
@@ -614,9 +620,10 @@ impl InMemoryLayer {
) -> Result<InMemoryLayer> {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file =
EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
let key = InMemoryLayerFileId(file.page_cache_file_id());
let file = Arc::new(tokio::sync::RwLock::new(
EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?,
));
let key = InMemoryLayerFileId(file.read().await.page_cache_file_id());
Ok(InMemoryLayer {
file_id: key,
@@ -648,7 +655,7 @@ impl InMemoryLayer {
let mut inner = self.inner.write().await;
self.assert_writable();
let base_offset = inner.file.len();
let base_offset = inner.file.read().await.len();
let SerializedBatch {
raw,
@@ -672,8 +679,13 @@ impl InMemoryLayer {
}
// Write the batch to the file
inner.file.write_raw(&raw, ctx).await?;
let new_size = inner.file.len();
// FIXME: can't borrow arc
let new_size = {
let mut locked = inner.file.write().await;
locked.write_raw(&raw, ctx).await?;
locked.len()
};
let expected_new_len = base_offset
.checked_add(raw.len().into_u64())
// write_raw would error if we were to overflow u64.
@@ -713,7 +725,7 @@ impl InMemoryLayer {
pub(crate) async fn tick(&self) -> Option<u64> {
let mut inner = self.inner.write().await;
let size = inner.file.len();
let size = inner.file.read().await.len();
inner.resource_units.publish_size(size)
}
@@ -809,7 +821,7 @@ impl InMemoryLayer {
match l0_flush_global_state {
l0_flush::Inner::Direct { .. } => {
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
let file_contents: Vec<u8> = inner.file.read().await.load_to_vec(ctx).await?;
let file_contents = Bytes::from(file_contents);

View File

@@ -439,30 +439,11 @@ impl Layer {
fn record_access(&self, ctx: &RequestContext) {
if self.0.access_stats.record_access(ctx) {
// Visibility was modified to Visible: maybe log about this
match ctx.task_kind() {
TaskKind::CalculateSyntheticSize
| TaskKind::GarbageCollector
| TaskKind::MgmtRequest => {
// This situation is expected in code paths do binary searches of the LSN space to resolve
// an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
// and on-demand for certain HTTP API requests.
}
_ => {
// In all other contexts, it is unusual to do I/O involving layers which are not visible at
// some branch tip, so we log the fact that we are accessing something that the visibility
// calculation thought should not be visible.
//
// This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
// which was covered by a concurrent compaction.
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
);
}
}
// Update the timeline's visible bytes count
// Visibility was modified to Visible
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
);
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge
@@ -1774,11 +1755,17 @@ impl DownloadedLayer {
.map_err(GetVectoredError::Other)?
{
Delta(d) => {
d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
.await
d.get_values_reconstruct_data(
owner.desc.clone(),
keyspace,
lsn_range,
reconstruct_data,
ctx,
)
.await
}
Image(i) => {
i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx)
i.get_values_reconstruct_data(owner.desc.clone(), keyspace, reconstruct_data, ctx)
.await
}
}

View File

@@ -107,6 +107,8 @@ async fn smoke_test() {
.expect("tenant harness writes the control file")
};
let img_before = (img_before.0, img_before.1.await.unwrap().unwrap());
let img_after = (img_after.0, img_after.1.await.unwrap().unwrap());
assert_eq!(img_before, img_after);
// evict_and_wait can timeout, but it doesn't cancel the evicting itself
@@ -1025,15 +1027,6 @@ fn access_stats() {
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
// Recording access implicitly makes layer visible, if it wasn't already
let atime = UNIX_EPOCH + Duration::from_secs(2200000000);
access_stats.set_visibility(LayerVisibilityHint::Covered);
assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
assert!(access_stats.record_access_at(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
assert!(!access_stats.record_access_at(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
}
#[test]

View File

@@ -121,11 +121,11 @@ impl SplitImageLayerWriter {
self.generated_layers
.push(SplitWriterResult::Discarded(layer_key));
} else {
let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;
let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
self.generated_layers
.push(SplitWriterResult::Produced(layer));
self.generated_layers.push(SplitWriterResult::Produced(
prev_image_writer
.finish_with_end_key(tline, key, ctx)
.await?,
));
}
}
self.inner.put_image(key, img, ctx).await
@@ -170,9 +170,9 @@ impl SplitImageLayerWriter {
if discard(&layer_key).await {
generated_layers.push(SplitWriterResult::Discarded(layer_key));
} else {
let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
generated_layers.push(SplitWriterResult::Produced(layer));
generated_layers.push(SplitWriterResult::Produced(
inner.finish_with_end_key(tline, end_key, ctx).await?,
));
}
Ok(generated_layers)
}
@@ -188,7 +188,7 @@ impl SplitImageLayerWriter {
.await
}
/// This function will be deprecated with #8841.
/// When split writer fails, the caller should call this function and handle partially generated layers.
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
Ok((self.generated_layers, self.inner))
}
@@ -204,7 +204,7 @@ impl SplitImageLayerWriter {
/// will split them into multiple files based on size.
#[must_use]
pub struct SplitDeltaLayerWriter {
inner: Option<(Key, DeltaLayerWriter)>,
inner: DeltaLayerWriter,
target_layer_size: u64,
generated_layers: Vec<SplitWriterResult>,
conf: &'static PageServerConf,
@@ -212,6 +212,7 @@ pub struct SplitDeltaLayerWriter {
tenant_shard_id: TenantShardId,
lsn_range: Range<Lsn>,
last_key_written: Key,
start_key: Key,
}
impl SplitDeltaLayerWriter {
@@ -219,18 +220,29 @@ impl SplitDeltaLayerWriter {
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
start_key: Key,
lsn_range: Range<Lsn>,
target_layer_size: u64,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
Ok(Self {
target_layer_size,
inner: None,
inner: DeltaLayerWriter::new(
conf,
timeline_id,
tenant_shard_id,
start_key,
lsn_range.clone(),
ctx,
)
.await?,
generated_layers: Vec::new(),
conf,
timeline_id,
tenant_shard_id,
lsn_range,
last_key_written: Key::MIN,
start_key,
})
}
@@ -253,26 +265,9 @@ impl SplitDeltaLayerWriter {
//
// Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction
// strategy. https://github.com/neondatabase/neon/issues/8837
if self.inner.is_none() {
self.inner = Some((
key,
DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
key,
self.lsn_range.clone(),
ctx,
)
.await?,
));
}
let (_, inner) = self.inner.as_mut().unwrap();
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
if inner.num_keys() >= 1
&& inner.estimated_size() + addition_size_estimation >= self.target_layer_size
if self.inner.num_keys() >= 1
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
{
if key != self.last_key_written {
let next_delta_writer = DeltaLayerWriter::new(
@@ -284,13 +279,13 @@ impl SplitDeltaLayerWriter {
ctx,
)
.await?;
let (start_key, prev_delta_writer) =
std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
let layer_key = PersistentLayerKey {
key_range: start_key..key,
key_range: self.start_key..key,
lsn_range: self.lsn_range.clone(),
is_delta: true,
};
self.start_key = key;
if discard(&layer_key).await {
drop(prev_delta_writer);
self.generated_layers
@@ -301,18 +296,17 @@ impl SplitDeltaLayerWriter {
self.generated_layers
.push(SplitWriterResult::Produced(delta_layer));
}
} else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
} else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT {
// We have to produce a very large file b/c a key is updated too often.
anyhow::bail!(
"a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
key,
inner.estimated_size()
self.inner.estimated_size()
);
}
}
self.last_key_written = key;
let (_, inner) = self.inner.as_mut().unwrap();
inner.put_value(key, lsn, val, ctx).await
self.inner.put_value(key, lsn, val, ctx).await
}
pub async fn put_value(
@@ -331,6 +325,7 @@ impl SplitDeltaLayerWriter {
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
discard: D,
) -> anyhow::Result<Vec<SplitWriterResult>>
where
@@ -342,15 +337,11 @@ impl SplitDeltaLayerWriter {
inner,
..
} = self;
let Some((start_key, inner)) = inner else {
return Ok(generated_layers);
};
if inner.num_keys() == 0 {
return Ok(generated_layers);
}
let end_key = self.last_key_written.next();
let layer_key = PersistentLayerKey {
key_range: start_key..end_key,
key_range: self.start_key..end_key,
lsn_range: self.lsn_range.clone(),
is_delta: true,
};
@@ -369,14 +360,15 @@ impl SplitDeltaLayerWriter {
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
) -> anyhow::Result<Vec<SplitWriterResult>> {
self.finish_with_discard_fn(tline, ctx, |_| async { false })
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
.await
}
/// This function will be deprecated with #8841.
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, Option<DeltaLayerWriter>)> {
Ok((self.generated_layers, self.inner.map(|x| x.1)))
/// When split writer fails, the caller should call this function and handle partially generated layers.
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> {
Ok((self.generated_layers, self.inner))
}
}
@@ -440,8 +432,10 @@ mod tests {
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18)..Lsn(0x20),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
@@ -466,22 +460,11 @@ mod tests {
)
.await
.unwrap();
let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
let layers = delta_writer
.finish(&tline, &ctx, get_key(10))
.await
.unwrap();
assert_eq!(layers.len(), 1);
assert_eq!(
layers
.into_iter()
.next()
.unwrap()
.into_resident_layer()
.layer_desc()
.key(),
PersistentLayerKey {
key_range: get_key(0)..get_key(1),
lsn_range: Lsn(0x18)..Lsn(0x20),
is_delta: true
}
);
}
#[tokio::test]
@@ -518,8 +501,10 @@ mod tests {
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18)..Lsn(0x20),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
@@ -548,7 +533,10 @@ mod tests {
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
let delta_layers = delta_writer
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
if discard {
for layer in image_layers {
layer.into_discarded_layer();
@@ -567,14 +555,6 @@ mod tests {
.collect_vec();
assert_eq!(image_layers.len(), N / 512 + 1);
assert_eq!(delta_layers.len(), N / 512 + 1);
assert_eq!(
delta_layers.first().unwrap().layer_desc().key_range.start,
get_key(0)
);
assert_eq!(
delta_layers.last().unwrap().layer_desc().key_range.end,
get_key(N as u32)
);
for idx in 0..image_layers.len() {
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
@@ -622,8 +602,10 @@ mod tests {
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18)..Lsn(0x20),
4 * 1024,
&ctx,
)
.await
.unwrap();
@@ -662,35 +644,11 @@ mod tests {
)
.await
.unwrap();
let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
let layers = delta_writer
.finish(&tline, &ctx, get_key(10))
.await
.unwrap();
assert_eq!(layers.len(), 2);
let mut layers_iter = layers.into_iter();
assert_eq!(
layers_iter
.next()
.unwrap()
.into_resident_layer()
.layer_desc()
.key(),
PersistentLayerKey {
key_range: get_key(0)..get_key(1),
lsn_range: Lsn(0x18)..Lsn(0x20),
is_delta: true
}
);
assert_eq!(
layers_iter
.next()
.unwrap()
.into_resident_layer()
.layer_desc()
.key(),
PersistentLayerKey {
key_range: get_key(1)..get_key(2),
lsn_range: Lsn(0x18)..Lsn(0x20),
is_delta: true
}
);
}
#[tokio::test]
@@ -710,8 +668,10 @@ mod tests {
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
@@ -729,20 +689,10 @@ mod tests {
.await
.unwrap();
}
let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
let delta_layers = delta_writer
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
assert_eq!(delta_layers.len(), 1);
let delta_layer = delta_layers
.into_iter()
.next()
.unwrap()
.into_resident_layer();
assert_eq!(
delta_layer.layer_desc().key(),
PersistentLayerKey {
key_range: get_key(0)..get_key(1),
lsn_range: Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
is_delta: true
}
);
}
}

View File

@@ -163,6 +163,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
// How many errors we have seen consecutively
let mut error_run_count = 0;
let mut last_throttle_flag_reset_at = Instant::now();
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -189,6 +191,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
}
let sleep_duration;
if period == Duration::ZERO {
#[cfg(not(feature = "testing"))]
@@ -203,18 +207,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
};
// Run compaction
let IterationResult { output, elapsed } = iteration
.run(tenant.compaction_iteration(&cancel, &ctx))
.await;
let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
match output {
Ok(has_pending_task) => {
error_run_count = 0;
// schedule the next compaction immediately in case there is a pending compaction task
sleep_duration = if has_pending_task {
Duration::ZERO
} else {
period
};
sleep_duration = if has_pending_task { Duration::ZERO } else { period };
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
@@ -235,20 +233,38 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
// the duration is recorded by performance tests by enabling debug in this function
tracing::debug!(
elapsed_ms = elapsed.as_millis(),
"compaction iteration complete"
);
tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
};
// Perhaps we did no work and the walredo process has been idle for some time:
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
// TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
if let Some(walredo_mgr) = &tenant.walredo_mgr {
walredo_mgr.maybe_quiesce(period * 10);
}
// TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
let now = Instant::now();
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
if count_throttled == 0 {
return;
}
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
let delta = now - prev;
info!(
n_seconds=%format_args!("{:.3}",
delta.as_secs_f64()),
count_accounted,
count_throttled,
sum_throttled_usecs,
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds"
);
});
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
@@ -330,7 +346,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let mut first = true;
tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
loop {
tokio::select! {
_ = cancel.cancelled() => {
@@ -348,6 +363,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
first = false;
let delays = async {
delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
random_init_delay(period, &cancel).await?;
Ok::<_, Cancelled>(())
};
@@ -421,7 +437,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let mut last_throttle_flag_reset_at = Instant::now();
loop {
tokio::select! {
_ = cancel.cancelled() => {
@@ -468,29 +483,6 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
kind: BackgroundLoopKind::IngestHouseKeeping,
};
iteration.run(tenant.ingest_housekeeping()).await;
// TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
// Or just spawn another background loop for this throttle, it's not like it's super costly.
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
let now = Instant::now();
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats();
if count_throttled == 0 {
return;
}
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
let delta = now - prev;
info!(
n_seconds=%format_args!("{:.3}",
delta.as_secs_f64()),
count_accounted = count_accounted_finish, // don't break existing log scraping
count_throttled,
sum_throttled_usecs,
count_accounted_start, // log after pre-existing fields to not break existing log scraping
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds"
);
});
}
}
.await;
@@ -546,12 +538,28 @@ pub(crate) async fn random_init_delay(
let mut rng = rand::thread_rng();
rng.gen_range(Duration::ZERO..=period)
};
match tokio::time::timeout(d, cancel.cancelled()).await {
Ok(_) => Err(Cancelled),
Err(_) => Ok(()),
}
}
/// Delays GC by the default lease length at restart.
///
/// We do this because the lease mappings are not persisted to disk. By delaying GC by the
/// default lease length, we guarantee that all the leases we granted before the restart will
/// expire when we run GC for the first time after the restart.
pub(crate) async fn delay_by_lease_length(
length: Duration,
cancel: &CancellationToken,
) -> Result<(), Cancelled> {
match tokio::time::timeout(length, cancel.cancelled()).await {
Ok(_) => Err(Cancelled),
Err(_) => Ok(()),
}
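Both `random_init_delay` and `delay_by_lease_length` rely on the same idiom: race the delay against cancellation with `tokio::time::timeout`, where a completed cancellation maps to `Err(Cancelled)` and an elapsed timeout maps to `Ok(())`. A hedged standalone version of the idiom, assuming the tokio and tokio-util crates (`Cancelled` here is a local stand-in, not the crate's type):

use std::time::Duration;
use tokio_util::sync::CancellationToken;

struct Cancelled;

async fn cancellable_delay(d: Duration, cancel: &CancellationToken) -> Result<(), Cancelled> {
    match tokio::time::timeout(d, cancel.cancelled()).await {
        // The token fired before the delay elapsed: propagate cancellation.
        Ok(_) => Err(Cancelled),
        // The timeout elapsed: the delay completed normally.
        Err(_) => Ok(()),
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    // With no cancellation, this simply sleeps for the full duration.
    assert!(cancellable_delay(Duration::from_millis(10), &cancel).await.is_ok());
}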
}
struct Iteration {
started_at: Instant,
period: Duration,

View File

@@ -24,10 +24,8 @@ use crate::{context::RequestContext, task_mgr::TaskKind};
pub struct Throttle<M: Metric> {
inner: ArcSwap<Inner>,
metric: M,
/// will be turned into [`Stats::count_accounted_start`]
count_accounted_start: AtomicU64,
/// will be turned into [`Stats::count_accounted_finish`]
count_accounted_finish: AtomicU64,
/// will be turned into [`Stats::count_accounted`]
count_accounted: AtomicU64,
/// will be turned into [`Stats::count_throttled`]
count_throttled: AtomicU64,
/// will be turned into [`Stats::sum_throttled_usecs`]
@@ -45,21 +43,17 @@ pub struct Observation {
pub wait_time: Duration,
}
pub trait Metric {
fn accounting_start(&self);
fn accounting_finish(&self);
fn observe_throttling(&self, observation: &Observation);
}
/// See [`Throttle::reset_stats`].
pub struct Stats {
/// Number of requests that started [`Throttle::throttle`] calls.
pub count_accounted_start: u64,
/// Number of requests that finished [`Throttle::throttle`] calls.
pub count_accounted_finish: u64,
/// Subset of the `accounted` requests that were actually throttled.
/// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
// Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
pub count_accounted: u64,
// Subset of the `accounted` requests that were actually throttled.
// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
pub count_throttled: u64,
/// Sum of microseconds that throttled requests spent waiting for throttling.
// Sum of microseconds that throttled requests spent waiting for throttling.
pub sum_throttled_usecs: u64,
}
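After this revert, the throttle keeps a single `count_accounted` atomic that is incremented once per call and drained with `swap(0, ..)` in `reset_stats`. The drain-and-reset idiom in isolation (illustrative counter only, not the real `Throttle`):

use std::sync::atomic::{AtomicU64, Ordering};

static COUNT_ACCOUNTED: AtomicU64 = AtomicU64::new(0);

fn record() {
    // One increment per throttle() call, matching the merged counter.
    COUNT_ACCOUNTED.fetch_add(1, Ordering::Relaxed);
}

fn reset_stats() -> u64 {
    // Drain the counter: return everything accumulated since the last reset and zero it.
    COUNT_ACCOUNTED.swap(0, Ordering::Relaxed)
}

fn main() {
    record();
    record();
    assert_eq!(reset_stats(), 2);
    assert_eq!(reset_stats(), 0);
}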
@@ -71,8 +65,7 @@ where
Self {
inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
metric,
count_accounted_start: AtomicU64::new(0),
count_accounted_finish: AtomicU64::new(0),
count_accounted: AtomicU64::new(0),
count_throttled: AtomicU64::new(0),
sum_throttled_usecs: AtomicU64::new(0),
}
@@ -124,13 +117,11 @@ where
/// This method allows retrieving & resetting that flag.
/// Useful for periodic reporting.
pub fn reset_stats(&self) -> Stats {
let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed);
let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed);
let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
Stats {
count_accounted_start,
count_accounted_finish,
count_accounted,
count_throttled,
sum_throttled_usecs,
}
@@ -148,12 +139,9 @@ where
};
let start = std::time::Instant::now();
self.metric.accounting_start();
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
let did_throttle = inner.rate_limiter.acquire(key_count).await;
self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
self.metric.accounting_finish();
self.count_accounted.fetch_add(1, Ordering::Relaxed);
if did_throttle {
self.count_throttled.fetch_add(1, Ordering::Relaxed);
let now = Instant::now();

View File

@@ -18,6 +18,7 @@ use camino::Utf8Path;
use chrono::{DateTime, Utc};
use enumset::EnumSet;
use fail::fail_point;
use futures::{stream::FuturesUnordered, StreamExt};
use handle::ShardTimelineId;
use once_cell::sync::Lazy;
use pageserver_api::{
@@ -48,7 +49,6 @@ use utils::{
sync::gate::{Gate, GateGuard},
};
use std::pin::pin;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
use std::time::{Duration, Instant, SystemTime};
@@ -62,13 +62,16 @@ use std::{
collections::btree_map::Entry,
ops::{Deref, Range},
};
use std::{pin::pin, sync::atomic::AtomicUsize};
use crate::{
aux_file::AuxFileSizeEstimator,
tenant::{
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
storage_layer::{
convert, inmemory_layer::IndexEntry, PersistentLayerDesc, ValueReconstructSituation,
},
},
walredo,
};
@@ -196,8 +199,9 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: RemoteTimelineClient,
pub timeline_get_throttle:
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
pub timeline_get_throttle: Arc<
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
>,
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
}
@@ -405,8 +409,9 @@ pub struct Timeline {
gc_lock: tokio::sync::Mutex<()>,
/// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
timeline_get_throttle:
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
timeline_get_throttle: Arc<
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
>,
/// Keep aux directory cache to avoid it's reconstruction on each update
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
@@ -563,7 +568,7 @@ impl From<layer_manager::Shutdown> for GetVectoredError {
#[derive(thiserror::Error)]
pub struct MissingKeyError {
key: Key,
keys: KeySpace,
shard: ShardNumber,
cont_lsn: Lsn,
request_lsn: Lsn,
@@ -581,8 +586,8 @@ impl std::fmt::Display for MissingKeyError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
self.key, self.shard, self.cont_lsn, self.request_lsn
"could not find data for keys (shard {:?}) at LSN {}, request LSN {} keys {}",
self.shard, self.cont_lsn, self.request_lsn, self.keys
)?;
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
write!(f, ", ancestor {}", ancestor_lsn)?;
@@ -950,7 +955,11 @@ impl Timeline {
}
}
None => Err(PageReconstructError::MissingKey(MissingKeyError {
key,
keys: {
let mut accum = KeySpaceAccum::new();
accum.add_key(key);
accum.to_keyspace()
},
shard: self.shard_identity.get_shard_number(&key),
cont_lsn: Lsn(0),
request_lsn: lsn,
@@ -1120,29 +1129,53 @@ impl Timeline {
let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
.for_get_kind(get_kind)
.start_timer();
static INVOCATION: Lazy<AtomicUsize> = Lazy::new(|| AtomicUsize::new(0));
let invocation = INVOCATION.fetch_add(1, AtomicOrdering::Relaxed);
self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
.await?;
.instrument(debug_span!("get_vectored_reconstruct_data", invocation))
.await
.map_err(|err| {
anyhow::anyhow!(
"get_vectored_reconstruct_data invocation {invocation} lsn={lsn} keyspace={keyspace} {err:?}",
)
})?;
get_data_timer.stop_and_record();
let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME
.for_get_kind(get_kind)
.start_timer();
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
let layers_visited = reconstruct_state.get_layers_visited();
let futs = FuturesUnordered::new();
for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
match res {
Err(err) => {
results.insert(key, Err(err));
}
Ok(state) => {
let state = ValueReconstructState::from(state);
futs.push({
let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
async move {
let state = res.expect("Read path is infallible");
assert!(matches!(
state.situation,
ValueReconstructSituation::Complete
));
let reconstruct_res = self.reconstruct_value(key, lsn, state).await;
results.insert(key, reconstruct_res);
let converted = match convert(key, state).await {
Ok(ok) => ok,
Err(err) => {
return (key, Err(err));
}
};
(
key,
walredo_self.reconstruct_value(key, lsn, converted).await,
)
}
}
});
}
let results = futs
.collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
.await;
reconstruct_timer.stop_and_record();
// For aux file keys (v1 or v2) the vectored read path does not return an error
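Reconstruction is now fanned out per key: each key becomes a future that converts the gathered records and then runs walredo, and `FuturesUnordered` collects the `(key, result)` pairs back into a `BTreeMap`. A simplified sketch of that collect pattern with trivial work standing in for walredo (assumes the futures and tokio crates):

use std::collections::BTreeMap;
use futures::{stream::FuturesUnordered, StreamExt};

#[tokio::main]
async fn main() {
    let futs = FuturesUnordered::new();
    for key in 0u32..4 {
        futs.push(async move {
            // Placeholder for convert() + reconstruct_value() per key.
            (key, Ok::<_, std::io::Error>(key * 2))
        });
    }
    // Results arrive in completion order, but the BTreeMap restores key order.
    let results: BTreeMap<u32, Result<u32, std::io::Error>> = futs.collect().await;
    assert_eq!(results.len(), 4);
}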
@@ -3118,7 +3151,7 @@ impl Timeline {
if let Some(missing_keyspace) = missing_keyspace {
return Err(GetVectoredError::MissingKey(MissingKeyError {
key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
keys: missing_keyspace.clone(),
shard: self
.shard_identity
.get_shard_number(&missing_keyspace.start().unwrap()),
@@ -4011,9 +4044,7 @@ impl Timeline {
if wrote_keys {
// Normal path: we have written some data into the new image layer for this
// partition, so flush it to disk.
let (desc, path) = image_layer_writer.finish(ctx).await?;
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
info!("created image layer for rel {}", image_layer.local_path());
let image_layer = image_layer_writer.finish(self, ctx).await?;
Ok(ImageLayerCreationOutcome {
image: Some(image_layer),
next_start_key: img_range.end,
@@ -4101,12 +4132,7 @@ impl Timeline {
if wrote_any_image {
// Normal path: we have written some data into the new image layer for this
// partition, so flush it to disk.
let (desc, path) = image_layer_writer.finish(ctx).await?;
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
info!(
"created image layer for metadata {}",
image_layer.local_path()
);
let image_layer = image_layer_writer.finish(self, ctx).await?;
Ok(ImageLayerCreationOutcome {
image: Some(image_layer),
next_start_key: img_range.end,
@@ -4314,9 +4340,7 @@ impl Timeline {
timer.stop_and_record();
// Creating image layers may have caused some previously visible layers to be covered
if !image_layers.is_empty() {
self.update_layer_visibility().await?;
}
self.update_layer_visibility().await?;
Ok(image_layers)
}
@@ -5378,8 +5402,7 @@ impl Timeline {
/// Force create an image layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
/// placed into the layer map in one run AND be validated.
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
#[cfg(test)]
pub(super) async fn force_create_image_layer(
self: &Arc<Timeline>,
@@ -5411,9 +5434,8 @@ impl Timeline {
for (key, img) in images {
image_layer_writer.put_image(key, img, ctx).await?;
}
let (desc, path) = image_layer_writer.finish(ctx).await?;
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
info!("force created image layer {}", image_layer.local_path());
let image_layer = image_layer_writer.finish(self, ctx).await?;
{
let mut guard = self.layers.write().await;
guard.open_mut().unwrap().force_insert_layer(image_layer);
@@ -5425,8 +5447,7 @@ impl Timeline {
/// Force create a delta layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
/// placed into the layer map in one run AND be validated.
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
#[cfg(test)]
pub(super) async fn force_create_delta_layer(
self: &Arc<Timeline>,
@@ -5452,6 +5473,33 @@ impl Timeline {
if let Some(check_start_lsn) = check_start_lsn {
assert!(deltas.lsn_range.start >= check_start_lsn);
}
// Check that the delta layer does not violate the LSN invariant: legacy compaction always
// produces a batch of layers with the same start/end LSN, and so should the force-inserted layer.
{
/// Checks whether a overlaps with b, assuming a/b = [start, end).
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
!(a.end <= b.start || b.end <= a.start)
}
if deltas.key_range.start.next() != deltas.key_range.end {
let guard = self.layers.read().await;
let mut invalid_layers =
guard.layer_map()?.iter_historic_layers().filter(|layer| {
layer.is_delta()
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
&& layer.lsn_range != deltas.lsn_range
// skip single-key layer files
&& layer.key_range.start.next() != layer.key_range.end
});
if let Some(layer) = invalid_layers.next() {
// If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
panic!(
"inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
);
}
}
}
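// Illustrative sketch (not part of this patch): behaviour of the half-open-range overlap
// predicate used in the invariant check above. The helper is reproduced standalone so the
// snippet compiles on its own; ranges are treated as [start, end), so endpoints that merely
// touch do not count as overlap.
use std::ops::Range;

fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    !(a.end <= b.start || b.end <= a.start)
}

fn main() {
    assert!(overlaps_with(&(0..10), &(5..15)));   // proper intersection
    assert!(!overlaps_with(&(0..10), &(10..20))); // endpoints only touch: no overlap
    assert!(overlaps_with(&(0..10), &(3..4)));    // containment counts as overlap
}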
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
@@ -5466,7 +5514,7 @@ impl Timeline {
}
let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
info!("force created delta layer {}", delta_layer.local_path());
{
let mut guard = self.layers.write().await;
guard.open_mut().unwrap().force_insert_layer(delta_layer);
@@ -5479,30 +5527,30 @@ impl Timeline {
#[cfg(test)]
pub(crate) async fn inspect_image_layers(
self: &Arc<Timeline>,
lsn: Lsn,
ctx: &RequestContext,
_lsn: Lsn,
_ctx: &RequestContext,
) -> anyhow::Result<Vec<(Key, Bytes)>> {
let mut all_data = Vec::new();
let guard = self.layers.read().await;
for layer in guard.layer_map()?.iter_historic_layers() {
if !layer.is_delta() && layer.image_layer_lsn() == lsn {
let layer = guard.get_from_desc(&layer);
let mut reconstruct_data = ValuesReconstructState::default();
layer
.get_values_reconstruct_data(
KeySpace::single(Key::MIN..Key::MAX),
lsn..Lsn(lsn.0 + 1),
&mut reconstruct_data,
ctx,
)
.await?;
for (k, v) in reconstruct_data.keys {
all_data.push((k, v?.img.unwrap().1));
}
}
}
all_data.sort();
Ok(all_data)
// let mut all_data = Vec::new();
// let guard = self.layers.read().await;
// for layer in guard.layer_map()?.iter_historic_layers() {
// if !layer.is_delta() && layer.image_layer_lsn() == lsn {
// let layer = guard.get_from_desc(&layer);
// let mut reconstruct_data = ValuesReconstructState::default();
// layer
// .get_values_reconstruct_data(
// KeySpace::single(Key::MIN..Key::MAX),
// lsn..Lsn(lsn.0 + 1),
// &mut reconstruct_data,
// ctx,
// )
// .await?;
// for (k, v) in reconstruct_data.keys {
// all_data.push((k, v?.img.unwrap().1));
// }
// }
// }
// all_data.sort();
Ok(Vec::new())
}
/// Get all historic layer descriptors in the layer map


@@ -29,7 +29,6 @@ use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::split_writer::{
@@ -564,12 +563,10 @@ impl Timeline {
.await?;
if keys_written > 0 {
let (desc, path) = image_layer_writer
.finish(ctx)
let new_layer = image_layer_writer
.finish(self, ctx)
.await
.map_err(CompactionError::Other)?;
let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
.map_err(CompactionError::Other)?;
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
layer.metadata().file_size,
new_layer.metadata().file_size);
@@ -1789,12 +1786,20 @@ impl Timeline {
stat.visit_image_layer(desc.file_size());
}
}
let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
.iter()
.map(|layer| layer.layer_desc().layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
bail!("cannot run gc-compaction because {}", err);
for layer in &layer_selection {
let desc = layer.layer_desc();
let key_range = &desc.key_range;
if desc.is_delta() && key_range.start.next() != key_range.end {
let lsn_range = desc.lsn_range.clone();
let intersects = lsn_split_point.range(lsn_range).collect_vec();
if intersects.len() > 1 {
bail!(
"cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
desc.key(),
intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
);
}
}
}
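// Illustrative sketch (not from this patch): how the split-point check above rejects layers,
// assuming `lsn_split_point` is an ordered set of LSNs (modelled here as a BTreeSet<u64>, a
// hypothetical stand-in for the Lsn type). A multi-key delta layer whose LSN range contains
// more than one split point straddles a split boundary, which gc-compaction bails on above.
use std::collections::BTreeSet;
use std::ops::Range;

fn violates_split_assumption(split_points: &BTreeSet<u64>, lsn_range: Range<u64>) -> bool {
    // Count the split points that fall inside the layer's half-open LSN range.
    split_points.range(lsn_range).count() > 1
}

fn main() {
    let splits: BTreeSet<u64> = [100, 200, 300].into_iter().collect();
    assert!(!violates_split_assumption(&splits, 100..200)); // aligned to a single split window
    assert!(violates_split_assumption(&splits, 150..350));  // straddles the 200 and 300 split points
}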
// The maximum LSN we are processing in this compaction loop
let end_lsn = layer_selection
@@ -1804,6 +1809,7 @@ impl Timeline {
.unwrap();
// We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
// as an L0 layer.
let hack_end_key = Key::NON_L0_MAX;
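// Illustrative sketch (not from this patch): why the end key is clamped above. Assuming L0
// recognition treats a layer as L0 when its key range spans the full keyspace, as the comment
// above states, ending produced layers just below the maximum key keeps them out of the L0 set.
// `is_recognized_as_l0` and the u64 keys are hypothetical stand-ins, not the pageserver's types.
use std::ops::Range;

const KEY_MIN: u64 = u64::MIN;
const KEY_MAX: u64 = u64::MAX;
const NON_L0_MAX: u64 = KEY_MAX - 1; // hypothetical analogue of Key::NON_L0_MAX

fn is_recognized_as_l0(key_range: &Range<u64>) -> bool {
    key_range.start == KEY_MIN && key_range.end == KEY_MAX
}

fn main() {
    assert!(is_recognized_as_l0(&(KEY_MIN..KEY_MAX)));     // full keyspace -> looks like an L0 layer
    assert!(!is_recognized_as_l0(&(KEY_MIN..NON_L0_MAX))); // clamped end key -> not treated as L0
}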
let mut delta_layers = Vec::new();
let mut image_layers = Vec::new();
let mut downloaded_layers = Vec::new();
@@ -1849,8 +1855,10 @@ impl Timeline {
self.conf,
self.timeline_id,
self.tenant_shard_id,
Key::MIN,
lowest_retain_lsn..end_lsn,
self.get_compaction_target_size(),
ctx,
)
.await?;
@@ -1957,7 +1965,7 @@ impl Timeline {
let produced_image_layers = if let Some(writer) = image_layer_writer {
if !dry_run {
writer
.finish_with_discard_fn(self, ctx, Key::MAX, discard)
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
.await?
} else {
let (layers, _) = writer.take()?;
@@ -1970,7 +1978,7 @@ impl Timeline {
let produced_delta_layers = if !dry_run {
delta_layer_writer
.finish_with_discard_fn(self, ctx, discard)
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
.await?
} else {
let (layers, _) = delta_layer_writer.take()?;

Some files were not shown because too many files have changed in this diff.