Merge pull request #9921 from neondatabase/rc/release-proxy/2024-11-28

Proxy release 2024-11-28
This commit is contained in:
Folke Behrens
2024-11-28 11:09:06 +01:00
committed by GitHub
215 changed files with 7548 additions and 1761 deletions

View File

@@ -19,8 +19,8 @@ on:
description: 'debug or release'
required: true
type: string
pg-versions:
description: 'a json array of postgres versions to run regression tests on'
test-cfg:
description: 'a json object of postgres versions and lfc states to run regression tests on'
required: true
type: string
@@ -276,14 +276,14 @@ jobs:
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
fail-fast: false
matrix:
pg_version: ${{ fromJson(inputs.pg-versions) }}
matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }}
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Pytest regression tests
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
with:
@@ -300,6 +300,7 @@ jobs:
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ inputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540

View File

@@ -541,7 +541,7 @@ jobs:
runs-on: ${{ matrix.RUNNER }}
container:
image: neondatabase/build-tools:pinned
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -558,12 +558,12 @@ jobs:
arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
cd /home/nonroot
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.1-1.pgdg110+1_${arch}.deb"
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb"
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.5-1.pgdg110+1_${arch}.deb"
dpkg -x libpq5_17.1-1.pgdg110+1_${arch}.deb pg
dpkg -x postgresql-16_16.5-1.pgdg110+1_${arch}.deb pg
dpkg -x postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb pg
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb"
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb"
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb"
dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg
dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg
dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg
mkdir -p /tmp/neon/pg_install/v16/bin
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench

View File

@@ -2,6 +2,17 @@ name: Build build-tools image
on:
workflow_call:
inputs:
archs:
description: "Json array of architectures to build"
# Default values are set in `check-image` job, `set-variables` step
type: string
required: false
debians:
description: "Json array of Debian versions to build"
# Default values are set in `check-image` job, `set-variables` step
type: string
required: false
outputs:
image-tag:
description: "build-tools tag"
@@ -32,25 +43,37 @@ jobs:
check-image:
runs-on: ubuntu-22.04
outputs:
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
found: ${{ steps.check-image.outputs.found }}
archs: ${{ steps.set-variables.outputs.archs }}
debians: ${{ steps.set-variables.outputs.debians }}
tag: ${{ steps.set-variables.outputs.image-tag }}
everything: ${{ steps.set-more-variables.outputs.everything }}
found: ${{ steps.set-more-variables.outputs.found }}
steps:
- uses: actions/checkout@v4
- name: Get build-tools image tag for the current commit
id: get-build-tools-tag
- name: Set variables
id: set-variables
env:
ARCHS: ${{ inputs.archs || '["x64","arm64"]' }}
DEBIANS: ${{ inputs.debians || '["bullseye","bookworm"]' }}
IMAGE_TAG: |
${{ hashFiles('build-tools.Dockerfile',
'.github/workflows/build-build-tools-image.yml') }}
run: |
echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT
echo "archs=${ARCHS}" | tee -a ${GITHUB_OUTPUT}
echo "debians=${DEBIANS}" | tee -a ${GITHUB_OUTPUT}
echo "image-tag=${IMAGE_TAG}" | tee -a ${GITHUB_OUTPUT}
- name: Check if such tag found in the registry
id: check-image
- name: Set more variables
id: set-more-variables
env:
IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
IMAGE_TAG: ${{ steps.set-variables.outputs.image-tag }}
EVERYTHING: |
${{ contains(fromJson(steps.set-variables.outputs.archs), 'x64') &&
contains(fromJson(steps.set-variables.outputs.archs), 'arm64') &&
contains(fromJson(steps.set-variables.outputs.debians), 'bullseye') &&
contains(fromJson(steps.set-variables.outputs.debians), 'bookworm') }}
run: |
if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
found=true
@@ -58,8 +81,8 @@ jobs:
found=false
fi
echo "found=${found}" | tee -a $GITHUB_OUTPUT
echo "everything=${EVERYTHING}" | tee -a ${GITHUB_OUTPUT}
echo "found=${found}" | tee -a ${GITHUB_OUTPUT}
build-image:
needs: [ check-image ]
@@ -67,8 +90,8 @@ jobs:
strategy:
matrix:
debian-version: [ bullseye, bookworm ]
arch: [ x64, arm64 ]
arch: ${{ fromJson(needs.check-image.outputs.archs) }}
debian: ${{ fromJson(needs.check-image.outputs.debians) }}
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -99,11 +122,11 @@ jobs:
push: true
pull: true
build-args: |
DEBIAN_VERSION=${{ matrix.debian-version }}
cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian-version }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian-version, matrix.arch) || '' }}
DEBIAN_VERSION=${{ matrix.debian }}
cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian, matrix.arch) || '' }}
tags: |
neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian-version }}-${{ matrix.arch }}
neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian }}-${{ matrix.arch }}
merge-images:
needs: [ check-image, build-image ]
@@ -117,16 +140,22 @@ jobs:
- name: Create multi-arch image
env:
DEFAULT_DEBIAN_VERSION: bullseye
DEFAULT_DEBIAN_VERSION: bookworm
ARCHS: ${{ join(fromJson(needs.check-image.outputs.archs), ' ') }}
DEBIANS: ${{ join(fromJson(needs.check-image.outputs.debians), ' ') }}
EVERYTHING: ${{ needs.check-image.outputs.everything }}
IMAGE_TAG: ${{ needs.check-image.outputs.tag }}
run: |
for debian_version in bullseye bookworm; do
tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian_version}")
if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then
for debian in ${DEBIANS}; do
tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian}")
if [ "${EVERYTHING}" == "true" ] && [ "${debian}" == "${DEFAULT_DEBIAN_VERSION}" ]; then
tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}")
fi
docker buildx imagetools create "${tags[@]}" \
neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-x64 \
neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-arm64
for arch in ${ARCHS}; do
tags+=("neondatabase/build-tools:${IMAGE_TAG}-${debian}-${arch}")
done
docker buildx imagetools create "${tags[@]}"
done

View File

@@ -253,7 +253,14 @@ jobs:
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }}
# run without LFC on v17 release only
test-cfg: |
${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
{"pg_version":"v15", "lfc_state": "without-lfc"},
{"pg_version":"v16", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "with-lfc"}]'
|| '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
secrets: inherit
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking

View File

@@ -29,7 +29,7 @@ jobs:
trigger_bench_on_ec2_machine_in_eu_central_1:
runs-on: [ self-hosted, small ]
container:
image: neondatabase/build-tools:pinned
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

View File

@@ -94,7 +94,7 @@ jobs:
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
env:
DEFAULT_DEBIAN_VERSION: bullseye
DEFAULT_DEBIAN_VERSION: bookworm
run: |
for debian_version in bullseye bookworm; do
tags=()

View File

@@ -23,6 +23,8 @@ jobs:
id: python-src
with:
files: |
.github/workflows/_check-codestyle-python.yml
.github/workflows/build-build-tools-image.yml
.github/workflows/pre-merge-checks.yml
**/**.py
poetry.lock
@@ -38,6 +40,10 @@ jobs:
if: needs.get-changed-files.outputs.python-changed == 'true'
needs: [ get-changed-files ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
# Build only one combination to save time
archs: '["x64"]'
debians: '["bookworm"]'
secrets: inherit
check-codestyle-python:
@@ -45,7 +51,8 @@ jobs:
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-python.yml
with:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
secrets: inherit
# To get items from the merge queue merged into main we need to satisfy "Status checks that are required".

View File

@@ -4,10 +4,12 @@ on:
schedule:
- cron: '*/15 * * * *'
- cron: '25 0 * * *'
- cron: '25 1 * * 6'
jobs:
gh-workflow-stats-batch:
name: GitHub Workflow Stats Batch
gh-workflow-stats-batch-2h:
name: GitHub Workflow Stats Batch 2 hours
if: github.event.schedule == '*/15 * * * *'
runs-on: ubuntu-22.04
permissions:
actions: read
@@ -16,14 +18,36 @@ jobs:
uses: neondatabase/gh-workflow-stats-action@v0.2.1
with:
db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
db_table: "gh_workflow_stats_batch_neon"
db_table: "gh_workflow_stats_neon"
gh_token: ${{ secrets.GITHUB_TOKEN }}
duration: '2h'
- name: Export Workflow Run for the past 24 hours
if: github.event.schedule == '25 0 * * *'
gh-workflow-stats-batch-48h:
name: GitHub Workflow Stats Batch 48 hours
if: github.event.schedule == '25 0 * * *'
runs-on: ubuntu-22.04
permissions:
actions: read
steps:
- name: Export Workflow Run for the past 48 hours
uses: neondatabase/gh-workflow-stats-action@v0.2.1
with:
db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
db_table: "gh_workflow_stats_batch_neon"
db_table: "gh_workflow_stats_neon"
gh_token: ${{ secrets.GITHUB_TOKEN }}
duration: '24h'
duration: '48h'
gh-workflow-stats-batch-30d:
name: GitHub Workflow Stats Batch 30 days
if: github.event.schedule == '25 1 * * 6'
runs-on: ubuntu-22.04
permissions:
actions: read
steps:
- name: Export Workflow Run for the past 30 days
uses: neondatabase/gh-workflow-stats-action@v0.2.1
with:
db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
db_table: "gh_workflow_stats_neon"
gh_token: ${{ secrets.GITHUB_TOKEN }}
duration: '720h'

View File

@@ -1,41 +0,0 @@
name: Report Workflow Stats
on:
workflow_run:
workflows:
- Add `external` label to issues and PRs created by external users
- Benchmarking
- Build and Test
- Build and Test Locally
- Build build-tools image
- Check Permissions
- Check neon with extra platform builds
- Cloud Regression Test
- Create Release Branch
- Handle `approved-for-ci-run` label
- Lint GitHub Workflows
- Notify Slack channel about upcoming release
- Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
- Pin build-tools image
- Prepare benchmarking databases by restoring dumps
- Push images to ACR
- Test Postgres client libraries
- Trigger E2E Tests
- cleanup caches by a branch
- Pre-merge checks
types: [completed]
jobs:
gh-workflow-stats:
name: Github Workflow Stats
runs-on: ubuntu-22.04
permissions:
actions: read
steps:
- name: Export GH Workflow Stats
uses: neondatabase/gh-workflow-stats-action@v0.1.4
with:
DB_URI: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
DB_TABLE: "gh_workflow_stats_neon"
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GH_RUN_ID: ${{ github.event.workflow_run.id }}

317
Cargo.lock generated
View File

@@ -46,6 +46,15 @@ dependencies = [
"memchr",
]
[[package]]
name = "aligned-vec"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e0966165eaf052580bd70eb1b32cb3d6245774c0104d1b2793e9650bf83b52a"
dependencies = [
"equator",
]
[[package]]
name = "allocator-api2"
version = "0.2.16"
@@ -146,6 +155,12 @@ dependencies = [
"static_assertions",
]
[[package]]
name = "arrayvec"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "asn1-rs"
version = "0.6.2"
@@ -359,6 +374,28 @@ dependencies = [
"tracing",
]
[[package]]
name = "aws-sdk-kms"
version = "1.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "564a597a3c71a957d60a2e4c62c93d78ee5a0d636531e15b760acad983a5c18e"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
"aws-types",
"bytes",
"http 0.2.9",
"once_cell",
"regex-lite",
"tracing",
]
[[package]]
name = "aws-sdk-s3"
version = "1.52.0"
@@ -575,9 +612,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime"
version = "1.7.1"
version = "1.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87"
checksum = "a065c0fe6fdbdf9f11817eb68582b2ab4aff9e9c39e986ae48f7ec576c6322db"
dependencies = [
"aws-smithy-async",
"aws-smithy-http",
@@ -742,7 +779,7 @@ dependencies = [
"once_cell",
"paste",
"pin-project",
"quick-xml",
"quick-xml 0.31.0",
"rand 0.8.5",
"reqwest 0.11.19",
"rustc_version",
@@ -1220,6 +1257,10 @@ name = "compute_tools"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"aws-sdk-kms",
"aws-sdk-s3",
"base64 0.13.1",
"bytes",
"camino",
"cfg-if",
@@ -1237,13 +1278,16 @@ dependencies = [
"opentelemetry",
"opentelemetry_sdk",
"postgres",
"postgres_initdb",
"prometheus",
"regex",
"remote_storage",
"reqwest 0.12.4",
"rlimit",
"rust-ini",
"serde",
"serde_json",
"serde_with",
"signal-hook",
"tar",
"thiserror",
@@ -1381,6 +1425,15 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
[[package]]
name = "cpp_demangle"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96e58d342ad113c2b878f16d5d034c03be492ae460cdbc02b7f0f2284d310c7d"
dependencies = [
"cfg-if",
]
[[package]]
name = "cpufeatures"
version = "0.2.9"
@@ -1904,6 +1957,26 @@ dependencies = [
"termcolor",
]
[[package]]
name = "equator"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c35da53b5a021d2484a7cc49b2ac7f2d840f8236a286f84202369bd338d761ea"
dependencies = [
"equator-macro",
]
[[package]]
name = "equator-macro"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "equivalent"
version = "1.0.1"
@@ -2011,6 +2084,18 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "findshlibs"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64"
dependencies = [
"cc",
"lazy_static",
"libc",
"winapi",
]
[[package]]
name = "fixedbitset"
version = "0.4.2"
@@ -2089,9 +2174,9 @@ dependencies = [
[[package]]
name = "futures-channel"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
dependencies = [
"futures-core",
"futures-sink",
@@ -2099,9 +2184,9 @@ dependencies = [
[[package]]
name = "futures-core"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "futures-executor"
@@ -2116,9 +2201,9 @@ dependencies = [
[[package]]
name = "futures-io"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-lite"
@@ -2137,9 +2222,9 @@ dependencies = [
[[package]]
name = "futures-macro"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
@@ -2148,15 +2233,15 @@ dependencies = [
[[package]]
name = "futures-sink"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"
checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
[[package]]
name = "futures-task"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"
checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
[[package]]
name = "futures-timer"
@@ -2166,9 +2251,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
[[package]]
name = "futures-util"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
"futures-channel",
"futures-core",
@@ -2714,6 +2799,24 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac"
[[package]]
name = "inferno"
version = "0.11.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
dependencies = [
"ahash",
"indexmap 2.0.1",
"is-terminal",
"itoa",
"log",
"num-format",
"once_cell",
"quick-xml 0.26.0",
"rgb",
"str_stack",
]
[[package]]
name = "inotify"
version = "0.9.6"
@@ -2764,9 +2867,9 @@ dependencies = [
[[package]]
name = "ipnet"
version = "2.9.0"
version = "2.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708"
[[package]]
name = "is-terminal"
@@ -3053,6 +3156,15 @@ version = "2.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
[[package]]
name = "memmap2"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92"
dependencies = [
"libc",
]
[[package]]
name = "memoffset"
version = "0.7.1"
@@ -3278,6 +3390,16 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-format"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3"
dependencies = [
"arrayvec",
"itoa",
]
[[package]]
name = "num-integer"
version = "0.1.45"
@@ -3619,6 +3741,7 @@ dependencies = [
"num_cpus",
"once_cell",
"pageserver_api",
"pageserver_client",
"pageserver_compaction",
"pin-project-lite",
"postgres",
@@ -3627,6 +3750,7 @@ dependencies = [
"postgres_backend",
"postgres_connection",
"postgres_ffi",
"postgres_initdb",
"pq_proto",
"procfs",
"rand 0.8.5",
@@ -4009,7 +4133,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4022,7 +4146,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -4041,7 +4165,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4058,7 +4182,7 @@ dependencies = [
"bytes",
"once_cell",
"pq_proto",
"rustls 0.23.16",
"rustls 0.23.18",
"rustls-pemfile 2.1.1",
"serde",
"thiserror",
@@ -4102,12 +4226,48 @@ dependencies = [
"utils",
]
[[package]]
name = "postgres_initdb"
version = "0.1.0"
dependencies = [
"anyhow",
"camino",
"thiserror",
"tokio",
"workspace_hack",
]
[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "pprof"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebbe2f8898beba44815fdc9e5a4ae9c929e21c5dc29b0c774a15555f7f58d6d0"
dependencies = [
"aligned-vec",
"backtrace",
"cfg-if",
"criterion",
"findshlibs",
"inferno",
"libc",
"log",
"nix 0.26.4",
"once_cell",
"parking_lot 0.12.1",
"protobuf",
"protobuf-codegen-pure",
"smallvec",
"symbolic-demangle",
"tempfile",
"thiserror",
]
[[package]]
name = "ppv-lite86"
version = "0.2.17"
@@ -4260,6 +4420,31 @@ dependencies = [
"prost",
]
[[package]]
name = "protobuf"
version = "2.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
[[package]]
name = "protobuf-codegen"
version = "2.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6"
dependencies = [
"protobuf",
]
[[package]]
name = "protobuf-codegen-pure"
version = "2.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95a29399fc94bcd3eeaa951c715f7bea69409b2445356b00519740bcd6ddd865"
dependencies = [
"protobuf",
"protobuf-codegen",
]
[[package]]
name = "proxy"
version = "0.1.0"
@@ -4333,7 +4518,7 @@ dependencies = [
"rsa",
"rstest",
"rustc-hash",
"rustls 0.23.16",
"rustls 0.23.18",
"rustls-native-certs 0.8.0",
"rustls-pemfile 2.1.1",
"scopeguard",
@@ -4371,6 +4556,15 @@ dependencies = [
"zerocopy",
]
[[package]]
name = "quick-xml"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f50b1c63b38611e7d4d7f68b82d3ad0cc71a2ad2e7f61fc10f1328d917c93cd"
dependencies = [
"memchr",
]
[[package]]
name = "quick-xml"
version = "0.31.0"
@@ -4853,6 +5047,15 @@ dependencies = [
"subtle",
]
[[package]]
name = "rgb"
version = "0.8.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a"
dependencies = [
"bytemuck",
]
[[package]]
name = "ring"
version = "0.17.6"
@@ -5028,9 +5231,9 @@ dependencies = [
[[package]]
name = "rustls"
version = "0.23.16"
version = "0.23.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e"
checksum = "9c9cc1d47e243d655ace55ed38201c19ae02c148ae56412ab8750e8f0166ab7f"
dependencies = [
"log",
"once_cell",
@@ -5161,11 +5364,13 @@ dependencies = [
"itertools 0.10.5",
"metrics",
"once_cell",
"pageserver_api",
"parking_lot 0.12.1",
"postgres",
"postgres-protocol",
"postgres_backend",
"postgres_ffi",
"pprof",
"pq_proto",
"rand 0.8.5",
"regex",
@@ -5191,6 +5396,7 @@ dependencies = [
"tracing-subscriber",
"url",
"utils",
"wal_decoder",
"walproposer",
"workspace_hack",
]
@@ -5712,6 +5918,12 @@ dependencies = [
"der 0.7.8",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
@@ -5738,7 +5950,7 @@ dependencies = [
"once_cell",
"parking_lot 0.12.1",
"prost",
"rustls 0.23.16",
"rustls 0.23.18",
"tokio",
"tonic",
"tonic-build",
@@ -5821,7 +6033,7 @@ dependencies = [
"postgres_ffi",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.23.16",
"rustls 0.23.18",
"rustls-native-certs 0.8.0",
"serde",
"serde_json",
@@ -5858,6 +6070,12 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "str_stack"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb"
[[package]]
name = "stringprep"
version = "0.1.2"
@@ -5905,6 +6123,29 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca"
[[package]]
name = "symbolic-common"
version = "12.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "366f1b4c6baf6cfefc234bbd4899535fca0b06c74443039a73f6dfb2fad88d77"
dependencies = [
"debugid",
"memmap2",
"stable_deref_trait",
"uuid",
]
[[package]]
name = "symbolic-demangle"
version = "12.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aba05ba5b9962ea5617baf556293720a8b2d0a282aa14ee4bf10e22efc7da8c8"
dependencies = [
"cpp_demangle",
"rustc-demangle",
"symbolic-common",
]
[[package]]
name = "syn"
version = "1.0.109"
@@ -6227,7 +6468,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"async-trait",
"byteorder",
@@ -6254,7 +6495,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
dependencies = [
"ring",
"rustls 0.23.16",
"rustls 0.23.18",
"tokio",
"tokio-postgres",
"tokio-rustls 0.26.0",
@@ -6288,7 +6529,7 @@ version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
dependencies = [
"rustls 0.23.16",
"rustls 0.23.18",
"rustls-pki-types",
"tokio",
]
@@ -6697,7 +6938,7 @@ dependencies = [
"base64 0.22.1",
"log",
"once_cell",
"rustls 0.23.16",
"rustls 0.23.18",
"rustls-pki-types",
"url",
"webpki-roots 0.26.1",
@@ -6772,6 +7013,7 @@ dependencies = [
"once_cell",
"pin-project-lite",
"postgres_connection",
"pprof",
"pq_proto",
"rand 0.8.5",
"regex",
@@ -6781,6 +7023,7 @@ dependencies = [
"serde_assert",
"serde_json",
"serde_path_to_error",
"serde_with",
"signal-hook",
"strum",
"strum_macros",
@@ -6877,10 +7120,16 @@ name = "wal_decoder"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"bytes",
"pageserver_api",
"postgres_ffi",
"prost",
"serde",
"thiserror",
"tokio",
"tonic",
"tonic-build",
"tracing",
"utils",
"workspace_hack",
@@ -7306,6 +7555,7 @@ dependencies = [
"anyhow",
"axum",
"axum-core",
"base64 0.13.1",
"base64 0.21.1",
"base64ct",
"bytes",
@@ -7340,6 +7590,7 @@ dependencies = [
"libc",
"log",
"memchr",
"nix 0.26.4",
"nom",
"num-bigint",
"num-integer",
@@ -7356,7 +7607,7 @@ dependencies = [
"regex-automata 0.4.3",
"regex-syntax 0.8.2",
"reqwest 0.12.4",
"rustls 0.23.16",
"rustls 0.23.18",
"scopeguard",
"serde",
"serde_json",

View File

@@ -34,6 +34,7 @@ members = [
"libs/vm_monitor",
"libs/walproposer",
"libs/wal_decoder",
"libs/postgres_initdb",
]
[workspace.package]
@@ -57,6 +58,7 @@ async-trait = "0.1"
aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] }
aws-sdk-s3 = "1.52"
aws-sdk-iam = "1.46.0"
aws-sdk-kms = "1.47.0"
aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.2"
aws-credential-types = "1.2.0"
@@ -73,7 +75,7 @@ bytes = "1.0"
camino = "1.1.6"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
clap = { version = "4.0", features = ["derive", "env"] }
comfy-table = "7.1"
const_format = "0.2"
crc32c = "0.6"
@@ -106,7 +108,7 @@ hyper-util = "0.1"
tokio-tungstenite = "0.21.0"
indexmap = "2"
indoc = "2"
ipnet = "2.9.0"
ipnet = "2.10.0"
itertools = "0.10"
itoa = "1.0.11"
jsonwebtoken = "9"
@@ -130,6 +132,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "protobuf", "protobuf-codec"] }
procfs = "0.16"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
prost = "0.13"
@@ -153,7 +156,7 @@ sentry = { version = "0.32", default-features = false, features = ["backtrace",
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_path_to_error = "0.1"
serde_with = "2.0"
serde_with = { version = "2.0", features = [ "base64" ] }
serde_assert = "0.5.0"
sha2 = "0.10.2"
signal-hook = "0.3"
@@ -212,12 +215,14 @@ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", br
compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver = { path = "./pageserver" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
postgres_initdb = { path = "./libs/postgres_initdb" }
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }

View File

@@ -7,7 +7,7 @@ ARG IMAGE=build-tools
ARG TAG=pinned
ARG DEFAULT_PG_VERSION=17
ARG STABLE_PG_VERSION=16
ARG DEBIAN_VERSION=bullseye
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
# Build Postgres

View File

@@ -38,6 +38,7 @@ ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux
PG_CONFIGURE_OPTS += --with-libseccomp
else ifeq ($(UNAME_S),Darwin)
PG_CFLAGS += -DUSE_PREFETCH
ifndef DISABLE_HOMEBREW
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable

View File

@@ -132,7 +132,7 @@ make -j`sysctl -n hw.logicalcpu` -s
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
To run the integration tests or Python scripts (not required to use the code), install
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
Python (3.11 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
#### Running neon database

View File

@@ -1,4 +1,4 @@
ARG DEBIAN_VERSION=bullseye
ARG DEBIAN_VERSION=bookworm
FROM debian:bookworm-slim AS pgcopydb_builder
ARG DEBIAN_VERSION
@@ -234,7 +234,7 @@ USER nonroot:nonroot
WORKDIR /home/nonroot
# Python
ENV PYTHON_VERSION=3.9.19 \
ENV PYTHON_VERSION=3.11.10 \
PYENV_ROOT=/home/nonroot/.pyenv \
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
RUN set -e \

View File

@@ -3,7 +3,7 @@ ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
ARG DEBIAN_VERSION=bullseye
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
#########################################################################################
@@ -1243,7 +1243,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
# Compile and run the Neon-specific `compute_ctl` and `fast_import` binaries
#
#########################################################################################
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
@@ -1264,6 +1264,7 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de
FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import
#########################################################################################
#
@@ -1458,6 +1459,7 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import
# pgbouncer and its config
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
@@ -1533,6 +1535,25 @@ RUN apt update && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
# s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2
# used by fast_import
ARG TARGETARCH
ADD https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_linux_$TARGETARCH.deb /tmp/s5cmd.deb
RUN set -ex; \
\
# Determine the expected checksum based on TARGETARCH
if [ "${TARGETARCH}" = "amd64" ]; then \
CHECKSUM="392c385320cd5ffa435759a95af77c215553d967e4b1c0fffe52e4f14c29cf85"; \
elif [ "${TARGETARCH}" = "arm64" ]; then \
CHECKSUM="939bee3cf4b5604ddb00e67f8c157b91d7c7a5b553d1fbb6890fad32894b7b46"; \
else \
echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \
fi; \
\
# Compute and validate the checksum
echo "${CHECKSUM} /tmp/s5cmd.deb" | sha256sum -c -
RUN dpkg -i /tmp/s5cmd.deb && rm /tmp/s5cmd.deb
ENV LANG=en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -10,6 +10,10 @@ default = []
testing = []
[dependencies]
base64.workspace = true
aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-sdk-kms.workspace = true
anyhow.workspace = true
camino.workspace = true
chrono.workspace = true
@@ -27,6 +31,8 @@ opentelemetry.workspace = true
opentelemetry_sdk.workspace = true
postgres.workspace = true
regex.workspace = true
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
tar.workspace = true
@@ -43,6 +49,7 @@ thiserror.workspace = true
url.workspace = true
prometheus.workspace = true
postgres_initdb.workspace = true
compute_api.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -58,7 +58,7 @@ use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version;
use compute_tools::extension_server::get_pg_version_string;
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
@@ -326,7 +326,7 @@ fn wait_spec(
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
pgversion: get_pg_version(pgbin),
pgversion: get_pg_version_string(pgbin),
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),

View File

@@ -0,0 +1,345 @@
//! This program dumps a remote Postgres database into a local Postgres database
//! and uploads the resulting PGDATA into object storage for import into a Timeline.
//!
//! # Context, Architecture, Design
//!
//! See cloud.git Fast Imports RFC (<https://github.com/neondatabase/cloud/pull/19799>)
//! for the full picture.
//! The RFC describing the storage pieces of importing the PGDATA dump into a Timeline
//! is publicly accessible at <https://github.com/neondatabase/neon/pull/9538>.
//!
//! # This is a Prototype!
//!
//! This program is part of a prototype feature and not yet used in production.
//!
//! The cloud.git RFC contains lots of suggestions for improving e2e throughput
//! of this step of the timeline import process.
//!
//! # Local Testing
//!
//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build.
//! - Build the image with the following command:
//!
//! ```bash
//! docker buildx build --build-arg DEBIAN_FLAVOR=bullseye-slim --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/Dockerfile.com
//! docker push localhost:3030/localregistry/compute-node-v14:latest
//! ```
use anyhow::Context;
use aws_config::BehaviorVersion;
use camino::{Utf8Path, Utf8PathBuf};
use clap::Parser;
use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
use nix::unistd::Pid;
use tracing::{info, info_span, warn, Instrument};
use utils::fs_ext::is_directory_empty;
#[path = "fast_import/child_stdio_to_log.rs"]
mod child_stdio_to_log;
#[path = "fast_import/s3_uri.rs"]
mod s3_uri;
#[path = "fast_import/s5cmd.rs"]
mod s5cmd;
#[derive(clap::Parser)]
struct Args {
#[clap(long)]
working_directory: Utf8PathBuf,
#[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
s3_prefix: s3_uri::S3Uri,
#[clap(long)]
pg_bin_dir: Utf8PathBuf,
#[clap(long)]
pg_lib_dir: Utf8PathBuf,
}
#[serde_with::serde_as]
#[derive(serde::Deserialize)]
struct Spec {
encryption_secret: EncryptionSecret,
#[serde_as(as = "serde_with::base64::Base64")]
source_connstring_ciphertext_base64: Vec<u8>,
}
#[derive(serde::Deserialize)]
enum EncryptionSecret {
#[allow(clippy::upper_case_acronyms)]
KMS { key_id: String },
}
#[tokio::main]
pub(crate) async fn main() -> anyhow::Result<()> {
utils::logging::init(
utils::logging::LogFormat::Plain,
utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
utils::logging::Output::Stdout,
)?;
info!("starting");
let Args {
working_directory,
s3_prefix,
pg_bin_dir,
pg_lib_dir,
} = Args::parse();
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
let spec: Spec = {
let spec_key = s3_prefix.append("/spec.json");
let s3_client = aws_sdk_s3::Client::new(&aws_config);
let object = s3_client
.get_object()
.bucket(&spec_key.bucket)
.key(spec_key.key)
.send()
.await
.context("get spec from s3")?
.body
.collect()
.await
.context("download spec body")?;
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
};
match tokio::fs::create_dir(&working_directory).await {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
if !is_directory_empty(&working_directory)
.await
.context("check if working directory is empty")?
{
anyhow::bail!("working directory is not empty");
} else {
// ok
}
}
Err(e) => return Err(anyhow::Error::new(e).context("create working directory")),
}
let pgdata_dir = working_directory.join("pgdata");
tokio::fs::create_dir(&pgdata_dir)
.await
.context("create pgdata directory")?;
//
// Setup clients
//
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
let kms_client = aws_sdk_kms::Client::new(&aws_config);
//
// Initialize pgdata
//
let pg_version = match get_pg_version(pg_bin_dir.as_str()) {
PostgresMajorVersion::V14 => 14,
PostgresMajorVersion::V15 => 15,
PostgresMajorVersion::V16 => 16,
PostgresMajorVersion::V17 => 17,
};
let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded
postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
superuser,
locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded,
pg_version,
initdb_bin: pg_bin_dir.join("initdb").as_ref(),
library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
pgdata: &pgdata_dir,
})
.await
.context("initdb")?;
let nproc = num_cpus::get();
//
// Launch postgres process
//
let mut postgres_proc = tokio::process::Command::new(pg_bin_dir.join("postgres"))
.arg("-D")
.arg(&pgdata_dir)
.args(["-c", "wal_level=minimal"])
.args(["-c", "shared_buffers=10GB"])
.args(["-c", "max_wal_senders=0"])
.args(["-c", "fsync=off"])
.args(["-c", "full_page_writes=off"])
.args(["-c", "synchronous_commit=off"])
.args(["-c", "maintenance_work_mem=8388608"])
.args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
.args(["-c", &format!("max_worker_processes={nproc}")])
.args(["-c", "effective_io_concurrency=100"])
.env_clear()
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.context("spawn postgres")?;
info!("spawned postgres, waiting for it to become ready");
tokio::spawn(
child_stdio_to_log::relay_process_output(
postgres_proc.stdout.take(),
postgres_proc.stderr.take(),
)
.instrument(info_span!("postgres")),
);
let restore_pg_connstring =
format!("host=localhost port=5432 user={superuser} dbname=postgres");
loop {
let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await;
if res.is_ok() {
info!("postgres is ready, could connect to it");
break;
}
}
//
// Decrypt connection string
//
let source_connection_string = {
match spec.encryption_secret {
EncryptionSecret::KMS { key_id } => {
let mut output = kms_client
.decrypt()
.key_id(key_id)
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
spec.source_connstring_ciphertext_base64,
))
.send()
.await
.context("decrypt source connection string")?;
let plaintext = output
.plaintext
.take()
.context("get plaintext source connection string")?;
String::from_utf8(plaintext.into_inner())
.context("parse source connection string as utf8")?
}
}
};
//
// Start the work
//
let dumpdir = working_directory.join("dumpdir");
let common_args = [
// schema mapping (prob suffices to specify them on one side)
"--no-owner".to_string(),
"--no-privileges".to_string(),
"--no-publications".to_string(),
"--no-security-labels".to_string(),
"--no-subscriptions".to_string(),
"--no-tablespaces".to_string(),
// format
"--format".to_string(),
"directory".to_string(),
// concurrency
"--jobs".to_string(),
num_cpus::get().to_string(),
// progress updates
"--verbose".to_string(),
];
info!("dump into the working directory");
{
let mut pg_dump = tokio::process::Command::new(pg_bin_dir.join("pg_dump"))
.args(&common_args)
.arg("-f")
.arg(&dumpdir)
.arg("--no-sync")
// POSITIONAL args
// source db (db name included in connection string)
.arg(&source_connection_string)
// how we run it
.env_clear()
.kill_on_drop(true)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.context("spawn pg_dump")?;
info!(pid=%pg_dump.id().unwrap(), "spawned pg_dump");
tokio::spawn(
child_stdio_to_log::relay_process_output(pg_dump.stdout.take(), pg_dump.stderr.take())
.instrument(info_span!("pg_dump")),
);
let st = pg_dump.wait().await.context("wait for pg_dump")?;
info!(status=?st, "pg_dump exited");
if !st.success() {
warn!(status=%st, "pg_dump failed, restore will likely fail as well");
}
}
// TODO: do it in a streaming way, plenty of internal research done on this already
// TODO: do the unlogged table trick
info!("restore from working directory into vanilla postgres");
{
let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore"))
.args(&common_args)
.arg("-d")
.arg(&restore_pg_connstring)
// POSITIONAL args
.arg(&dumpdir)
// how we run it
.env_clear()
.kill_on_drop(true)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.context("spawn pg_restore")?;
info!(pid=%pg_restore.id().unwrap(), "spawned pg_restore");
tokio::spawn(
child_stdio_to_log::relay_process_output(
pg_restore.stdout.take(),
pg_restore.stderr.take(),
)
.instrument(info_span!("pg_restore")),
);
let st = pg_restore.wait().await.context("wait for pg_restore")?;
info!(status=?st, "pg_restore exited");
if !st.success() {
warn!(status=%st, "pg_restore failed, restore will likely fail as well");
}
}
info!("shutdown postgres");
{
nix::sys::signal::kill(
Pid::from_raw(
i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"),
),
nix::sys::signal::SIGTERM,
)
.context("signal postgres to shut down")?;
postgres_proc
.wait()
.await
.context("wait for postgres to shut down")?;
}
info!("upload pgdata");
s5cmd::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/"))
.await
.context("sync dump directory to destination")?;
info!("write status");
{
let status_dir = working_directory.join("status");
std::fs::create_dir(&status_dir).context("create status directory")?;
let status_file = status_dir.join("status");
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
.context("write status file")?;
s5cmd::sync(&status_file, &s3_prefix.append("/status/pgdata"))
.await
.context("sync status directory to destination")?;
}
Ok(())
}

View File

@@ -0,0 +1,35 @@
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::{ChildStderr, ChildStdout};
use tracing::info;
/// Asynchronously relays the output from a child process's `stdout` and `stderr` to the tracing log.
/// Each line is read and logged individually, with lossy UTF-8 conversion.
///
/// # Arguments
///
/// * `stdout`: An `Option<ChildStdout>` from the child process.
/// * `stderr`: An `Option<ChildStderr>` from the child process.
///
pub(crate) async fn relay_process_output(stdout: Option<ChildStdout>, stderr: Option<ChildStderr>) {
let stdout_fut = async {
if let Some(stdout) = stdout {
let reader = BufReader::new(stdout);
let mut lines = reader.lines();
while let Ok(Some(line)) = lines.next_line().await {
info!(fd = "stdout", "{}", line);
}
}
};
let stderr_fut = async {
if let Some(stderr) = stderr {
let reader = BufReader::new(stderr);
let mut lines = reader.lines();
while let Ok(Some(line)) = lines.next_line().await {
info!(fd = "stderr", "{}", line);
}
}
};
tokio::join!(stdout_fut, stderr_fut);
}

View File

@@ -0,0 +1,75 @@
use anyhow::Result;
use std::str::FromStr;
/// Struct to hold parsed S3 components
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct S3Uri {
pub bucket: String,
pub key: String,
}
impl FromStr for S3Uri {
type Err = anyhow::Error;
/// Parse an S3 URI into a bucket and key
fn from_str(uri: &str) -> Result<Self> {
// Ensure the URI starts with "s3://"
if !uri.starts_with("s3://") {
return Err(anyhow::anyhow!("Invalid S3 URI scheme"));
}
// Remove the "s3://" prefix
let stripped_uri = &uri[5..];
// Split the remaining string into bucket and key parts
if let Some((bucket, key)) = stripped_uri.split_once('/') {
Ok(S3Uri {
bucket: bucket.to_string(),
key: key.to_string(),
})
} else {
Err(anyhow::anyhow!(
"Invalid S3 URI format, missing bucket or key"
))
}
}
}
impl S3Uri {
pub fn append(&self, suffix: &str) -> Self {
Self {
bucket: self.bucket.clone(),
key: format!("{}{}", self.key, suffix),
}
}
}
impl std::fmt::Display for S3Uri {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "s3://{}/{}", self.bucket, self.key)
}
}
impl clap::builder::TypedValueParser for S3Uri {
type Value = Self;
fn parse_ref(
&self,
_cmd: &clap::Command,
_arg: Option<&clap::Arg>,
value: &std::ffi::OsStr,
) -> Result<Self::Value, clap::Error> {
let value_str = value.to_str().ok_or_else(|| {
clap::Error::raw(
clap::error::ErrorKind::InvalidUtf8,
"Invalid UTF-8 sequence",
)
})?;
S3Uri::from_str(value_str).map_err(|e| {
clap::Error::raw(
clap::error::ErrorKind::InvalidValue,
format!("Failed to parse S3 URI: {}", e),
)
})
}
}

View File

@@ -0,0 +1,27 @@
use anyhow::Context;
use camino::Utf8Path;
use super::s3_uri::S3Uri;
pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> {
let mut builder = tokio::process::Command::new("s5cmd");
// s5cmd uses aws-sdk-go v1, hence doesn't support AWS_ENDPOINT_URL
if let Some(val) = std::env::var_os("AWS_ENDPOINT_URL") {
builder.arg("--endpoint-url").arg(val);
}
builder
.arg("sync")
.arg(local.as_str())
.arg(remote.to_string());
let st = builder
.spawn()
.context("spawn s5cmd")?
.wait()
.await
.context("wait for s5cmd")?;
if st.success() {
Ok(())
} else {
Err(anyhow::anyhow!("s5cmd failed"))
}
}

View File

@@ -116,7 +116,7 @@ pub fn write_postgres_conf(
vartype: "enum".to_owned(),
};
write!(file, "{}", opt.to_pg_setting())?;
writeln!(file, "{}", opt.to_pg_setting())?;
}
}

View File

@@ -103,14 +103,33 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String {
.to_string()
}
pub fn get_pg_version(pgbin: &str) -> String {
pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion {
// pg_config --version returns a (platform specific) human readable string
// such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
let human_version = get_pg_config("--version", pgbin);
parse_pg_version(&human_version).to_string()
parse_pg_version(&human_version)
}
fn parse_pg_version(human_version: &str) -> &str {
pub fn get_pg_version_string(pgbin: &str) -> String {
match get_pg_version(pgbin) {
PostgresMajorVersion::V14 => "v14",
PostgresMajorVersion::V15 => "v15",
PostgresMajorVersion::V16 => "v16",
PostgresMajorVersion::V17 => "v17",
}
.to_owned()
}
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum PostgresMajorVersion {
V14,
V15,
V16,
V17,
}
fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
use PostgresMajorVersion::*;
// Normal releases have version strings like "PostgreSQL 15.4". But there
// are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL
// 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version
@@ -121,10 +140,10 @@ fn parse_pg_version(human_version: &str) -> &str {
.captures(human_version)
{
Some(captures) if captures.len() == 2 => match &captures["major"] {
"14" => return "v14",
"15" => return "v15",
"16" => return "v16",
"17" => return "v17",
"14" => return V14,
"15" => return V15,
"16" => return V16,
"17" => return V17,
_ => {}
},
_ => {}
@@ -263,24 +282,25 @@ mod tests {
#[test]
fn test_parse_pg_version() {
assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
use super::PostgresMajorVersion::*;
assert_eq!(parse_pg_version("PostgreSQL 15.4"), V15);
assert_eq!(parse_pg_version("PostgreSQL 15.14"), V15);
assert_eq!(
parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
"v15"
V15
);
assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
assert_eq!(parse_pg_version("PostgreSQL 14.15"), V14);
assert_eq!(parse_pg_version("PostgreSQL 14.0"), V14);
assert_eq!(
parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
"v14"
V14
);
assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16devel"), V16);
assert_eq!(parse_pg_version("PostgreSQL 16beta1"), V16);
assert_eq!(parse_pg_version("PostgreSQL 16rc2"), V16);
assert_eq!(parse_pg_version("PostgreSQL 16extra"), V16);
}
#[test]

View File

@@ -20,6 +20,7 @@ use anyhow::Result;
use hyper::header::CONTENT_TYPE;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use metrics::proto::MetricFamily;
use metrics::Encoder;
use metrics::TextEncoder;
use tokio::task;
@@ -72,10 +73,22 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
(&Method::GET, "/metrics") => {
debug!("serving /metrics GET request");
let mut buffer = vec![];
let metrics = installed_extensions::collect();
// When we call TextEncoder::encode() below, it will immediately
// return an error if a metric family has no metrics, so we need to
// preemptively filter out metric families with no metrics.
let metrics = installed_extensions::collect()
.into_iter()
.filter(|m| !m.get_metric().is_empty())
.collect::<Vec<MetricFamily>>();
let encoder = TextEncoder::new();
encoder.encode(&metrics, &mut buffer).unwrap();
let mut buffer = vec![];
if let Err(err) = encoder.encode(&metrics, &mut buffer) {
let msg = format!("error handling /metrics request: {err}");
error!(msg);
return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR);
}
match Response::builder()
.status(StatusCode::OK)

View File

@@ -115,7 +115,7 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"installed_extensions",
"compute_installed_extensions",
"Number of databases where the version of extension is installed",
&["extension_name", "version"]
)

View File

@@ -1153,6 +1153,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
timeline_info.timeline_id
);
}
// TODO: rename to import-basebackup-plus-wal
TimelineCmd::Import(args) => {
let tenant_id = get_tenant_id(args.tenant_id, env)?;
let timeline_id = args.timeline_id;

View File

@@ -415,6 +415,11 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'timeline_offloading' as bool")?,
wal_receiver_protocol_override: settings
.remove("wal_receiver_protocol_override")
.map(serde_json::from_str)
.transpose()
.context("parse `wal_receiver_protocol_override` from json")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")

View File

@@ -33,7 +33,6 @@ reason = "the marvin attack only affects private key decryption, not public key
[licenses]
allow = [
"Apache-2.0",
"Artistic-2.0",
"BSD-2-Clause",
"BSD-3-Clause",
"CC0-1.0",
@@ -67,7 +66,7 @@ registries = []
# More documentation about the 'bans' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
[bans]
multiple-versions = "warn"
multiple-versions = "allow"
wildcards = "allow"
highlight = "all"
workspace-default-features = "allow"

View File

@@ -113,21 +113,21 @@ so manual installation of dependencies is not recommended.
A single virtual environment with all dependencies is described in the single `Pipfile`.
### Prerequisites
- Install Python 3.9 (the minimal supported version) or greater.
- Install Python 3.11 (the minimal supported version) or greater.
- Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected.
- If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.:
- If you have some trouble with other version you can resolve it by installing Python 3.11 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.:
```bash
# In Ubuntu
sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt update
sudo apt install python3.9
sudo apt install python3.11
```
- Install `poetry`
- Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation).
- Install dependencies via `./scripts/pysync`.
- Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile))
so if you have different version some linting tools can yield different result locally vs in the CI.
- You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`.
- You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.11`.
This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning.
Run `poetry shell` to activate the virtual environment.

View File

@@ -2,14 +2,28 @@
// This module has heavy inspiration from the prometheus crate's `process_collector.rs`.
use once_cell::sync::Lazy;
use prometheus::Gauge;
use crate::UIntGauge;
pub struct Collector {
descs: Vec<prometheus::core::Desc>,
vmlck: crate::UIntGauge,
cpu_seconds_highres: Gauge,
}
const NMETRICS: usize = 1;
const NMETRICS: usize = 2;
static CLK_TCK_F64: Lazy<f64> = Lazy::new(|| {
let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) };
if long == -1 {
panic!("sysconf(_SC_CLK_TCK) failed");
}
let convertible_to_f64: i32 =
i32::try_from(long).expect("sysconf(_SC_CLK_TCK) is larger than i32");
convertible_to_f64 as f64
});
impl prometheus::core::Collector for Collector {
fn desc(&self) -> Vec<&prometheus::core::Desc> {
@@ -27,6 +41,12 @@ impl prometheus::core::Collector for Collector {
mfs.extend(self.vmlck.collect())
}
}
if let Ok(stat) = myself.stat() {
let cpu_seconds = stat.utime + stat.stime;
self.cpu_seconds_highres
.set(cpu_seconds as f64 / *CLK_TCK_F64);
mfs.extend(self.cpu_seconds_highres.collect());
}
mfs
}
}
@@ -43,7 +63,23 @@ impl Collector {
.cloned(),
);
Self { descs, vmlck }
let cpu_seconds_highres = Gauge::new(
"libmetrics_process_cpu_seconds_highres",
"Total user and system CPU time spent in seconds.\
Sub-second resolution, hence better than `process_cpu_seconds_total`.",
)
.unwrap();
descs.extend(
prometheus::core::Collector::desc(&cpu_seconds_highres)
.into_iter()
.cloned(),
);
Self {
descs,
vmlck,
cpu_seconds_highres,
}
}
}

View File

@@ -33,6 +33,7 @@ remote_storage.workspace = true
postgres_backend.workspace = true
nix = {workspace = true, optional = true}
reqwest.workspace = true
rand.workspace = true
[dev-dependencies]
bincode.workspace = true

View File

@@ -18,7 +18,7 @@ use std::{
str::FromStr,
time::Duration,
};
use utils::logging::LogFormat;
use utils::{logging::LogFormat, postgres_client::PostgresClientProtocol};
use crate::models::ImageCompressionAlgorithm;
use crate::models::LsnLease;
@@ -97,6 +97,15 @@ pub struct ConfigToml {
pub control_plane_api: Option<reqwest::Url>,
pub control_plane_api_token: Option<String>,
pub control_plane_emergency_mode: bool,
/// Unstable feature: subject to change or removal without notice.
/// See <https://github.com/neondatabase/neon/pull/9218>.
pub import_pgdata_upcall_api: Option<reqwest::Url>,
/// Unstable feature: subject to change or removal without notice.
/// See <https://github.com/neondatabase/neon/pull/9218>.
pub import_pgdata_upcall_api_token: Option<String>,
/// Unstable feature: subject to change or removal without notice.
/// See <https://github.com/neondatabase/neon/pull/9218>.
pub import_pgdata_aws_endpoint_url: Option<reqwest::Url>,
pub heatmap_upload_concurrency: usize,
pub secondary_download_concurrency: usize,
pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
@@ -111,6 +120,7 @@ pub struct ConfigToml {
pub no_sync: Option<bool>,
#[serde(with = "humantime_serde")]
pub server_side_batch_timeout: Option<Duration>,
pub wal_receiver_protocol: PostgresClientProtocol,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -268,6 +278,8 @@ pub struct TenantConfigToml {
/// Enable auto-offloading of timelines.
/// (either this flag or the pageserver-global one need to be set)
pub timeline_offloading: bool,
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
}
pub mod defaults {
@@ -321,6 +333,9 @@ pub mod defaults {
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
pub const DEFAULT_SERVER_SIDE_BATCH_TIMEOUT: Option<&str> = None;
pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
utils::postgres_client::PostgresClientProtocol::Vanilla;
}
impl Default for ConfigToml {
@@ -386,6 +401,10 @@ impl Default for ConfigToml {
control_plane_api_token: (None),
control_plane_emergency_mode: (false),
import_pgdata_upcall_api: (None),
import_pgdata_upcall_api_token: (None),
import_pgdata_aws_endpoint_url: (None),
heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
@@ -405,6 +424,7 @@ impl Default for ConfigToml {
.map(|duration| humantime::parse_duration(duration).unwrap()),
tenant_config: TenantConfigToml::default(),
no_sync: None,
wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
}
}
}
@@ -492,6 +512,7 @@ impl Default for TenantConfigToml {
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
timeline_offloading: false,
wal_receiver_protocol_override: None,
}
}
}

View File

@@ -229,6 +229,18 @@ impl Key {
}
}
impl CompactKey {
pub fn raw(&self) -> i128 {
self.0
}
}
impl From<i128> for CompactKey {
fn from(value: i128) -> Self {
Self(value)
}
}
impl fmt::Display for Key {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(

View File

@@ -48,7 +48,7 @@ pub struct ShardedRange<'a> {
// Calculate the size of a range within the blocks of the same relation, or spanning only the
// top page in the previous relation's space.
fn contiguous_range_len(range: &Range<Key>) -> u32 {
pub fn contiguous_range_len(range: &Range<Key>) -> u32 {
debug_assert!(is_contiguous_range(range));
if range.start.field6 == 0xffffffff {
range.end.field6 + 1
@@ -67,7 +67,7 @@ fn contiguous_range_len(range: &Range<Key>) -> u32 {
/// This matters, because:
/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse.
/// - Within such ranges, we may calculate distances using simple subtraction of field6.
fn is_contiguous_range(range: &Range<Key>) -> bool {
pub fn is_contiguous_range(range: &Range<Key>) -> bool {
range.start.field1 == range.end.field1
&& range.start.field2 == range.end.field2
&& range.start.field3 == range.end.field3

View File

@@ -2,6 +2,8 @@ pub mod detach_ancestor;
pub mod partitioning;
pub mod utilization;
#[cfg(feature = "testing")]
use camino::Utf8PathBuf;
pub use utilization::PageserverUtilization;
use std::{
@@ -21,6 +23,7 @@ use utils::{
completion,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
postgres_client::PostgresClientProtocol,
serde_system_time,
};
@@ -227,6 +230,9 @@ pub enum TimelineCreateRequestMode {
// we continue to accept it by having it here.
pg_version: Option<u32>,
},
ImportPgdata {
import_pgdata: TimelineCreateRequestModeImportPgdata,
},
// NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap.
// (serde picks the first matching enum variant, in declaration order).
Bootstrap {
@@ -236,6 +242,42 @@ pub enum TimelineCreateRequestMode {
},
}
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateRequestModeImportPgdata {
pub location: ImportPgdataLocation,
pub idempotency_key: ImportPgdataIdempotencyKey,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub enum ImportPgdataLocation {
#[cfg(feature = "testing")]
LocalFs { path: Utf8PathBuf },
AwsS3 {
region: String,
bucket: String,
/// A better name for this would be `prefix`; changing requires coordination with cplane.
/// See <https://github.com/neondatabase/cloud/issues/20646>.
key: String,
},
}
#[derive(Serialize, Deserialize, Clone)]
#[serde(transparent)]
pub struct ImportPgdataIdempotencyKey(pub String);
impl ImportPgdataIdempotencyKey {
pub fn random() -> Self {
use rand::{distributions::Alphanumeric, Rng};
Self(
rand::thread_rng()
.sample_iter(&Alphanumeric)
.take(20)
.map(char::from)
.collect(),
)
}
}
#[derive(Serialize, Deserialize, Clone)]
pub struct LsnLeaseRequest {
pub lsn: Lsn,
@@ -311,6 +353,7 @@ pub struct TenantConfig {
pub lsn_lease_length: Option<String>,
pub lsn_lease_length_for_ts: Option<String>,
pub timeline_offloading: Option<bool>,
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
}
/// The policy for the aux file storage.

View File

@@ -0,0 +1,12 @@
[package]
name = "postgres_initdb"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
tokio.workspace = true
camino.workspace = true
thiserror.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -0,0 +1,103 @@
//! The canonical way we run `initdb` in Neon.
//!
//! initdb has implicit defaults that are dependent on the environment, e.g., locales & collations.
//!
//! This module's job is to eliminate the environment-dependence as much as possible.
use std::fmt;
use camino::Utf8Path;
pub struct RunInitdbArgs<'a> {
pub superuser: &'a str,
pub locale: &'a str,
pub initdb_bin: &'a Utf8Path,
pub pg_version: u32,
pub library_search_path: &'a Utf8Path,
pub pgdata: &'a Utf8Path,
}
#[derive(thiserror::Error, Debug)]
pub enum Error {
Spawn(std::io::Error),
Failed {
status: std::process::ExitStatus,
stderr: Vec<u8>,
},
WaitOutput(std::io::Error),
Other(anyhow::Error),
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Error::Spawn(e) => write!(f, "Error spawning command: {:?}", e),
Error::Failed { status, stderr } => write!(
f,
"Command failed with status {:?}: {}",
status,
String::from_utf8_lossy(stderr)
),
Error::WaitOutput(e) => write!(f, "Error waiting for command output: {:?}", e),
Error::Other(e) => write!(f, "Error: {:?}", e),
}
}
}
pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> {
let RunInitdbArgs {
superuser,
locale,
initdb_bin: initdb_bin_path,
pg_version,
library_search_path,
pgdata,
} = args;
let mut initdb_command = tokio::process::Command::new(initdb_bin_path);
initdb_command
.args(["--pgdata", pgdata.as_ref()])
.args(["--username", superuser])
.args(["--encoding", "utf8"])
.args(["--locale", locale])
.arg("--no-instructions")
.arg("--no-sync")
.env_clear()
.env("LD_LIBRARY_PATH", library_search_path)
.env("DYLD_LIBRARY_PATH", library_search_path)
.stdin(std::process::Stdio::null())
// stdout invocation produces the same output every time, we don't need it
.stdout(std::process::Stdio::null())
// we would be interested in the stderr output, if there was any
.stderr(std::process::Stdio::piped());
// Before version 14, only the libc provide was available.
if pg_version > 14 {
// Version 17 brought with it a builtin locale provider which only provides
// C and C.UTF-8. While being safer for collation purposes since it is
// guaranteed to be consistent throughout a major release, it is also more
// performant.
let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
initdb_command.args(["--locale-provider", locale_provider]);
}
let initdb_proc = initdb_command.spawn().map_err(Error::Spawn)?;
// Ideally we'd select here with the cancellation token, but the problem is that
// we can't safely terminate initdb: it launches processes of its own, and killing
// initdb doesn't kill them. After we return from this function, we want the target
// directory to be able to be cleaned up.
// See https://github.com/neondatabase/neon/issues/6385
let initdb_output = initdb_proc
.wait_with_output()
.await
.map_err(Error::WaitOutput)?;
if !initdb_output.status.success() {
return Err(Error::Failed {
status: initdb_output.status,
stderr: initdb_output.stderr,
});
}
Ok(())
}

View File

@@ -184,9 +184,8 @@ pub struct CancelKeyData {
impl fmt::Display for CancelKeyData {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// TODO: this is producing strange results, with 0xffffffff........ always in the logs.
let hi = (self.backend_pid as u64) << 32;
let lo = self.cancel_key as u64;
let lo = (self.cancel_key as u64) & 0xffffffff;
let id = hi | lo;
// This format is more compact and might work better for logs.
@@ -563,6 +562,9 @@ pub enum BeMessage<'a> {
options: &'a [&'a str],
},
KeepAlive(WalSndKeepAlive),
/// Batch of interpreted, shard filtered WAL records,
/// ready for the pageserver to ingest
InterpretedWalRecords(InterpretedWalRecordsBody<'a>),
}
/// Common shorthands.
@@ -673,6 +675,22 @@ pub struct WalSndKeepAlive {
pub request_reply: bool,
}
/// Batch of interpreted WAL records used in the interpreted
/// safekeeper to pageserver protocol.
///
/// Note that the pageserver uses the RawInterpretedWalRecordsBody
/// counterpart of this from the neondatabase/rust-postgres repo.
/// If you're changing this struct, you likely need to change its
/// twin as well.
#[derive(Debug)]
pub struct InterpretedWalRecordsBody<'a> {
/// End of raw WAL in [`Self::data`]
pub streaming_lsn: u64,
/// Current end of WAL on the server
pub commit_lsn: u64,
pub data: &'a [u8],
}
pub static HELLO_WORLD_ROW: BeMessage = BeMessage::DataRow(&[Some(b"hello world")]);
// single text column
@@ -997,6 +1015,19 @@ impl BeMessage<'_> {
Ok(())
})?
}
BeMessage::InterpretedWalRecords(rec) => {
// We use the COPY_DATA_TAG for our custom message
// since this tag is interpreted as raw bytes.
buf.put_u8(b'd');
write_body(buf, |buf| {
buf.put_u8(b'0'); // matches INTERPRETED_WAL_RECORD_TAG in postgres-protocol
// dependency
buf.put_u64(rec.streaming_lsn);
buf.put_u64(rec.commit_lsn);
buf.put_slice(rec.data);
});
}
}
Ok(())
}
@@ -1047,4 +1078,13 @@ mod tests {
let data = [0, 0, 0, 7, 0, 0, 0, 0];
FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
}
#[test]
fn cancel_key_data() {
let key = CancelKeyData {
backend_pid: -1817212860,
cancel_key: -1183897012,
};
assert_eq!(format!("{key}"), "CancelKeyData(93af8844b96f2a4c)");
}
}

View File

@@ -24,6 +24,7 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl
use bytes::Bytes;
use futures::future::Either;
use futures::stream::Stream;
use futures::FutureExt;
use futures_util::StreamExt;
use futures_util::TryStreamExt;
use http_types::{StatusCode, Url};
@@ -31,6 +32,7 @@ use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use utils::backoff;
use utils::backoff::exponential_backoff_duration_seconds;
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
use crate::{
@@ -302,40 +304,59 @@ impl RemoteStorage for AzureBlobStorage {
let mut next_marker = None;
let mut timeout_try_cnt = 1;
'outer: loop {
let mut builder = builder.clone();
if let Some(marker) = next_marker.clone() {
builder = builder.marker(marker);
}
let response = builder.into_stream();
let response = response.into_stream().map_err(to_download_error);
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
let response = response.map(|res| match res {
Ok(res) => res,
Err(_elapsed) => Err(DownloadError::Timeout),
// Azure Blob Rust SDK does not expose the list blob API directly. Users have to use
// their pageable iterator wrapper that returns all keys as a stream. We want to have
// full control of paging, and therefore we only take the first item from the stream.
let mut response_stream = builder.into_stream();
let response = response_stream.next();
// Timeout mechanism: Azure client will sometimes stuck on a request, but retrying that request
// would immediately succeed. Therefore, we use exponential backoff timeout to retry the request.
// (Usually, exponential backoff is used to determine the sleep time between two retries.) We
// start with 10.0 second timeout, and double the timeout for each failure, up to 5 failures.
// timeout = min(5 * (1.0+1.0)^n, self.timeout).
let this_timeout = (5.0 * exponential_backoff_duration_seconds(timeout_try_cnt, 1.0, self.timeout.as_secs_f64())).min(self.timeout.as_secs_f64());
let response = tokio::time::timeout(Duration::from_secs_f64(this_timeout), response);
let response = response.map(|res| {
match res {
Ok(Some(Ok(res))) => Ok(Some(res)),
Ok(Some(Err(e))) => Err(to_download_error(e)),
Ok(None) => Ok(None),
Err(_elasped) => Err(DownloadError::Timeout),
}
});
let mut response = std::pin::pin!(response);
let mut max_keys = max_keys.map(|mk| mk.get());
let next_item = tokio::select! {
op = response.next() => Ok(op),
op = response => op,
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
}?;
};
if let Err(DownloadError::Timeout) = &next_item {
timeout_try_cnt += 1;
if timeout_try_cnt <= 5 {
continue;
}
}
let next_item = next_item?;
if timeout_try_cnt >= 2 {
tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt);
}
timeout_try_cnt = 1;
let Some(entry) = next_item else {
// The list is complete, so yield it.
break;
};
let mut res = Listing::default();
let entry = match entry {
Ok(entry) => entry,
Err(e) => {
// The error is potentially retryable, so we must rewind the loop after yielding.
yield Err(e);
continue;
}
};
next_marker = entry.continuation();
let prefix_iter = entry
.blobs
@@ -351,7 +372,7 @@ impl RemoteStorage for AzureBlobStorage {
last_modified: k.properties.last_modified.into(),
size: k.properties.content_length,
}
);
);
for key in blob_iter {
res.keys.push(key);

View File

@@ -360,7 +360,12 @@ impl RemoteStorage for LocalFs {
let mut objects = Vec::with_capacity(keys.len());
for key in keys {
let path = key.with_base(&self.storage_root);
let metadata = file_metadata(&path).await?;
let metadata = file_metadata(&path).await;
if let Err(DownloadError::NotFound) = metadata {
// Race: if the file is deleted between listing and metadata check, ignore it.
continue;
}
let metadata = metadata?;
if metadata.is_dir() {
continue;
}

View File

@@ -176,7 +176,9 @@ pub(crate) struct BucketMetrics {
impl Default for BucketMetrics {
fn default() -> Self {
let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
// first bucket 100 microseconds to count requests that do not need to wait at all
// and get a permit immediately
let buckets = [0.0001, 0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
let req_seconds = register_histogram_vec!(
"remote_storage_s3_request_seconds",

View File

@@ -29,9 +29,11 @@ jsonwebtoken.workspace = true
nix.workspace = true
once_cell.workspace = true
pin-project-lite.workspace = true
pprof.workspace = true
regex.workspace = true
routerify.workspace = true
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
thiserror.workspace = true

View File

@@ -1,7 +1,8 @@
use crate::auth::{AuthError, Claims, SwappableJwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
use anyhow::Context;
use hyper::header::{HeaderName, AUTHORIZATION};
use crate::http::request::{get_query_param, parse_query_param};
use anyhow::{anyhow, Context};
use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION};
use hyper::http::HeaderValue;
use hyper::Method;
use hyper::{header::CONTENT_TYPE, Body, Request, Response};
@@ -12,11 +13,13 @@ use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tracing::{debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::io::Write as _;
use std::str::FromStr;
use std::time::Duration;
use bytes::{Bytes, BytesMut};
use std::io::Write as _;
use tokio::sync::mpsc;
use pprof::protos::Message as _;
use tokio::sync::{mpsc, Mutex};
use tokio_stream::wrappers::ReceiverStream;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -328,6 +331,82 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
Ok(response)
}
/// Generates CPU profiles.
pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
enum Format {
Pprof,
Svg,
}
// Parameters.
let format = match get_query_param(&req, "format")?.as_deref() {
None => Format::Pprof,
Some("pprof") => Format::Pprof,
Some("svg") => Format::Svg,
Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
};
let seconds = match parse_query_param(&req, "seconds")? {
None => 5,
Some(seconds @ 1..=30) => seconds,
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))),
};
let frequency_hz = match parse_query_param(&req, "frequency")? {
None => 99,
Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))),
Some(frequency) => frequency,
};
// Only allow one profiler at a time.
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
let _lock = PROFILE_LOCK
.try_lock()
.map_err(|_| ApiError::Conflict("profiler already running".into()))?;
// Take the profile.
let report = tokio::task::spawn_blocking(move || {
let guard = pprof::ProfilerGuardBuilder::default()
.frequency(frequency_hz)
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
.build()?;
std::thread::sleep(Duration::from_secs(seconds));
guard.report().build()
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?;
// Return the report in the requested format.
match format {
Format::Pprof => {
let mut body = Vec::new();
report
.pprof()
.map_err(|err| ApiError::InternalServerError(err.into()))?
.write_to_vec(&mut body)
.map_err(|err| ApiError::InternalServerError(err.into()))?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "application/octet-stream")
.header(CONTENT_DISPOSITION, "attachment; filename=\"profile.pb\"")
.body(Body::from(body))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
Format::Svg => {
let mut body = Vec::new();
report
.flamegraph(&mut body)
.map_err(|err| ApiError::InternalServerError(err.into()))?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "image/svg+xml")
.body(Body::from(body))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
}
}
pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move {

View File

@@ -30,7 +30,7 @@ pub fn parse_request_param<T: FromStr>(
}
}
fn get_query_param<'a>(
pub fn get_query_param<'a>(
request: &'a Request<Body>,
param_name: &str,
) -> Result<Option<Cow<'a, str>>, ApiError> {

View File

@@ -7,29 +7,88 @@ use postgres_connection::{parse_host_port, PgConnectionConfig};
use crate::id::TenantTimelineId;
#[derive(Copy, Clone, PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum InterpretedFormat {
Bincode,
Protobuf,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum Compression {
Zstd { level: i8 },
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "type", content = "args")]
#[serde(rename_all = "kebab-case")]
pub enum PostgresClientProtocol {
/// Usual Postgres replication protocol
Vanilla,
/// Custom shard-aware protocol that replicates interpreted records.
/// Used to send wal from safekeeper to pageserver.
Interpreted {
format: InterpretedFormat,
compression: Option<Compression>,
},
}
pub struct ConnectionConfigArgs<'a> {
pub protocol: PostgresClientProtocol,
pub ttid: TenantTimelineId,
pub shard_number: Option<u8>,
pub shard_count: Option<u8>,
pub shard_stripe_size: Option<u32>,
pub listen_pg_addr_str: &'a str,
pub auth_token: Option<&'a str>,
pub availability_zone: Option<&'a str>,
}
impl<'a> ConnectionConfigArgs<'a> {
fn options(&'a self) -> Vec<String> {
let mut options = vec![
"-c".to_owned(),
format!("timeline_id={}", self.ttid.timeline_id),
format!("tenant_id={}", self.ttid.tenant_id),
format!(
"protocol={}",
serde_json::to_string(&self.protocol).unwrap()
),
];
if self.shard_number.is_some() {
assert!(self.shard_count.is_some());
assert!(self.shard_stripe_size.is_some());
options.push(format!("shard_count={}", self.shard_count.unwrap()));
options.push(format!("shard_number={}", self.shard_number.unwrap()));
options.push(format!(
"shard_stripe_size={}",
self.shard_stripe_size.unwrap()
));
}
options
}
}
/// Create client config for fetching WAL from safekeeper on particular timeline.
/// listen_pg_addr_str is in form host:\[port\].
pub fn wal_stream_connection_config(
TenantTimelineId {
tenant_id,
timeline_id,
}: TenantTimelineId,
listen_pg_addr_str: &str,
auth_token: Option<&str>,
availability_zone: Option<&str>,
args: ConnectionConfigArgs,
) -> anyhow::Result<PgConnectionConfig> {
let (host, port) =
parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
parse_host_port(args.listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
let port = port.unwrap_or(5432);
let mut connstr = PgConnectionConfig::new_host_port(host, port)
.extend_options([
"-c".to_owned(),
format!("timeline_id={}", timeline_id),
format!("tenant_id={}", tenant_id),
])
.set_password(auth_token.map(|s| s.to_owned()));
.extend_options(args.options())
.set_password(args.auth_token.map(|s| s.to_owned()));
if let Some(availability_zone) = availability_zone {
if let Some(availability_zone) = args.availability_zone {
connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
}

View File

@@ -83,7 +83,9 @@ where
}
wake_these.push(self.heap.pop().unwrap().wake_channel);
}
self.update_status();
if !wake_these.is_empty() {
self.update_status();
}
wake_these
}

View File

@@ -218,7 +218,7 @@ impl MemoryStatus {
fn debug_slice(slice: &[Self]) -> impl '_ + Debug {
struct DS<'a>(&'a [MemoryStatus]);
impl<'a> Debug for DS<'a> {
impl Debug for DS<'_> {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
f.debug_struct("[MemoryStatus]")
.field(
@@ -233,7 +233,7 @@ impl MemoryStatus {
struct Fields<'a, F>(&'a [MemoryStatus], F);
impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> {
impl<F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'_, F> {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
f.debug_list().entries(self.0.iter().map(&self.1)).finish()
}

View File

@@ -8,11 +8,19 @@ license.workspace = true
testing = ["pageserver_api/testing"]
[dependencies]
async-compression.workspace = true
anyhow.workspace = true
bytes.workspace = true
pageserver_api.workspace = true
prost.workspace = true
postgres_ffi.workspace = true
serde.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["io-util"] }
tonic.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[build-dependencies]
tonic-build.workspace = true

11
libs/wal_decoder/build.rs Normal file
View File

@@ -0,0 +1,11 @@
fn main() -> Result<(), Box<dyn std::error::Error>> {
// Generate rust code from .proto protobuf.
//
// Note: we previously tried to use deterministic location at proto/ for
// easy location, but apparently interference with cachepot sometimes fails
// the build then. Anyway, per cargo docs build script shouldn't output to
// anywhere but $OUT_DIR.
tonic_build::compile_protos("proto/interpreted_wal.proto")
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
Ok(())
}

View File

@@ -0,0 +1,43 @@
syntax = "proto3";
package interpreted_wal;
message InterpretedWalRecords {
repeated InterpretedWalRecord records = 1;
optional uint64 next_record_lsn = 2;
}
message InterpretedWalRecord {
optional bytes metadata_record = 1;
SerializedValueBatch batch = 2;
uint64 next_record_lsn = 3;
bool flush_uncommitted = 4;
uint32 xid = 5;
}
message SerializedValueBatch {
bytes raw = 1;
repeated ValueMeta metadata = 2;
uint64 max_lsn = 3;
uint64 len = 4;
}
enum ValueMetaType {
Serialized = 0;
Observed = 1;
}
message ValueMeta {
ValueMetaType type = 1;
CompactKey key = 2;
uint64 lsn = 3;
optional uint64 batch_offset = 4;
optional uint64 len = 5;
optional bool will_init = 6;
}
message CompactKey {
int64 high = 1;
int64 low = 2;
}

View File

@@ -4,6 +4,7 @@
use crate::models::*;
use crate::serialized_batch::SerializedValueBatch;
use bytes::{Buf, Bytes};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::pg_constants;
@@ -32,7 +33,8 @@ impl InterpretedWalRecord {
FlushUncommittedRecords::No
};
let metadata_record = MetadataRecord::from_decoded(&decoded, next_record_lsn, pg_version)?;
let metadata_record =
MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?;
let batch = SerializedValueBatch::from_decoded_filtered(
decoded,
shard,
@@ -51,8 +53,13 @@ impl InterpretedWalRecord {
}
impl MetadataRecord {
fn from_decoded(
/// Builds a metadata record for this WAL record, if any.
///
/// Only metadata records relevant for the given shard are emitted. Currently, most metadata
/// records are broadcast to all shards for simplicity, but this should be improved.
fn from_decoded_filtered(
decoded: &DecodedWALRecord,
shard: &ShardIdentity,
next_record_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<Option<MetadataRecord>> {
@@ -61,26 +68,27 @@ impl MetadataRecord {
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
match decoded.xl_rmid {
// First, generate metadata records from the decoded WAL record.
let mut metadata_record = match decoded.xl_rmid {
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
Self::decode_heapam_record(&mut buf, decoded, pg_version)
Self::decode_heapam_record(&mut buf, decoded, pg_version)?
}
pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version),
pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version)?,
// Handle other special record types
pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded),
pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version),
pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded)?,
pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version)?,
pg_constants::RM_TBLSPC_ID => {
tracing::trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
Ok(None)
None
}
pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version),
pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version)?,
pg_constants::RM_XACT_ID => {
Self::decode_xact_record(&mut buf, decoded, next_record_lsn)
Self::decode_xact_record(&mut buf, decoded, next_record_lsn)?
}
pg_constants::RM_MULTIXACT_ID => {
Self::decode_multixact_record(&mut buf, decoded, pg_version)
Self::decode_multixact_record(&mut buf, decoded, pg_version)?
}
pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded),
pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded)?,
// This is an odd duck. It needs to go to all shards.
// Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY
// in WalIngest::new), we have to send the whole DecodedWalRecord::record to
@@ -89,19 +97,48 @@ impl MetadataRecord {
// Alternatively, one can make the checkpoint part of the subscription protocol
// to the pageserver. This should work fine, but can be done at a later point.
pg_constants::RM_XLOG_ID => {
Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)
Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)?
}
pg_constants::RM_LOGICALMSG_ID => {
Self::decode_logical_message_record(&mut buf, decoded)
Self::decode_logical_message_record(&mut buf, decoded)?
}
pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded),
pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded),
pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded)?,
pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded)?,
_unexpected => {
// TODO: consider failing here instead of blindly doing something without
// understanding the protocol
Ok(None)
None
}
};
// Next, filter the metadata record by shard.
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
// of the main relation. These are sharded and managed just like regular relation pages.
// See: https://github.com/neondatabase/neon/issues/9855
if let Some(
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
) = metadata_record
{
let is_local_vm_page = |heap_blk| {
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
};
// Send the old and new VM page updates to their respective shards.
clear_vm_bits.old_heap_blkno = clear_vm_bits
.old_heap_blkno
.filter(|&blkno| is_local_vm_page(blkno));
clear_vm_bits.new_heap_blkno = clear_vm_bits
.new_heap_blkno
.filter(|&blkno| is_local_vm_page(blkno));
// If neither VM page belongs to this shard, discard the record.
if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() {
metadata_record = None
}
}
Ok(metadata_record)
}
fn decode_heapam_record(

View File

@@ -1,3 +1,4 @@
pub mod decoder;
pub mod models;
pub mod serialized_batch;
pub mod wire_format;

View File

@@ -37,12 +37,32 @@ use utils::lsn::Lsn;
use crate::serialized_batch::SerializedValueBatch;
// Code generated by protobuf.
pub mod proto {
// Tonic does derives as `#[derive(Clone, PartialEq, ::prost::Message)]`
// we don't use these types for anything but broker data transmission,
// so it's ok to ignore this one.
#![allow(clippy::derive_partial_eq_without_eq)]
// The generated ValueMeta has a `len` method generate for its `len` field.
#![allow(clippy::len_without_is_empty)]
tonic::include_proto!("interpreted_wal");
}
#[derive(Serialize, Deserialize)]
pub enum FlushUncommittedRecords {
Yes,
No,
}
/// A batch of interpreted WAL records
#[derive(Serialize, Deserialize)]
pub struct InterpretedWalRecords {
pub records: Vec<InterpretedWalRecord>,
// Start LSN of the next record after the batch.
// Note that said record may not belong to the current shard.
pub next_record_lsn: Option<Lsn>,
}
/// An interpreted Postgres WAL record, ready to be handled by the pageserver
#[derive(Serialize, Deserialize)]
pub struct InterpretedWalRecord {
@@ -65,6 +85,18 @@ pub struct InterpretedWalRecord {
pub xid: TransactionId,
}
impl InterpretedWalRecord {
/// Checks if the WAL record is empty
///
/// An empty interpreted WAL record has no data or metadata and does not have to be sent to the
/// pageserver.
pub fn is_empty(&self) -> bool {
self.batch.is_empty()
&& self.metadata_record.is_none()
&& matches!(self.flush_uncommitted, FlushUncommittedRecords::No)
}
}
/// The interpreted part of the Postgres WAL record which requires metadata
/// writes to the underlying storage engine.
#[derive(Serialize, Deserialize)]

View File

@@ -496,11 +496,16 @@ impl SerializedValueBatch {
}
}
/// Checks if the batch is empty
///
/// A batch is empty when it contains no serialized values.
/// Note that it may still contain observed values.
/// Checks if the batch contains any serialized or observed values
pub fn is_empty(&self) -> bool {
!self.has_data() && self.metadata.is_empty()
}
/// Checks if the batch contains data
///
/// Note that if this returns false, it may still contain observed values or
/// a metadata record.
pub fn has_data(&self) -> bool {
let empty = self.raw.is_empty();
if cfg!(debug_assertions) && empty {
@@ -510,7 +515,7 @@ impl SerializedValueBatch {
.all(|meta| matches!(meta, ValueMeta::Observed(_))));
}
empty
!empty
}
/// Returns the number of values serialized in the batch

View File

@@ -0,0 +1,356 @@
use bytes::{BufMut, Bytes, BytesMut};
use pageserver_api::key::CompactKey;
use prost::{DecodeError, EncodeError, Message};
use tokio::io::AsyncWriteExt;
use utils::bin_ser::{BeSer, DeserializeError, SerializeError};
use utils::lsn::Lsn;
use utils::postgres_client::{Compression, InterpretedFormat};
use crate::models::{
FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords, MetadataRecord,
};
use crate::serialized_batch::{
ObservedValueMeta, SerializedValueBatch, SerializedValueMeta, ValueMeta,
};
use crate::models::proto;
#[derive(Debug, thiserror::Error)]
pub enum ToWireFormatError {
#[error("{0}")]
Bincode(#[from] SerializeError),
#[error("{0}")]
Protobuf(#[from] ProtobufSerializeError),
#[error("{0}")]
Compression(#[from] std::io::Error),
}
#[derive(Debug, thiserror::Error)]
pub enum ProtobufSerializeError {
#[error("{0}")]
MetadataRecord(#[from] SerializeError),
#[error("{0}")]
Encode(#[from] EncodeError),
}
#[derive(Debug, thiserror::Error)]
pub enum FromWireFormatError {
#[error("{0}")]
Bincode(#[from] DeserializeError),
#[error("{0}")]
Protobuf(#[from] ProtobufDeserializeError),
#[error("{0}")]
Decompress(#[from] std::io::Error),
}
#[derive(Debug, thiserror::Error)]
pub enum ProtobufDeserializeError {
#[error("{0}")]
Transcode(#[from] TranscodeError),
#[error("{0}")]
Decode(#[from] DecodeError),
}
#[derive(Debug, thiserror::Error)]
pub enum TranscodeError {
#[error("{0}")]
BadInput(String),
#[error("{0}")]
MetadataRecord(#[from] DeserializeError),
}
pub trait ToWireFormat {
fn to_wire(
self,
format: InterpretedFormat,
compression: Option<Compression>,
) -> impl std::future::Future<Output = Result<Bytes, ToWireFormatError>> + Send;
}
pub trait FromWireFormat {
type T;
fn from_wire(
buf: &Bytes,
format: InterpretedFormat,
compression: Option<Compression>,
) -> impl std::future::Future<Output = Result<Self::T, FromWireFormatError>> + Send;
}
impl ToWireFormat for InterpretedWalRecords {
async fn to_wire(
self,
format: InterpretedFormat,
compression: Option<Compression>,
) -> Result<Bytes, ToWireFormatError> {
use async_compression::tokio::write::ZstdEncoder;
use async_compression::Level;
let encode_res: Result<Bytes, ToWireFormatError> = match format {
InterpretedFormat::Bincode => {
let buf = BytesMut::new();
let mut buf = buf.writer();
self.ser_into(&mut buf)?;
Ok(buf.into_inner().freeze())
}
InterpretedFormat::Protobuf => {
let proto: proto::InterpretedWalRecords = self.try_into()?;
let mut buf = BytesMut::new();
proto
.encode(&mut buf)
.map_err(|e| ToWireFormatError::Protobuf(e.into()))?;
Ok(buf.freeze())
}
};
let buf = encode_res?;
let compressed_buf = match compression {
Some(Compression::Zstd { level }) => {
let mut encoder = ZstdEncoder::with_quality(
Vec::with_capacity(buf.len() / 4),
Level::Precise(level as i32),
);
encoder.write_all(&buf).await?;
encoder.shutdown().await?;
Bytes::from(encoder.into_inner())
}
None => buf,
};
Ok(compressed_buf)
}
}
impl FromWireFormat for InterpretedWalRecords {
type T = Self;
async fn from_wire(
buf: &Bytes,
format: InterpretedFormat,
compression: Option<Compression>,
) -> Result<Self, FromWireFormatError> {
let decompressed_buf = match compression {
Some(Compression::Zstd { .. }) => {
use async_compression::tokio::write::ZstdDecoder;
let mut decoded_buf = Vec::with_capacity(buf.len());
let mut decoder = ZstdDecoder::new(&mut decoded_buf);
decoder.write_all(buf).await?;
decoder.flush().await?;
Bytes::from(decoded_buf)
}
None => buf.clone(),
};
match format {
InterpretedFormat::Bincode => {
InterpretedWalRecords::des(&decompressed_buf).map_err(FromWireFormatError::Bincode)
}
InterpretedFormat::Protobuf => {
let proto = proto::InterpretedWalRecords::decode(decompressed_buf)
.map_err(|e| FromWireFormatError::Protobuf(e.into()))?;
InterpretedWalRecords::try_from(proto)
.map_err(|e| FromWireFormatError::Protobuf(e.into()))
}
}
}
}
impl TryFrom<InterpretedWalRecords> for proto::InterpretedWalRecords {
type Error = SerializeError;
fn try_from(value: InterpretedWalRecords) -> Result<Self, Self::Error> {
let records = value
.records
.into_iter()
.map(proto::InterpretedWalRecord::try_from)
.collect::<Result<Vec<_>, _>>()?;
Ok(proto::InterpretedWalRecords {
records,
next_record_lsn: value.next_record_lsn.map(|l| l.0),
})
}
}
impl TryFrom<InterpretedWalRecord> for proto::InterpretedWalRecord {
type Error = SerializeError;
fn try_from(value: InterpretedWalRecord) -> Result<Self, Self::Error> {
let metadata_record = value
.metadata_record
.map(|meta_rec| -> Result<Vec<u8>, Self::Error> {
let mut buf = Vec::new();
meta_rec.ser_into(&mut buf)?;
Ok(buf)
})
.transpose()?;
Ok(proto::InterpretedWalRecord {
metadata_record,
batch: Some(proto::SerializedValueBatch::from(value.batch)),
next_record_lsn: value.next_record_lsn.0,
flush_uncommitted: matches!(value.flush_uncommitted, FlushUncommittedRecords::Yes),
xid: value.xid,
})
}
}
impl From<SerializedValueBatch> for proto::SerializedValueBatch {
fn from(value: SerializedValueBatch) -> Self {
proto::SerializedValueBatch {
raw: value.raw,
metadata: value
.metadata
.into_iter()
.map(proto::ValueMeta::from)
.collect(),
max_lsn: value.max_lsn.0,
len: value.len as u64,
}
}
}
impl From<ValueMeta> for proto::ValueMeta {
fn from(value: ValueMeta) -> Self {
match value {
ValueMeta::Observed(obs) => proto::ValueMeta {
r#type: proto::ValueMetaType::Observed.into(),
key: Some(proto::CompactKey::from(obs.key)),
lsn: obs.lsn.0,
batch_offset: None,
len: None,
will_init: None,
},
ValueMeta::Serialized(ser) => proto::ValueMeta {
r#type: proto::ValueMetaType::Serialized.into(),
key: Some(proto::CompactKey::from(ser.key)),
lsn: ser.lsn.0,
batch_offset: Some(ser.batch_offset),
len: Some(ser.len as u64),
will_init: Some(ser.will_init),
},
}
}
}
impl From<CompactKey> for proto::CompactKey {
fn from(value: CompactKey) -> Self {
proto::CompactKey {
high: (value.raw() >> 64) as i64,
low: value.raw() as i64,
}
}
}
impl TryFrom<proto::InterpretedWalRecords> for InterpretedWalRecords {
type Error = TranscodeError;
fn try_from(value: proto::InterpretedWalRecords) -> Result<Self, Self::Error> {
let records = value
.records
.into_iter()
.map(InterpretedWalRecord::try_from)
.collect::<Result<_, _>>()?;
Ok(InterpretedWalRecords {
records,
next_record_lsn: value.next_record_lsn.map(Lsn::from),
})
}
}
impl TryFrom<proto::InterpretedWalRecord> for InterpretedWalRecord {
type Error = TranscodeError;
fn try_from(value: proto::InterpretedWalRecord) -> Result<Self, Self::Error> {
let metadata_record = value
.metadata_record
.map(|mrec| -> Result<_, DeserializeError> { MetadataRecord::des(&mrec) })
.transpose()?;
let batch = {
let batch = value.batch.ok_or_else(|| {
TranscodeError::BadInput("InterpretedWalRecord::batch missing".to_string())
})?;
SerializedValueBatch::try_from(batch)?
};
Ok(InterpretedWalRecord {
metadata_record,
batch,
next_record_lsn: Lsn(value.next_record_lsn),
flush_uncommitted: if value.flush_uncommitted {
FlushUncommittedRecords::Yes
} else {
FlushUncommittedRecords::No
},
xid: value.xid,
})
}
}
impl TryFrom<proto::SerializedValueBatch> for SerializedValueBatch {
type Error = TranscodeError;
fn try_from(value: proto::SerializedValueBatch) -> Result<Self, Self::Error> {
let metadata = value
.metadata
.into_iter()
.map(ValueMeta::try_from)
.collect::<Result<Vec<_>, _>>()?;
Ok(SerializedValueBatch {
raw: value.raw,
metadata,
max_lsn: Lsn(value.max_lsn),
len: value.len as usize,
})
}
}
impl TryFrom<proto::ValueMeta> for ValueMeta {
type Error = TranscodeError;
fn try_from(value: proto::ValueMeta) -> Result<Self, Self::Error> {
match proto::ValueMetaType::try_from(value.r#type) {
Ok(proto::ValueMetaType::Serialized) => {
Ok(ValueMeta::Serialized(SerializedValueMeta {
key: value
.key
.ok_or_else(|| {
TranscodeError::BadInput("ValueMeta::key missing".to_string())
})?
.into(),
lsn: Lsn(value.lsn),
batch_offset: value.batch_offset.ok_or_else(|| {
TranscodeError::BadInput("ValueMeta::batch_offset missing".to_string())
})?,
len: value.len.ok_or_else(|| {
TranscodeError::BadInput("ValueMeta::len missing".to_string())
})? as usize,
will_init: value.will_init.ok_or_else(|| {
TranscodeError::BadInput("ValueMeta::will_init missing".to_string())
})?,
}))
}
Ok(proto::ValueMetaType::Observed) => Ok(ValueMeta::Observed(ObservedValueMeta {
key: value
.key
.ok_or_else(|| TranscodeError::BadInput("ValueMeta::key missing".to_string()))?
.into(),
lsn: Lsn(value.lsn),
})),
Err(_) => Err(TranscodeError::BadInput(format!(
"Unexpected ValueMeta::type {}",
value.r#type
))),
}
}
}
impl From<proto::CompactKey> for CompactKey {
fn from(value: proto::CompactKey) -> Self {
(((value.high as i128) << 64) | (value.low as i128)).into()
}
}

View File

@@ -43,6 +43,7 @@ postgres.workspace = true
postgres_backend.workspace = true
postgres-protocol.workspace = true
postgres-types.workspace = true
postgres_initdb.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true
@@ -68,6 +69,7 @@ url.workspace = true
walkdir.workspace = true
metrics.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
pageserver_compaction.workspace = true
postgres_connection.workspace = true
postgres_ffi.workspace = true

View File

@@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> {
// after setting up logging, log the effective IO engine choice and read path implementations
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
// The tenants directory contains all the pageserver local disk state.
// Create if not exists and make sure all the contents are durable before proceeding.

View File

@@ -14,6 +14,7 @@ use remote_storage::{RemotePath, RemoteStorageConfig};
use std::env;
use storage_broker::Uri;
use utils::logging::SecretString;
use utils::postgres_client::PostgresClientProtocol;
use once_cell::sync::OnceCell;
use reqwest::Url;
@@ -144,6 +145,10 @@ pub struct PageServerConf {
/// JWT token for use with the control plane API.
pub control_plane_api_token: Option<SecretString>,
pub import_pgdata_upcall_api: Option<Url>,
pub import_pgdata_upcall_api_token: Option<SecretString>,
pub import_pgdata_aws_endpoint_url: Option<Url>,
/// If true, pageserver will make best-effort to operate without a control plane: only
/// for use in major incidents.
pub control_plane_emergency_mode: bool,
@@ -186,6 +191,8 @@ pub struct PageServerConf {
/// Maximum amount of time for which a get page request request
/// might be held up for request merging.
pub server_side_batch_timeout: Option<Duration>,
pub wal_receiver_protocol: PostgresClientProtocol,
}
/// Token for authentication to safekeepers
@@ -328,6 +335,9 @@ impl PageServerConf {
control_plane_api,
control_plane_api_token,
control_plane_emergency_mode,
import_pgdata_upcall_api,
import_pgdata_upcall_api_token,
import_pgdata_aws_endpoint_url,
heatmap_upload_concurrency,
secondary_download_concurrency,
ingest_batch_size,
@@ -343,6 +353,7 @@ impl PageServerConf {
server_side_batch_timeout,
tenant_config,
no_sync,
wal_receiver_protocol,
} = config_toml;
let mut conf = PageServerConf {
@@ -383,6 +394,10 @@ impl PageServerConf {
timeline_offloading,
ephemeral_bytes_per_memory_kb,
server_side_batch_timeout,
import_pgdata_upcall_api,
import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
import_pgdata_aws_endpoint_url,
wal_receiver_protocol,
// ------------------------------------------------------------
// fields that require additional validation or custom handling

View File

@@ -1144,18 +1144,24 @@ pub(crate) mod mock {
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
cancel: CancellationToken,
executed: Arc<AtomicUsize>,
}
impl ConsumerState {
async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize {
let mut executed = 0;
async fn consume(&mut self, remote_storage: &GenericRemoteStorage) {
info!("Executing all pending deletions");
// Transform all executor messages to generic frontend messages
while let Ok(msg) = self.executor_rx.try_recv() {
loop {
use either::Either;
let msg = tokio::select! {
left = self.executor_rx.recv() => Either::Left(left),
right = self.rx.recv() => Either::Right(right),
};
match msg {
DeleterMessage::Delete(objects) => {
Either::Left(None) => break,
Either::Right(None) => break,
Either::Left(Some(DeleterMessage::Delete(objects))) => {
for path in objects {
match remote_storage.delete(&path, &self.cancel).await {
Ok(_) => {
@@ -1165,18 +1171,13 @@ pub(crate) mod mock {
error!("Failed to delete {path}, leaking object! ({e})");
}
}
executed += 1;
self.executed.fetch_add(1, Ordering::Relaxed);
}
}
DeleterMessage::Flush(flush_op) => {
Either::Left(Some(DeleterMessage::Flush(flush_op))) => {
flush_op.notify();
}
}
}
while let Ok(msg) = self.rx.try_recv() {
match msg {
ListWriterQueueMessage::Delete(op) => {
Either::Right(Some(ListWriterQueueMessage::Delete(op))) => {
let mut objects = op.objects;
for (layer, meta) in op.layers {
objects.push(remote_layer_path(
@@ -1198,33 +1199,27 @@ pub(crate) mod mock {
error!("Failed to delete {path}, leaking object! ({e})");
}
}
executed += 1;
self.executed.fetch_add(1, Ordering::Relaxed);
}
}
ListWriterQueueMessage::Flush(op) => {
Either::Right(Some(ListWriterQueueMessage::Flush(op))) => {
op.notify();
}
ListWriterQueueMessage::FlushExecute(op) => {
Either::Right(Some(ListWriterQueueMessage::FlushExecute(op))) => {
// We have already executed all prior deletions because mock does them inline
op.notify();
}
ListWriterQueueMessage::Recover(_) => {
Either::Right(Some(ListWriterQueueMessage::Recover(_))) => {
// no-op in mock
}
}
info!("All pending deletions have been executed");
}
executed
}
}
pub struct MockDeletionQueue {
tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
executed: Arc<AtomicUsize>,
remote_storage: Option<GenericRemoteStorage>,
consumer: std::sync::Mutex<ConsumerState>,
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
}
@@ -1235,29 +1230,34 @@ pub(crate) mod mock {
let executed = Arc::new(AtomicUsize::new(0));
let mut consumer = ConsumerState {
rx,
executor_rx,
cancel: CancellationToken::new(),
executed: executed.clone(),
};
tokio::spawn(async move {
if let Some(remote_storage) = &remote_storage {
consumer.consume(remote_storage).await;
}
});
Self {
tx,
executor_tx,
executed,
remote_storage,
consumer: std::sync::Mutex::new(ConsumerState {
rx,
executor_rx,
cancel: CancellationToken::new(),
}),
lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
}
}
#[allow(clippy::await_holding_lock)]
pub async fn pump(&self) {
if let Some(remote_storage) = &self.remote_storage {
// Permit holding mutex across await, because this is only ever
// called once at a time in tests.
let mut locked = self.consumer.lock().unwrap();
let count = locked.consume(remote_storage).await;
self.executed.fetch_add(count, Ordering::Relaxed);
}
let (tx, rx) = tokio::sync::oneshot::channel();
self.executor_tx
.send(DeleterMessage::Flush(FlushOp { tx }))
.await
.expect("Failed to send flush message");
rx.await.ok();
}
pub(crate) fn new_client(&self) -> DeletionQueueClient {

View File

@@ -15,6 +15,7 @@ use tokio_util::sync::CancellationToken;
use tracing::info;
use tracing::warn;
use utils::backoff;
use utils::pausable_failpoint;
use crate::metrics;
@@ -90,6 +91,7 @@ impl Deleter {
/// Block until everything in accumulator has been executed
async fn flush(&mut self) -> Result<(), DeletionQueueError> {
while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
pausable_failpoint!("deletion-queue-before-execute-pause");
match self.remote_delete().await {
Ok(()) => {
// Note: we assume that the remote storage layer returns Ok(()) if some

View File

@@ -623,6 +623,8 @@ paths:
existing_initdb_timeline_id:
type: string
format: hex
import_pgdata:
$ref: "#/components/schemas/TimelineCreateRequestImportPgdata"
responses:
"201":
description: Timeline was created, or already existed with matching parameters
@@ -979,6 +981,34 @@ components:
$ref: "#/components/schemas/TenantConfig"
effective_config:
$ref: "#/components/schemas/TenantConfig"
TimelineCreateRequestImportPgdata:
type: object
required:
- location
- idempotency_key
properties:
idempotency_key:
type: string
location:
$ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocation"
TimelineCreateRequestImportPgdataLocation:
type: object
properties:
AwsS3:
$ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocationAwsS3"
TimelineCreateRequestImportPgdataLocationAwsS3:
type: object
properties:
region:
type: string
bucket:
type: string
key:
type: string
required:
- region
- bucket
- key
TimelineInfo:
type: object
required:

View File

@@ -40,6 +40,7 @@ use pageserver_api::models::TenantSorting;
use pageserver_api::models::TenantState;
use pageserver_api::models::TimelineArchivalConfigRequest;
use pageserver_api::models::TimelineCreateRequestMode;
use pageserver_api::models::TimelineCreateRequestModeImportPgdata;
use pageserver_api::models::TimelinesInfoAndOffloaded;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::TopTenantShardsRequest;
@@ -55,6 +56,7 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::JwtAuth;
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::profile_cpu_handler;
use utils::http::endpoint::prometheus_metrics_handler;
use utils::http::endpoint::request_span;
use utils::http::request::must_parse_query_param;
@@ -80,6 +82,7 @@ use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::timeline::import_pgdata;
use crate::tenant::timeline::offload::offload_timeline;
use crate::tenant::timeline::offload::OffloadError;
use crate::tenant::timeline::CompactFlags;
@@ -125,7 +128,7 @@ pub struct State {
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
allowlist_routes: Vec<Uri>,
allowlist_routes: &'static [&'static str],
remote_storage: GenericRemoteStorage,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -146,10 +149,13 @@ impl State {
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
) -> anyhow::Result<Self> {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
.iter()
.map(|v| v.parse().unwrap())
.collect::<Vec<_>>();
let allowlist_routes = &[
"/v1/status",
"/v1/doc",
"/swagger.yml",
"/metrics",
"/profile/cpu",
];
Ok(Self {
conf,
tenant_manager,
@@ -576,6 +582,35 @@ async fn timeline_create_handler(
ancestor_timeline_id,
ancestor_start_lsn,
}),
TimelineCreateRequestMode::ImportPgdata {
import_pgdata:
TimelineCreateRequestModeImportPgdata {
location,
idempotency_key,
},
} => tenant::CreateTimelineParams::ImportPgdata(tenant::CreateTimelineParamsImportPgdata {
idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new(
idempotency_key.0,
),
new_timeline_id,
location: {
use import_pgdata::index_part_format::Location;
use pageserver_api::models::ImportPgdataLocation;
match location {
#[cfg(feature = "testing")]
ImportPgdataLocation::LocalFs { path } => Location::LocalFs { path },
ImportPgdataLocation::AwsS3 {
region,
bucket,
key,
} => Location::AwsS3 {
region,
bucket,
key,
},
}
},
}),
};
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
@@ -3148,7 +3183,7 @@ pub fn make_router(
if auth.is_some() {
router = router.middleware(auth_middleware(|request| {
let state = get_state(request);
if state.allowlist_routes.contains(request.uri()) {
if state.allowlist_routes.contains(&request.uri().path()) {
None
} else {
state.auth.as_deref()
@@ -3167,6 +3202,7 @@ pub fn make_router(
Ok(router
.data(state)
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
.get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
.get("/v1/status", |r| api_handler(r, status_handler))
.put("/v1/failpoints", |r| {
testing_api_handler("manage failpoints", r, failpoints_handler)

View File

@@ -3,7 +3,7 @@ use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
@@ -457,6 +457,15 @@ pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
static FLUSH_WAIT_UPLOAD_TIME: Lazy<GaugeVec> = Lazy::new(|| {
register_gauge_vec!(
"pageserver_flush_wait_upload_seconds",
"Time spent waiting for preceding uploads during layer flush",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_last_record_lsn",
@@ -653,6 +662,35 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_relsize_cache_entries",
"Number of entries in the relation size cache",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_cache_misses",
"Relation size cache misses",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_cache_misses_old",
"Relation size cache misses where the lookup LSN is older than the last relation update"
)
.expect("failed to define a metric")
});
pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
@@ -2106,6 +2144,7 @@ pub(crate) struct WalIngestMetrics {
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
pub(crate) clear_vm_bits_unknown: IntCounterVec,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -2134,6 +2173,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
"Total number of zero gap blocks written on relation extends"
)
.expect("failed to define a metric"),
clear_vm_bits_unknown: register_int_counter_vec!(
"pageserver_wal_ingest_clear_vm_bits_unknown",
"Number of ignored ClearVmBits operations due to unknown pages/relations",
&["entity"],
)
.expect("failed to define a metric"),
});
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -2336,6 +2381,7 @@ pub(crate) struct TimelineMetrics {
shard_id: String,
timeline_id: String,
pub flush_time_histo: StorageTimeMetrics,
pub flush_wait_upload_time_gauge: Gauge,
pub compact_time_histo: StorageTimeMetrics,
pub create_images_time_histo: StorageTimeMetrics,
pub logical_size_histo: StorageTimeMetrics,
@@ -2379,6 +2425,9 @@ impl TimelineMetrics {
&shard_id,
&timeline_id,
);
let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let compact_time_histo = StorageTimeMetrics::new(
StorageTimeOperation::Compact,
&tenant_id,
@@ -2516,6 +2565,7 @@ impl TimelineMetrics {
shard_id,
timeline_id,
flush_time_histo,
flush_wait_upload_time_gauge,
compact_time_histo,
create_images_time_histo,
logical_size_histo,
@@ -2563,6 +2613,14 @@ impl TimelineMetrics {
self.resident_physical_size_gauge.get()
}
pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) {
self.flush_wait_upload_time_gauge.add(duration);
crate::metrics::FLUSH_WAIT_UPLOAD_TIME
.get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id])
.unwrap()
.add(duration);
}
pub(crate) fn shutdown(&self) {
let was_shutdown = self
.shutdown
@@ -2579,6 +2637,7 @@ impl TimelineMetrics {
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());

View File

@@ -1068,21 +1068,26 @@ impl PageServerHandler {
));
}
if request_lsn < **latest_gc_cutoff_lsn {
// Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus
if request_lsn == Lsn::INVALID {
return Err(PageStreamError::BadRequest(
"invalid LSN(0) in request".into(),
));
}
// Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease.
//
// We may have older data available, but we make a best effort to detect this case and return an error,
// to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
let gc_info = &timeline.gc_info.read().unwrap();
if !gc_info.leases.contains_key(&request_lsn) {
// The requested LSN is below gc cutoff and is not guarded by a lease.
// Check explicitly for INVALID just to get a less scary error message if the
// request is obviously bogus
return Err(if request_lsn == Lsn::INVALID {
PageStreamError::BadRequest("invalid LSN(0) in request".into())
} else {
return Err(
PageStreamError::BadRequest(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
request_lsn, **latest_gc_cutoff_lsn
).into())
});
);
}
}

View File

@@ -10,6 +10,9 @@ use super::tenant::{PageReconstructError, Timeline};
use crate::aux_file;
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::{
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
};
use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id,
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -389,7 +392,9 @@ impl Timeline {
result
}
// Get size of a database in blocks
/// Get size of a database in blocks. This is only accurate on shard 0. It will undercount on
/// other shards, by only accounting for relations the shard has pages for, and only accounting
/// for pages up to the highest page number it has stored.
pub(crate) async fn get_db_size(
&self,
spcnode: Oid,
@@ -408,7 +413,10 @@ impl Timeline {
Ok(total_blocks)
}
/// Get size of a relation file
/// Get size of a relation file. The relation must exist, otherwise an error is returned.
///
/// This is only accurate on shard 0. On other shards, it will return the size up to the highest
/// page number stored in the shard.
pub(crate) async fn get_rel_size(
&self,
tag: RelTag,
@@ -444,7 +452,10 @@ impl Timeline {
Ok(nblocks)
}
/// Does relation exist?
/// Does the relation exist?
///
/// Only shard 0 has a full view of the relations. Other shards only know about relations that
/// the shard stores pages for.
pub(crate) async fn get_rel_exists(
&self,
tag: RelTag,
@@ -478,6 +489,9 @@ impl Timeline {
/// Get a list of all existing relations in given tablespace and database.
///
/// Only shard 0 has a full view of the relations. Other shards only know about relations that
/// the shard stores pages for.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
@@ -1129,9 +1143,12 @@ impl Timeline {
let rel_size_cache = self.rel_size_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
if lsn >= *cached_lsn {
RELSIZE_CACHE_HITS.inc();
return Some(*nblocks);
}
RELSIZE_CACHE_MISSES_OLD.inc();
}
RELSIZE_CACHE_MISSES.inc();
None
}
@@ -1156,6 +1173,7 @@ impl Timeline {
}
hash_map::Entry::Vacant(entry) => {
entry.insert((lsn, nblocks));
RELSIZE_CACHE_ENTRIES.inc();
}
}
}
@@ -1163,13 +1181,17 @@ impl Timeline {
/// Store cached relation size
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.map.insert(tag, (lsn, nblocks));
if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
RELSIZE_CACHE_ENTRIES.inc();
}
}
/// Remove cached relation size
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.map.remove(tag);
if rel_size_cache.map.remove(tag).is_some() {
RELSIZE_CACHE_ENTRIES.dec();
}
}
}
@@ -1229,10 +1251,9 @@ impl<'a> DatadirModification<'a> {
}
pub(crate) fn has_dirty_data(&self) -> bool {
!self
.pending_data_batch
self.pending_data_batch
.as_ref()
.map_or(true, |b| b.is_empty())
.map_or(false, |b| b.has_data())
}
/// Set the current lsn
@@ -1408,7 +1429,7 @@ impl<'a> DatadirModification<'a> {
Some(pending_batch) => {
pending_batch.extend(batch);
}
None if !batch.is_empty() => {
None if batch.has_data() => {
self.pending_data_batch = Some(batch);
}
None => {
@@ -2276,9 +2297,9 @@ impl<'a> Version<'a> {
//--- Metadata structs stored in key-value pairs in the repository.
#[derive(Debug, Serialize, Deserialize)]
struct DbDirectory {
pub(crate) struct DbDirectory {
// (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist)
dbdirs: HashMap<(Oid, Oid), bool>,
pub(crate) dbdirs: HashMap<(Oid, Oid), bool>,
}
// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of
@@ -2287,8 +2308,8 @@ struct DbDirectory {
// "pg_twophsae/0000000A000002E4".
#[derive(Debug, Serialize, Deserialize)]
struct TwoPhaseDirectory {
xids: HashSet<TransactionId>,
pub(crate) struct TwoPhaseDirectory {
pub(crate) xids: HashSet<TransactionId>,
}
#[derive(Debug, Serialize, Deserialize)]
@@ -2297,12 +2318,12 @@ struct TwoPhaseDirectoryV17 {
}
#[derive(Debug, Serialize, Deserialize, Default)]
struct RelDirectory {
pub(crate) struct RelDirectory {
// Set of relations that exist. (relfilenode, forknum)
//
// TODO: Store it as a btree or radix tree or something else that spans multiple
// key-value pairs, if you have a lot of relations
rels: HashSet<(Oid, u8)>,
pub(crate) rels: HashSet<(Oid, u8)>,
}
#[derive(Debug, Serialize, Deserialize)]
@@ -2311,9 +2332,9 @@ struct RelSizeEntry {
}
#[derive(Debug, Serialize, Deserialize, Default)]
struct SlruSegmentDirectory {
pub(crate) struct SlruSegmentDirectory {
// Set of SLRU segments that exist.
segments: HashSet<u32>,
pub(crate) segments: HashSet<u32>,
}
#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]

View File

@@ -381,6 +381,8 @@ pub enum TaskKind {
UnitTest,
DetachAncestor,
ImportPgdata,
}
#[derive(Default)]

View File

@@ -43,7 +43,9 @@ use std::sync::atomic::AtomicBool;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
use timeline::import_pgdata;
use timeline::offload::offload_timeline;
use timeline::ShutdownMode;
use tokio::io::BufReader;
use tokio::sync::watch;
use tokio::task::JoinSet;
@@ -373,7 +375,6 @@ pub struct Tenant {
l0_flush_global_state: L0FlushGlobalState,
}
impl std::fmt::Debug for Tenant {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{} ({})", self.tenant_shard_id, self.current_state())
@@ -860,6 +861,7 @@ impl Debug for SetStoppingError {
pub(crate) enum CreateTimelineParams {
Bootstrap(CreateTimelineParamsBootstrap),
Branch(CreateTimelineParamsBranch),
ImportPgdata(CreateTimelineParamsImportPgdata),
}
#[derive(Debug)]
@@ -877,7 +879,14 @@ pub(crate) struct CreateTimelineParamsBranch {
pub(crate) ancestor_start_lsn: Option<Lsn>,
}
/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`].
#[derive(Debug)]
pub(crate) struct CreateTimelineParamsImportPgdata {
pub(crate) new_timeline_id: TimelineId,
pub(crate) location: import_pgdata::index_part_format::Location,
pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
}
/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`] in [`Tenant::start_creating_timeline`].
///
/// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
///
@@ -907,19 +916,50 @@ pub(crate) enum CreateTimelineIdempotency {
ancestor_timeline_id: TimelineId,
ancestor_start_lsn: Lsn,
},
ImportPgdata(CreatingTimelineIdempotencyImportPgdata),
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct CreatingTimelineIdempotencyImportPgdata {
idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
}
/// What is returned by [`Tenant::start_creating_timeline`].
#[must_use]
enum StartCreatingTimelineResult<'t> {
CreateGuard(TimelineCreateGuard<'t>),
enum StartCreatingTimelineResult {
CreateGuard(TimelineCreateGuard),
Idempotent(Arc<Timeline>),
}
enum TimelineInitAndSyncResult {
ReadyToActivate(Arc<Timeline>),
NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
}
impl TimelineInitAndSyncResult {
fn ready_to_activate(self) -> Option<Arc<Timeline>> {
match self {
Self::ReadyToActivate(timeline) => Some(timeline),
_ => None,
}
}
}
#[must_use]
struct TimelineInitAndSyncNeedsSpawnImportPgdata {
timeline: Arc<Timeline>,
import_pgdata: import_pgdata::index_part_format::Root,
guard: TimelineCreateGuard,
}
/// What is returned by [`Tenant::create_timeline`].
enum CreateTimelineResult {
Created(Arc<Timeline>),
Idempotent(Arc<Timeline>),
/// IMPORTANT: This [`Arc<Timeline>`] object is not in [`Tenant::timelines`] when
/// we return this result, nor will this concrete object ever be added there.
/// Cf method comment on [`Tenant::create_timeline_import_pgdata`].
ImportSpawned(Arc<Timeline>),
}
impl CreateTimelineResult {
@@ -927,18 +967,19 @@ impl CreateTimelineResult {
match self {
Self::Created(_) => "Created",
Self::Idempotent(_) => "Idempotent",
Self::ImportSpawned(_) => "ImportSpawned",
}
}
fn timeline(&self) -> &Arc<Timeline> {
match self {
Self::Created(t) | Self::Idempotent(t) => t,
Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t,
}
}
/// Unit test timelines aren't activated, test has to do it if it needs to.
#[cfg(test)]
fn into_timeline_for_test(self) -> Arc<Timeline> {
match self {
Self::Created(t) | Self::Idempotent(t) => t,
Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t,
}
}
}
@@ -962,33 +1003,13 @@ pub enum CreateTimelineError {
}
#[derive(thiserror::Error, Debug)]
enum InitdbError {
Other(anyhow::Error),
pub enum InitdbError {
#[error("Operation was cancelled")]
Cancelled,
Spawn(std::io::Result<()>),
Failed(std::process::ExitStatus, Vec<u8>),
}
impl fmt::Display for InitdbError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
InitdbError::Cancelled => write!(f, "Operation was cancelled"),
InitdbError::Spawn(e) => write!(f, "Spawn error: {:?}", e),
InitdbError::Failed(status, stderr) => write!(
f,
"Command failed with status {:?}: {}",
status,
String::from_utf8_lossy(stderr)
),
InitdbError::Other(e) => write!(f, "Error: {:?}", e),
}
}
}
impl From<std::io::Error> for InitdbError {
fn from(error: std::io::Error) -> Self {
InitdbError::Spawn(Err(error))
}
#[error(transparent)]
Other(anyhow::Error),
#[error(transparent)]
Inner(postgres_initdb::Error),
}
enum CreateTimelineCause {
@@ -996,6 +1017,15 @@ enum CreateTimelineCause {
Delete,
}
enum LoadTimelineCause {
Attach,
Unoffload,
ImportPgdata {
create_guard: TimelineCreateGuard,
activate: ActivateTimelineArgs,
},
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GcError {
// The tenant is shutting down
@@ -1072,24 +1102,35 @@ impl Tenant {
/// it is marked as Active.
#[allow(clippy::too_many_arguments)]
async fn timeline_init_and_sync(
&self,
self: &Arc<Self>,
timeline_id: TimelineId,
resources: TimelineResources,
index_part: IndexPart,
mut index_part: IndexPart,
metadata: TimelineMetadata,
ancestor: Option<Arc<Timeline>>,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
cause: LoadTimelineCause,
ctx: &RequestContext,
) -> anyhow::Result<TimelineInitAndSyncResult> {
let tenant_id = self.tenant_shard_id;
let idempotency = if metadata.ancestor_timeline().is_none() {
CreateTimelineIdempotency::Bootstrap {
pg_version: metadata.pg_version(),
let import_pgdata = index_part.import_pgdata.take();
let idempotency = match &import_pgdata {
Some(import_pgdata) => {
CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata {
idempotency_key: import_pgdata.idempotency_key().clone(),
})
}
} else {
CreateTimelineIdempotency::Branch {
ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
ancestor_start_lsn: metadata.ancestor_lsn(),
None => {
if metadata.ancestor_timeline().is_none() {
CreateTimelineIdempotency::Bootstrap {
pg_version: metadata.pg_version(),
}
} else {
CreateTimelineIdempotency::Branch {
ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
ancestor_start_lsn: metadata.ancestor_lsn(),
}
}
}
};
@@ -1121,39 +1162,91 @@ impl Tenant {
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
})?;
{
// avoiding holding it across awaits
let mut timelines_accessor = self.timelines.lock().unwrap();
match timelines_accessor.entry(timeline_id) {
// We should never try and load the same timeline twice during startup
Entry::Occupied(_) => {
unreachable!(
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
);
match import_pgdata {
Some(import_pgdata) if !import_pgdata.is_done() => {
match cause {
LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
LoadTimelineCause::ImportPgdata { .. } => {
unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3")
}
}
Entry::Vacant(v) => {
v.insert(Arc::clone(&timeline));
timeline.maybe_spawn_flush_loop();
let mut guard = self.timelines_creating.lock().unwrap();
if !guard.insert(timeline_id) {
// We should never try and load the same timeline twice during startup
unreachable!("Timeline {tenant_id}/{timeline_id} is already being created")
}
let timeline_create_guard = TimelineCreateGuard {
_tenant_gate_guard: self.gate.enter()?,
owning_tenant: self.clone(),
timeline_id,
idempotency,
// The users of this specific return value don't need the timline_path in there.
timeline_path: timeline
.conf
.timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id),
};
Ok(TimelineInitAndSyncResult::NeedsSpawnImportPgdata(
TimelineInitAndSyncNeedsSpawnImportPgdata {
timeline,
import_pgdata,
guard: timeline_create_guard,
},
))
}
};
Some(_) | None => {
{
let mut timelines_accessor = self.timelines.lock().unwrap();
match timelines_accessor.entry(timeline_id) {
// We should never try and load the same timeline twice during startup
Entry::Occupied(_) => {
unreachable!(
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
);
}
Entry::Vacant(v) => {
v.insert(Arc::clone(&timeline));
timeline.maybe_spawn_flush_loop();
}
}
}
// Sanity check: a timeline should have some content.
anyhow::ensure!(
ancestor.is_some()
|| timeline
.layers
.read()
.await
.layer_map()
.expect("currently loading, layer manager cannot be shutdown already")
.iter_historic_layers()
.next()
.is_some(),
"Timeline has no ancestor and no layer files"
);
// Sanity check: a timeline should have some content.
anyhow::ensure!(
ancestor.is_some()
|| timeline
.layers
.read()
.await
.layer_map()
.expect("currently loading, layer manager cannot be shutdown already")
.iter_historic_layers()
.next()
.is_some(),
"Timeline has no ancestor and no layer files"
);
Ok(())
match cause {
LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
LoadTimelineCause::ImportPgdata {
create_guard,
activate,
} => {
// TODO: see the comment in the task code above how I'm not so certain
// it is safe to activate here because of concurrent shutdowns.
match activate {
ActivateTimelineArgs::Yes { broker_client } => {
info!("activating timeline after reload from pgdata import task");
timeline.activate(self.clone(), broker_client, None, ctx);
}
ActivateTimelineArgs::No => (),
}
drop(create_guard);
}
}
Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline))
}
}
}
/// Attach a tenant that's available in cloud storage.
@@ -1578,24 +1671,46 @@ impl Tenant {
}
// TODO again handle early failure
self.load_remote_timeline(
timeline_id,
index_part,
remote_metadata,
TimelineResources {
remote_client,
timeline_get_throttle: self.timeline_get_throttle.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
},
ctx,
)
.await
.with_context(|| {
format!(
"failed to load remote timeline {} for tenant {}",
timeline_id, self.tenant_shard_id
let effect = self
.load_remote_timeline(
timeline_id,
index_part,
remote_metadata,
TimelineResources {
remote_client,
timeline_get_throttle: self.timeline_get_throttle.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
},
LoadTimelineCause::Attach,
ctx,
)
})?;
.await
.with_context(|| {
format!(
"failed to load remote timeline {} for tenant {}",
timeline_id, self.tenant_shard_id
)
})?;
match effect {
TimelineInitAndSyncResult::ReadyToActivate(_) => {
// activation happens later, on Tenant::activate
}
TimelineInitAndSyncResult::NeedsSpawnImportPgdata(
TimelineInitAndSyncNeedsSpawnImportPgdata {
timeline,
import_pgdata,
guard,
},
) => {
tokio::task::spawn(self.clone().create_timeline_import_pgdata_task(
timeline,
import_pgdata,
ActivateTimelineArgs::No,
guard,
));
}
}
}
// Walk through deleted timelines, resume deletion
@@ -1719,13 +1834,14 @@ impl Tenant {
#[instrument(skip_all, fields(timeline_id=%timeline_id))]
async fn load_remote_timeline(
&self,
self: &Arc<Self>,
timeline_id: TimelineId,
index_part: IndexPart,
remote_metadata: TimelineMetadata,
resources: TimelineResources,
cause: LoadTimelineCause,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> anyhow::Result<TimelineInitAndSyncResult> {
span::debug_assert_current_span_has_tenant_id();
info!("downloading index file for timeline {}", timeline_id);
@@ -1752,6 +1868,7 @@ impl Tenant {
index_part,
remote_metadata,
ancestor,
cause,
ctx,
)
.await
@@ -1938,6 +2055,7 @@ impl Tenant {
TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists"))
}
TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e),
TimelineExclusionError::ShuttingDown => TimelineArchivalError::Cancelled,
})?;
let timeline_preload = self
@@ -1976,6 +2094,7 @@ impl Tenant {
index_part,
remote_metadata,
timeline_resources,
LoadTimelineCause::Unoffload,
&ctx,
)
.await
@@ -2213,7 +2332,7 @@ impl Tenant {
///
/// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
pub(crate) async fn create_empty_timeline(
&self,
self: &Arc<Self>,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
@@ -2263,7 +2382,7 @@ impl Tenant {
// Our current tests don't need the background loops.
#[cfg(test)]
pub async fn create_test_timeline(
&self,
self: &Arc<Self>,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
@@ -2302,7 +2421,7 @@ impl Tenant {
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn create_test_timeline_with_layers(
&self,
self: &Arc<Self>,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
@@ -2439,6 +2558,16 @@ impl Tenant {
self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
.await?
}
CreateTimelineParams::ImportPgdata(params) => {
self.create_timeline_import_pgdata(
params,
ActivateTimelineArgs::Yes {
broker_client: broker_client.clone(),
},
ctx,
)
.await?
}
};
// At this point we have dropped our guard on [`Self::timelines_creating`], and
@@ -2481,11 +2610,202 @@ impl Tenant {
);
timeline
}
CreateTimelineResult::ImportSpawned(timeline) => {
info!("import task spawned, timeline will become visible and activated once the import is done");
timeline
}
};
Ok(activated_timeline)
}
/// The returned [`Arc<Timeline>`] is NOT in the [`Tenant::timelines`] map until the import
/// completes in the background. A DIFFERENT [`Arc<Timeline>`] will be inserted into the
/// [`Tenant::timelines`] map when the import completes.
/// We only return an [`Arc<Timeline>`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`]
/// for the response.
async fn create_timeline_import_pgdata(
self: &Arc<Tenant>,
params: CreateTimelineParamsImportPgdata,
activate: ActivateTimelineArgs,
ctx: &RequestContext,
) -> Result<CreateTimelineResult, CreateTimelineError> {
let CreateTimelineParamsImportPgdata {
new_timeline_id,
location,
idempotency_key,
} = params;
let started_at = chrono::Utc::now().naive_utc();
//
// There's probably a simpler way to upload an index part, but, remote_timeline_client
// is the canonical way we do it.
// - create an empty timeline in-memory
// - use its remote_timeline_client to do the upload
// - dispose of the uninit timeline
// - keep the creation guard alive
let timeline_create_guard = match self
.start_creating_timeline(
new_timeline_id,
CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata {
idempotency_key: idempotency_key.clone(),
}),
)
.await?
{
StartCreatingTimelineResult::CreateGuard(guard) => guard,
StartCreatingTimelineResult::Idempotent(timeline) => {
return Ok(CreateTimelineResult::Idempotent(timeline))
}
};
let mut uninit_timeline = {
let this = &self;
let initdb_lsn = Lsn(0);
let _ctx = ctx;
async move {
let new_metadata = TimelineMetadata::new(
// Initialize disk_consistent LSN to 0, The caller must import some data to
// make it valid, before calling finish_creation()
Lsn(0),
None,
None,
Lsn(0),
initdb_lsn,
initdb_lsn,
15,
);
this.prepare_new_timeline(
new_timeline_id,
&new_metadata,
timeline_create_guard,
initdb_lsn,
None,
)
.await
}
}
.await?;
let in_progress = import_pgdata::index_part_format::InProgress {
idempotency_key,
location,
started_at,
};
let index_part = import_pgdata::index_part_format::Root::V1(
import_pgdata::index_part_format::V1::InProgress(in_progress),
);
uninit_timeline
.raw_timeline()
.unwrap()
.remote_client
.schedule_index_upload_for_import_pgdata_state_update(Some(index_part.clone()))?;
// wait_completion happens in caller
let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself();
tokio::spawn(self.clone().create_timeline_import_pgdata_task(
timeline.clone(),
index_part,
activate,
timeline_create_guard,
));
// NB: the timeline doesn't exist in self.timelines at this point
Ok(CreateTimelineResult::ImportSpawned(timeline))
}
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))]
async fn create_timeline_import_pgdata_task(
self: Arc<Tenant>,
timeline: Arc<Timeline>,
index_part: import_pgdata::index_part_format::Root,
activate: ActivateTimelineArgs,
timeline_create_guard: TimelineCreateGuard,
) {
debug_assert_current_span_has_tenant_and_timeline_id();
info!("starting");
scopeguard::defer! {info!("exiting")};
let res = self
.create_timeline_import_pgdata_task_impl(
timeline,
index_part,
activate,
timeline_create_guard,
)
.await;
if let Err(err) = &res {
error!(?err, "task failed");
// TODO sleep & retry, sensitive to tenant shutdown
// TODO: allow timeline deletion requests => should cancel the task
}
}
async fn create_timeline_import_pgdata_task_impl(
self: Arc<Tenant>,
timeline: Arc<Timeline>,
index_part: import_pgdata::index_part_format::Root,
activate: ActivateTimelineArgs,
timeline_create_guard: TimelineCreateGuard,
) -> Result<(), anyhow::Error> {
let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn);
info!("importing pgdata");
import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone())
.await
.context("import")?;
info!("import done");
//
// Reload timeline from remote.
// This proves that the remote state is attachable, and it reuses the code.
//
// TODO: think about whether this is safe to do with concurrent Tenant::shutdown.
// timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit.
// But our activate() call might launch new background tasks after Tenant::shutdown
// already went past shutting down the Tenant::timelines, which this timeline here is no part of.
// I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting
// down while bootstrapping/branching + activating), but, the race condition is much more likely
// to manifest because of the long runtime of this import task.
// in theory this shouldn't even .await anything except for coop yield
info!("shutting down timeline");
timeline.shutdown(ShutdownMode::Hard).await;
info!("timeline shut down, reloading from remote");
// TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc<Timeline>
// let Some(timeline) = Arc::into_inner(timeline) else {
// anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere");
// };
let timeline_id = timeline.timeline_id;
// load from object storage like Tenant::attach does
let resources = self.build_timeline_resources(timeline_id);
let index_part = resources
.remote_client
.download_index_file(&self.cancel)
.await?;
let index_part = match index_part {
MaybeDeletedIndexPart::Deleted(_) => {
// likely concurrent delete call, cplane should prevent this
anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but")
}
MaybeDeletedIndexPart::IndexPart(p) => p,
};
let metadata = index_part.metadata.clone();
self
.load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{
create_guard: timeline_create_guard, activate, }, &ctx)
.await?
.ready_to_activate()
.context("implementation error: reloaded timeline still needs import after import reported success")?;
anyhow::Ok(())
}
pub(crate) async fn delete_timeline(
self: Arc<Self>,
timeline_id: TimelineId,
@@ -2895,6 +3215,18 @@ impl Tenant {
}
}
if let ShutdownMode::Reload = shutdown_mode {
tracing::info!("Flushing deletion queue");
if let Err(e) = self.deletion_queue_client.flush().await {
match e {
DeletionQueueError::ShuttingDown => {
// This is the only error we expect for now. In the future, if more error
// variants are added, we should handle them here.
}
}
}
}
// We cancel the Tenant's cancellation token _after_ the timelines have all shut down. This permits
// them to continue to do work during their shutdown methods, e.g. flushing data.
tracing::debug!("Cancelling CancellationToken");
@@ -3337,6 +3669,13 @@ where
Ok(result)
}
enum ActivateTimelineArgs {
Yes {
broker_client: storage_broker::BrokerClientChannel,
},
No,
}
impl Tenant {
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
self.tenant_conf.load().tenant_conf.clone()
@@ -3520,6 +3859,7 @@ impl Tenant {
/// `validate_ancestor == false` is used when a timeline is created for deletion
/// and we might not have the ancestor present anymore which is fine for to be
/// deleted timelines.
#[allow(clippy::too_many_arguments)]
fn create_timeline_struct(
&self,
new_timeline_id: TimelineId,
@@ -4283,16 +4623,17 @@ impl Tenant {
/// If the timeline was already created in the meantime, we check whether this
/// request conflicts or is idempotent , based on `state`.
async fn start_creating_timeline(
&self,
self: &Arc<Self>,
new_timeline_id: TimelineId,
idempotency: CreateTimelineIdempotency,
) -> Result<StartCreatingTimelineResult<'_>, CreateTimelineError> {
) -> Result<StartCreatingTimelineResult, CreateTimelineError> {
let allow_offloaded = false;
match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) {
Ok(create_guard) => {
pausable_failpoint!("timeline-creation-after-uninit");
Ok(StartCreatingTimelineResult::CreateGuard(create_guard))
}
Err(TimelineExclusionError::ShuttingDown) => Err(CreateTimelineError::ShuttingDown),
Err(TimelineExclusionError::AlreadyCreating) => {
// Creation is in progress, we cannot create it again, and we cannot
// check if this request matches the existing one, so caller must try
@@ -4582,7 +4923,7 @@ impl Tenant {
&'a self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
create_guard: TimelineCreateGuard<'a>,
create_guard: TimelineCreateGuard,
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
) -> anyhow::Result<UninitializedTimeline<'a>> {
@@ -4642,7 +4983,7 @@ impl Tenant {
/// The `allow_offloaded` parameter controls whether to tolerate the existence of
/// offloaded timelines or not.
fn create_timeline_create_guard(
&self,
self: &Arc<Self>,
timeline_id: TimelineId,
idempotency: CreateTimelineIdempotency,
allow_offloaded: bool,
@@ -4902,48 +5243,16 @@ async fn run_initdb(
let _permit = INIT_DB_SEMAPHORE.acquire().await;
let mut initdb_command = tokio::process::Command::new(&initdb_bin_path);
initdb_command
.args(["--pgdata", initdb_target_dir.as_ref()])
.args(["--username", &conf.superuser])
.args(["--encoding", "utf8"])
.args(["--locale", &conf.locale])
.arg("--no-instructions")
.arg("--no-sync")
.env_clear()
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
.stdin(std::process::Stdio::null())
// stdout invocation produces the same output every time, we don't need it
.stdout(std::process::Stdio::null())
// we would be interested in the stderr output, if there was any
.stderr(std::process::Stdio::piped());
// Before version 14, only the libc provide was available.
if pg_version > 14 {
// Version 17 brought with it a builtin locale provider which only provides
// C and C.UTF-8. While being safer for collation purposes since it is
// guaranteed to be consistent throughout a major release, it is also more
// performant.
let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
initdb_command.args(["--locale-provider", locale_provider]);
}
let initdb_proc = initdb_command.spawn()?;
// Ideally we'd select here with the cancellation token, but the problem is that
// we can't safely terminate initdb: it launches processes of its own, and killing
// initdb doesn't kill them. After we return from this function, we want the target
// directory to be able to be cleaned up.
// See https://github.com/neondatabase/neon/issues/6385
let initdb_output = initdb_proc.wait_with_output().await?;
if !initdb_output.status.success() {
return Err(InitdbError::Failed(
initdb_output.status,
initdb_output.stderr,
));
}
let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
superuser: &conf.superuser,
locale: &conf.locale,
initdb_bin: &initdb_bin_path,
pg_version,
library_search_path: &initdb_lib_dir,
pgdata: initdb_target_dir,
})
.await
.map_err(InitdbError::Inner);
// This isn't true cancellation support, see above. Still return an error to
// excercise the cancellation code path.
@@ -4951,7 +5260,7 @@ async fn run_initdb(
return Err(InitdbError::Cancelled);
}
Ok(())
res
}
/// Dump contents of a layer file to stdout.
@@ -5047,6 +5356,7 @@ pub(crate) mod harness {
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
timeline_offloading: Some(tenant_conf.timeline_offloading),
wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
}
}
}

View File

@@ -19,6 +19,7 @@ use serde_json::Value;
use std::num::NonZeroU64;
use std::time::Duration;
use utils::generation::Generation;
use utils::postgres_client::PostgresClientProtocol;
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub(crate) enum AttachmentMode {
@@ -353,6 +354,9 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub timeline_offloading: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
}
impl TenantConfOpt {
@@ -418,6 +422,9 @@ impl TenantConfOpt {
timeline_offloading: self
.lazy_slru_download
.unwrap_or(global_conf.timeline_offloading),
wal_receiver_protocol_override: self
.wal_receiver_protocol_override
.or(global_conf.wal_receiver_protocol_override),
}
}
}
@@ -472,6 +479,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
lsn_lease_length: value.lsn_lease_length.map(humantime),
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
timeline_offloading: value.timeline_offloading,
wal_receiver_protocol_override: value.wal_receiver_protocol_override,
}
}
}

View File

@@ -1960,7 +1960,7 @@ impl TenantManager {
attempt.before_reset_tenant();
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Flush).await {
match tenant.shutdown(progress, ShutdownMode::Reload).await {
Ok(()) => {
slot_guard.drop_old_value().expect("it was just shutdown");
}

View File

@@ -199,7 +199,7 @@ use utils::backoff::{
use utils::pausable_failpoint;
use utils::shard::ShardNumber;
use std::collections::{HashMap, VecDeque};
use std::collections::{HashMap, HashSet, VecDeque};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex, OnceLock};
use std::time::Duration;
@@ -223,7 +223,7 @@ use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::download::download_retry;
use crate::tenant::storage_layer::AsLayerDesc;
use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable};
use crate::tenant::TIMELINES_SEGMENT_NAME;
use crate::{
config::PageServerConf,
@@ -244,6 +244,7 @@ use self::index::IndexPart;
use super::config::AttachedLocationConfig;
use super::metadata::MetadataUpdate;
use super::storage_layer::{Layer, LayerName, ResidentLayer};
use super::timeline::import_pgdata;
use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
use super::{DeleteTimelineError, Generation};
@@ -813,6 +814,18 @@ impl RemoteTimelineClient {
Ok(need_wait)
}
/// Launch an index-file upload operation in the background, setting `import_pgdata` field.
pub(crate) fn schedule_index_upload_for_import_pgdata_state_update(
self: &Arc<Self>,
state: Option<import_pgdata::index_part_format::Root>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.dirty.import_pgdata = state;
self.schedule_index_upload(upload_queue)?;
Ok(())
}
///
/// Launch an index-file upload operation in the background, if necessary.
///
@@ -1090,7 +1103,7 @@ impl RemoteTimelineClient {
"scheduled layer file upload {layer}",
);
let op = UploadOp::UploadLayer(layer, metadata);
let op = UploadOp::UploadLayer(layer, metadata, None);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
}
@@ -1805,7 +1818,7 @@ impl RemoteTimelineClient {
// have finished.
upload_queue.inprogress_tasks.is_empty()
}
UploadOp::Delete(_) => {
UploadOp::Delete(..) => {
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
}
@@ -1833,19 +1846,32 @@ impl RemoteTimelineClient {
}
// We can launch this task. Remove it from the queue first.
let next_op = upload_queue.queued_operations.pop_front().unwrap();
let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
debug!("starting op: {}", next_op);
// Update the counters
match next_op {
UploadOp::UploadLayer(_, _) => {
// Update the counters and prepare
match &mut next_op {
UploadOp::UploadLayer(layer, meta, mode) => {
if upload_queue
.recently_deleted
.remove(&(layer.layer_desc().layer_name().clone(), meta.generation))
{
*mode = Some(OpType::FlushDeletion);
} else {
*mode = Some(OpType::MayReorder)
}
upload_queue.num_inprogress_layer_uploads += 1;
}
UploadOp::UploadMetadata { .. } => {
upload_queue.num_inprogress_metadata_uploads += 1;
}
UploadOp::Delete(_) => {
UploadOp::Delete(Delete { layers }) => {
for (name, meta) in layers {
upload_queue
.recently_deleted
.insert((name.clone(), meta.generation));
}
upload_queue.num_inprogress_deletions += 1;
}
UploadOp::Barrier(sender) => {
@@ -1921,7 +1947,66 @@ impl RemoteTimelineClient {
}
let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
if let Some(OpType::FlushDeletion) = mode {
if self.config.read().unwrap().block_deletions {
// Of course, this is not efficient... but usually the queue should be empty.
let mut queue_locked = self.upload_queue.lock().unwrap();
let mut detected = false;
if let Ok(queue) = queue_locked.initialized_mut() {
for list in queue.blocked_deletions.iter_mut() {
list.layers.retain(|(name, meta)| {
if name == &layer.layer_desc().layer_name()
&& meta.generation == layer_metadata.generation
{
detected = true;
// remove the layer from deletion queue
false
} else {
// keep the layer
true
}
});
}
}
if detected {
info!(
"cancelled blocked deletion of layer {} at gen {:?}",
layer.layer_desc().layer_name(),
layer_metadata.generation
);
}
} else {
// TODO: we did not guarantee that upload task starts after deletion task, so there could be possibly race conditions
// that we still get the layer deleted. But this only happens if someone creates a layer immediately after it's deleted,
// which is not possible in the current system.
info!(
"waiting for deletion queue flush to complete before uploading layer {} at gen {:?}",
layer.layer_desc().layer_name(),
layer_metadata.generation
);
{
// We are going to flush, we can clean up the recently deleted list.
let mut queue_locked = self.upload_queue.lock().unwrap();
if let Ok(queue) = queue_locked.initialized_mut() {
queue.recently_deleted.clear();
}
}
if let Err(e) = self.deletion_queue_client.flush_execute().await {
warn!(
"failed to flush the deletion queue before uploading layer {} at gen {:?}, still proceeding to upload: {e:#} ",
layer.layer_desc().layer_name(),
layer_metadata.generation
);
} else {
info!(
"done flushing deletion queue before uploading layer {} at gen {:?}",
layer.layer_desc().layer_name(),
layer_metadata.generation
);
}
}
}
let local_path = layer.local_path();
// We should only be uploading layers created by this `Tenant`'s lifetime, so
@@ -2085,7 +2170,7 @@ impl RemoteTimelineClient {
upload_queue.inprogress_tasks.remove(&task.task_id);
let lsn_update = match task.op {
UploadOp::UploadLayer(_, _) => {
UploadOp::UploadLayer(_, _, _) => {
upload_queue.num_inprogress_layer_uploads -= 1;
None
}
@@ -2162,7 +2247,7 @@ impl RemoteTimelineClient {
)> {
use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
let res = match op {
UploadOp::UploadLayer(_, m) => (
UploadOp::UploadLayer(_, m, _) => (
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
@@ -2259,6 +2344,7 @@ impl RemoteTimelineClient {
blocked_deletions: Vec::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
recently_deleted: HashSet::new(),
};
let upload_queue = std::mem::replace(

View File

@@ -706,7 +706,7 @@ where
.and_then(|x| x)
}
async fn download_retry_forever<T, O, F>(
pub(crate) async fn download_retry_forever<T, O, F>(
op: O,
description: &str,
cancel: &CancellationToken,

View File

@@ -12,6 +12,7 @@ use utils::id::TimelineId;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::timeline::import_pgdata;
use crate::tenant::Generation;
use pageserver_api::shard::ShardIndex;
@@ -37,6 +38,13 @@ pub struct IndexPart {
#[serde(skip_serializing_if = "Option::is_none")]
pub archived_at: Option<NaiveDateTime>,
/// This field supports import-from-pgdata ("fast imports" platform feature).
/// We don't currently use fast imports, so, this field is None for all production timelines.
/// See <https://github.com/neondatabase/neon/pull/9218> for more information.
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub import_pgdata: Option<import_pgdata::index_part_format::Root>,
/// Per layer file name metadata, which can be present for a present or missing layer file.
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -90,10 +98,11 @@ impl IndexPart {
/// - 7: metadata_bytes is no longer written, but still read
/// - 8: added `archived_at`
/// - 9: +gc_blocking
const LATEST_VERSION: usize = 9;
/// - 10: +import_pgdata
const LATEST_VERSION: usize = 10;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
pub const FILE_NAME: &'static str = "index_part.json";
@@ -108,6 +117,7 @@ impl IndexPart {
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: None,
import_pgdata: None,
}
}
@@ -381,6 +391,7 @@ mod tests {
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
import_pgdata: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -425,6 +436,7 @@ mod tests {
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
import_pgdata: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -470,6 +482,7 @@ mod tests {
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
import_pgdata: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -518,6 +531,7 @@ mod tests {
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
import_pgdata: None,
};
let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -561,6 +575,7 @@ mod tests {
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
import_pgdata: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -607,6 +622,7 @@ mod tests {
},
gc_blocking: None,
last_aux_file_policy: None,
import_pgdata: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -658,6 +674,7 @@ mod tests {
},
gc_blocking: None,
last_aux_file_policy: Some(AuxFilePolicy::V2),
import_pgdata: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -714,6 +731,7 @@ mod tests {
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: Default::default(),
import_pgdata: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -771,6 +789,7 @@ mod tests {
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: Default::default(),
import_pgdata: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -833,6 +852,83 @@ mod tests {
}),
last_aux_file_policy: Default::default(),
archived_at: None,
import_pgdata: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v10_importpgdata_is_parsed() {
let example = r#"{
"version": 10,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata": {
"disk_consistent_lsn": "0/16960E8",
"prev_record_lsn": "0/1696070",
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
"ancestor_lsn": "0/0",
"latest_gc_cutoff_lsn": "0/1696070",
"initdb_lsn": "0/1696070",
"pg_version": 14
},
"gc_blocking": {
"started_at": "2024-07-19T09:00:00.123",
"reasons": ["DetachAncestor"]
},
"import_pgdata": {
"V1": {
"Done": {
"idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5",
"started_at": "2024-11-13T09:23:42.123",
"finished_at": "2024-11-13T09:42:23.123"
}
}
}
}"#;
let expected = IndexPart {
version: 10,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::new(
Lsn::from_str("0/16960E8").unwrap(),
Some(Lsn::from_str("0/1696070").unwrap()),
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
).with_recalculated_checksum().unwrap(),
deleted_at: None,
lineage: Default::default(),
gc_blocking: Some(GcBlocking {
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
}),
last_aux_file_policy: Default::default(),
archived_at: None,
import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{
started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
})))
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();

View File

@@ -111,15 +111,6 @@ pub(crate) struct SecondaryTenant {
pub(super) heatmap_total_size_metric: UIntGauge,
}
impl Drop for SecondaryTenant {
fn drop(&mut self) {
let tenant_id = self.tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
}
}
impl SecondaryTenant {
pub(crate) fn new(
tenant_shard_id: TenantShardId,
@@ -167,6 +158,13 @@ impl SecondaryTenant {
// Wait for any secondary downloader work to complete
self.gate.close().await;
self.validate_metrics();
let tenant_id = self.tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
}
pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
@@ -254,6 +252,20 @@ impl SecondaryTenant {
.await
.expect("secondary eviction should not have panicked");
}
/// Exhaustive check that incrementally updated metrics match the actual state.
#[cfg(feature = "testing")]
fn validate_metrics(&self) {
let detail = self.detail.lock().unwrap();
let resident_size = detail.total_resident_size();
assert_eq!(resident_size, self.resident_size_metric.get());
}
#[cfg(not(feature = "testing"))]
fn validate_metrics(&self) {
// No-op in non-testing builds
}
}
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,

View File

@@ -242,6 +242,19 @@ impl SecondaryDetail {
}
}
#[cfg(feature = "testing")]
pub(crate) fn total_resident_size(&self) -> u64 {
self.timelines
.values()
.map(|tl| {
tl.on_disk_layers
.values()
.map(|v| v.metadata.file_size)
.sum::<u64>()
})
.sum::<u64>()
}
pub(super) fn evict_layer(
&mut self,
name: LayerName,
@@ -763,24 +776,7 @@ impl<'a> TenantDownloader<'a> {
}
// Metrics consistency check in testing builds
if cfg!(feature = "testing") {
let detail = self.secondary_state.detail.lock().unwrap();
let resident_size = detail
.timelines
.values()
.map(|tl| {
tl.on_disk_layers
.values()
.map(|v| v.metadata.file_size)
.sum::<u64>()
})
.sum::<u64>();
assert_eq!(
resident_size,
self.secondary_state.resident_size_metric.get()
);
}
self.secondary_state.validate_metrics();
// Only update last_etag after a full successful download: this way will not skip
// the next download, even if the heatmap's actual etag is unchanged.
self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {

View File

@@ -4,6 +4,7 @@ pub mod delete;
pub(crate) mod detach_ancestor;
mod eviction_task;
pub(crate) mod handle;
pub(crate) mod import_pgdata;
mod init;
pub mod layer_manager;
pub(crate) mod logical_size;
@@ -49,6 +50,7 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{
fs_ext, pausable_failpoint,
postgres_client::PostgresClientProtocol,
sync::gate::{Gate, GateGuard},
};
use wal_decoder::serialized_batch::SerializedValueBatch;
@@ -892,10 +894,11 @@ pub(crate) enum ShutdownMode {
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
/// the call to [`Timeline::shutdown`].
FreezeAndFlush,
/// Only flush the layers to the remote storage without freezing any open layers. This is the
/// mode used by ancestor detach and any other operations that reloads a tenant but not increasing
/// the generation number.
Flush,
/// Only flush the layers to the remote storage without freezing any open layers. Flush the deletion
/// queue. This is the mode used by ancestor detach and any other operations that reloads a tenant
/// but not increasing the generation number. Note that this mode cannot be used at tenant shutdown,
/// as flushing the deletion queue at that time will cause shutdown-in-progress errors.
Reload,
/// Shut down immediately, without waiting for any open layers to flush.
Hard,
}
@@ -1816,7 +1819,7 @@ impl Timeline {
}
}
if let ShutdownMode::Flush = mode {
if let ShutdownMode::Reload = mode {
// drain the upload queue
self.remote_client.shutdown().await;
if !self.remote_client.no_pending_work() {
@@ -2085,6 +2088,11 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts)
}
pub(crate) fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf.is_gc_blocked_by_lsn_lease_deadline()
}
pub(crate) fn get_lazy_slru_download(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2172,6 +2180,21 @@ impl Timeline {
)
}
/// Resolve the effective WAL receiver protocol to use for this tenant.
///
/// Priority order is:
/// 1. Tenant config override
/// 2. Default value for tenant config override
/// 3. Pageserver config override
/// 4. Pageserver config default
pub fn resolve_wal_receiver_protocol(&self) -> PostgresClientProtocol {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.wal_receiver_protocol_override
.or(self.conf.default_tenant_conf.wal_receiver_protocol_override)
.unwrap_or(self.conf.wal_receiver_protocol)
}
pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
// NB: Most tenant conf options are read by background loops, so,
// changes will automatically be picked up.
@@ -2464,6 +2487,7 @@ impl Timeline {
*guard = Some(WalReceiver::start(
Arc::clone(self),
WalReceiverConf {
protocol: self.resolve_wal_receiver_protocol(),
wal_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,
@@ -2647,6 +2671,7 @@ impl Timeline {
//
// NB: generation numbers naturally protect against this because they disambiguate
// (1) and (4)
// TODO: this is basically a no-op now, should we remove it?
self.remote_client.schedule_barrier()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
@@ -2702,20 +2727,23 @@ impl Timeline {
{
Some(cancel) => cancel.cancel(),
None => {
let state = self.current_state();
if matches!(
state,
TimelineState::Broken { .. } | TimelineState::Stopping
) {
// Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
// Don't make noise.
} else {
warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
debug_assert!(false);
match self.current_state() {
TimelineState::Broken { .. } | TimelineState::Stopping => {
// Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
// Don't make noise.
}
TimelineState::Loading => {
// Import does not return an activated timeline.
info!("discarding priority boost for logical size calculation because timeline is not yet active");
}
TimelineState::Active => {
// activation should be setting the once cell
warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
debug_assert!(false);
}
}
}
};
}
}
}
@@ -3819,7 +3847,8 @@ impl Timeline {
};
// Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
// This makes us refuse ingest until the new layers have been persisted to the remote.
// This makes us refuse ingest until the new layers have been persisted to the remote
let start = Instant::now();
self.remote_client
.wait_completion()
.await
@@ -3832,6 +3861,8 @@ impl Timeline {
FlushLayerError::Other(anyhow!(e).into())
}
})?;
let duration = start.elapsed().as_secs_f64();
self.metrics.flush_wait_upload_time_gauge_add(duration);
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
// a compaction can delete the file and then it won't be available for uploads any more.
@@ -5886,7 +5917,7 @@ impl<'a> TimelineWriter<'a> {
batch: SerializedValueBatch,
ctx: &RequestContext,
) -> anyhow::Result<()> {
if batch.is_empty() {
if !batch.has_data() {
return Ok(());
}

View File

@@ -0,0 +1,218 @@
use std::sync::Arc;
use anyhow::{bail, Context};
use remote_storage::RemotePath;
use tokio_util::sync::CancellationToken;
use tracing::{info, info_span, Instrument};
use utils::lsn::Lsn;
use crate::{context::RequestContext, tenant::metadata::TimelineMetadata};
use super::Timeline;
mod flow;
mod importbucket_client;
mod importbucket_format;
pub(crate) mod index_part_format;
pub(crate) mod upcall_api;
pub async fn doit(
timeline: &Arc<Timeline>,
index_part: index_part_format::Root,
ctx: &RequestContext,
cancel: CancellationToken,
) -> anyhow::Result<()> {
let index_part_format::Root::V1(v1) = index_part;
let index_part_format::InProgress {
location,
idempotency_key,
started_at,
} = match v1 {
index_part_format::V1::Done(_) => return Ok(()),
index_part_format::V1::InProgress(in_progress) => in_progress,
};
let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;
info!("get spec early so we know we'll be able to upcall when done");
let Some(spec) = storage.get_spec().await? else {
bail!("spec not found")
};
let upcall_client =
upcall_api::Client::new(timeline.conf, cancel.clone()).context("create upcall client")?;
//
// send an early progress update to clean up k8s job early and generate potentially useful logs
//
info!("send early progress update");
upcall_client
.send_progress_until_success(&spec)
.instrument(info_span!("early_progress_update"))
.await?;
let status_prefix = RemotePath::from_string("status").unwrap();
//
// See if shard is done.
// TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing.
//
let shard_status_key =
status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug()));
let shard_status: Option<importbucket_format::ShardStatus> =
storage.get_json(&shard_status_key).await?;
info!(?shard_status, "peeking shard status");
if shard_status.map(|st| st.done).unwrap_or(false) {
info!("shard status indicates that the shard is done, skipping import");
} else {
// TODO: checkpoint the progress into the IndexPart instead of restarting
// from the beginning.
//
// Wipe the slate clean - the flow does not allow resuming.
// We can implement resuming in the future by checkpointing the progress into the IndexPart.
//
info!("wipe the slate clean");
{
// TODO: do we need to hold GC lock for this?
let mut guard = timeline.layers.write().await;
assert!(
guard.layer_map()?.open_layer.is_none(),
"while importing, there should be no in-memory layer" // this just seems like a good place to assert it
);
let all_layers_keys = guard.all_persistent_layers();
let all_layers: Vec<_> = all_layers_keys
.iter()
.map(|key| guard.get_from_key(key))
.collect();
let open = guard.open_mut().context("open_mut")?;
timeline.remote_client.schedule_gc_update(&all_layers)?;
open.finish_gc_timeline(&all_layers);
}
//
// Wait for pgdata to finish uploading
//
info!("wait for pgdata to reach status 'done'");
let pgdata_status_key = status_prefix.join("pgdata");
loop {
let res = async {
let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
.get_json(&pgdata_status_key)
.await
.context("get pgdata status")?;
info!(?pgdata_status, "peeking pgdata status");
if pgdata_status.map(|st| st.done).unwrap_or(false) {
Ok(())
} else {
Err(anyhow::anyhow!("pgdata not done yet"))
}
}
.await;
match res {
Ok(_) => break,
Err(err) => {
info!(?err, "indefintely waiting for pgdata to finish");
if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled())
.await
.is_ok()
{
bail!("cancelled while waiting for pgdata");
}
}
}
}
//
// Do the import
//
info!("do the import");
let control_file = storage.get_control_file().await?;
let base_lsn = control_file.base_lsn();
info!("update TimelineMetadata based on LSNs from control file");
{
let pg_version = control_file.pg_version();
let _ctx: &RequestContext = ctx;
async move {
// FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
// checkpoint record, and prev_record_lsn should point to its beginning.
// We should read the real end of the record from the WAL, but here we
// just fake it.
let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
let prev_record_lsn = base_lsn;
let metadata = TimelineMetadata::new(
disk_consistent_lsn,
Some(prev_record_lsn),
None, // no ancestor
Lsn(0), // no ancestor lsn
base_lsn, // latest_gc_cutoff_lsn
base_lsn, // initdb_lsn
pg_version,
);
let _start_lsn = disk_consistent_lsn + 1;
timeline
.remote_client
.schedule_index_upload_for_full_metadata_update(&metadata)?;
timeline.remote_client.wait_completion().await?;
anyhow::Ok(())
}
}
.await?;
flow::run(
timeline.clone(),
base_lsn,
control_file,
storage.clone(),
ctx,
)
.await?;
//
// Communicate that shard is done.
//
storage
.put_json(
&shard_status_key,
&importbucket_format::ShardStatus { done: true },
)
.await
.context("put shard status")?;
}
//
// Ensure at-least-once deliver of the upcall to cplane
// before we mark the task as done and never come here again.
//
info!("send final progress update");
upcall_client
.send_progress_until_success(&spec)
.instrument(info_span!("final_progress_update"))
.await?;
//
// Mark as done in index_part.
// This makes subsequent timeline loads enter the normal load code path
// instead of spawning the import task and calling this here function.
//
info!("mark import as complete in index part");
timeline
.remote_client
.schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1(
index_part_format::V1::Done(index_part_format::Done {
idempotency_key,
started_at,
finished_at: chrono::Utc::now().naive_utc(),
}),
)))?;
timeline.remote_client.wait_completion().await?;
Ok(())
}

View File

@@ -0,0 +1,798 @@
//! Import a PGDATA directory into an empty root timeline.
//!
//! This module is adapted hackathon code by Heikki and Stas.
//! Other code in the parent module was written by Christian as part of a customer PoC.
//!
//! The hackathon code was producing image layer files as a free-standing program.
//!
//! It has been modified to
//! - run inside a running Pageserver, within the proper lifecycles of Timeline -> Tenant(Shard)
//! - => sharding-awareness: produce image layers with only the data relevant for this shard
//! - => S3 as the source for the PGDATA instead of local filesystem
//!
//! TODOs before productionization:
//! - ChunkProcessingJob size / ImportJob::total_size does not account for sharding.
//! => produced image layers likely too small.
//! - ChunkProcessingJob should cut up an ImportJob to hit exactly target image layer size.
//! - asserts / unwraps need to be replaced with errors
//! - don't trust remote objects will be small (=prevent OOMs in those cases)
//! - limit all in-memory buffers in size, or download to disk and read from there
//! - limit task concurrency
//! - generally play nice with other tenants in the system
//! - importbucket is different bucket than main pageserver storage, so, should be fine wrt S3 rate limits
//! - but concerns like network bandwidth, local disk write bandwidth, local disk capacity, etc
//! - integrate with layer eviction system
//! - audit for Tenant::cancel nor Timeline::cancel responsivity
//! - audit for Tenant/Timeline gate holding (we spawn tokio tasks during this flow!)
//!
//! An incomplete set of TODOs from the Hackathon:
//! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest)
use std::sync::Arc;
use anyhow::{bail, ensure};
use bytes::Bytes;
use itertools::Itertools;
use pageserver_api::{
key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY},
reltag::RelTag,
shard::ShardIdentity,
};
use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ};
use tokio::task::JoinSet;
use tracing::{debug, info_span, instrument, Instrument};
use crate::{
assert_u64_eq_usize::UsizeIsU64,
pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory},
};
use crate::{
context::{DownloadBehavior, RequestContext},
pgdatadir_mapping::{DbDirectory, RelDirectory},
task_mgr::TaskKind,
tenant::storage_layer::{ImageLayerWriter, Layer},
};
use pageserver_api::key::Key;
use pageserver_api::key::{
slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY,
TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::singleton_range;
use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range};
use pageserver_api::reltag::SlruKind;
use utils::bin_ser::BeSer;
use utils::lsn::Lsn;
use std::collections::HashSet;
use std::ops::Range;
use super::{
importbucket_client::{ControlFile, RemoteStorageWrapper},
Timeline,
};
use remote_storage::RemotePath;
pub async fn run(
timeline: Arc<Timeline>,
pgdata_lsn: Lsn,
control_file: ControlFile,
storage: RemoteStorageWrapper,
ctx: &RequestContext,
) -> anyhow::Result<()> {
Flow {
timeline,
pgdata_lsn,
control_file,
tasks: Vec::new(),
storage,
}
.run(ctx)
.await
}
struct Flow {
timeline: Arc<Timeline>,
pgdata_lsn: Lsn,
control_file: ControlFile,
tasks: Vec<AnyImportTask>,
storage: RemoteStorageWrapper,
}
impl Flow {
/// Perform the ingestion into [`Self::timeline`].
/// Assumes the timeline is empty (= no layers).
pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> {
let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align();
self.pgdata_lsn = pgdata_lsn;
let datadir = PgDataDir::new(&self.storage).await?;
// Import dbdir (00:00:00 keyspace)
// This is just constructed here, but will be written to the image layer in the first call to import_db()
let dbdir_buf = Bytes::from(DbDirectory::ser(&DbDirectory {
dbdirs: datadir
.dbs
.iter()
.map(|db| ((db.spcnode, db.dboid), true))
.collect(),
})?);
self.tasks
.push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into());
// Import databases (00:spcnode:dbnode keyspace for each db)
for db in datadir.dbs {
self.import_db(&db).await?;
}
// Import SLRUs
// pg_xact (01:00 keyspace)
self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact"))
.await?;
// pg_multixact/members (01:01 keyspace)
self.import_slru(
SlruKind::MultiXactMembers,
&self.storage.pgdata().join("pg_multixact/members"),
)
.await?;
// pg_multixact/offsets (01:02 keyspace)
self.import_slru(
SlruKind::MultiXactOffsets,
&self.storage.pgdata().join("pg_multixact/offsets"),
)
.await?;
// Import pg_twophase.
// TODO: as empty
let twophasedir_buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
xids: HashSet::new(),
})?;
self.tasks
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
TWOPHASEDIR_KEY,
Bytes::from(twophasedir_buf),
)));
// Controlfile, checkpoint
self.tasks
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
CONTROLFILE_KEY,
self.control_file.control_file_buf().clone(),
)));
let checkpoint_buf = self
.control_file
.control_file_data()
.checkPointCopy
.encode()?;
self.tasks
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
CHECKPOINT_KEY,
checkpoint_buf,
)));
// Assigns parts of key space to later parallel jobs
let mut last_end_key = Key::MIN;
let mut current_chunk = Vec::new();
let mut current_chunk_size: usize = 0;
let mut parallel_jobs = Vec::new();
for task in std::mem::take(&mut self.tasks).into_iter() {
if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 {
let key_range = last_end_key..task.key_range().start;
parallel_jobs.push(ChunkProcessingJob::new(
key_range.clone(),
std::mem::take(&mut current_chunk),
&self,
));
last_end_key = key_range.end;
current_chunk_size = 0;
}
current_chunk_size += task.total_size();
current_chunk.push(task);
}
parallel_jobs.push(ChunkProcessingJob::new(
last_end_key..Key::MAX,
current_chunk,
&self,
));
// Start all jobs simultaneosly
let mut work = JoinSet::new();
// TODO: semaphore?
for job in parallel_jobs {
let ctx: RequestContext =
ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error);
work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job")));
}
let mut results = Vec::new();
while let Some(result) = work.join_next().await {
match result {
Ok(res) => {
results.push(res);
}
Err(_joinset_err) => {
results.push(Err(anyhow::anyhow!(
"parallel job panicked or cancelled, check pageserver logs"
)));
}
}
}
if results.iter().all(|r| r.is_ok()) {
Ok(())
} else {
let mut msg = String::new();
for result in results {
if let Err(err) = result {
msg.push_str(&format!("{err:?}\n\n"));
}
}
bail!("Some parallel jobs failed:\n\n{msg}");
}
}
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))]
async fn import_db(&mut self, db: &PgDataDirDb) -> anyhow::Result<()> {
debug!("start");
scopeguard::defer! {
debug!("return");
}
// Import relmap (00:spcnode:dbnode:00:*:00)
let relmap_key = relmap_file_key(db.spcnode, db.dboid);
debug!("Constructing relmap entry, key {relmap_key}");
let relmap_path = db.path.join("pg_filenode.map");
let relmap_buf = self.storage.get(&relmap_path).await?;
self.tasks
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
relmap_key, relmap_buf,
)));
// Import reldir (00:spcnode:dbnode:00:*:01)
let reldir_key = rel_dir_to_key(db.spcnode, db.dboid);
debug!("Constructing reldirs entry, key {reldir_key}");
let reldir_buf = RelDirectory::ser(&RelDirectory {
rels: db
.files
.iter()
.map(|f| (f.rel_tag.relnode, f.rel_tag.forknum))
.collect(),
})?;
self.tasks
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
reldir_key,
Bytes::from(reldir_buf),
)));
// Import data (00:spcnode:dbnode:reloid:fork:blk) and set sizes for each last
// segment in a given relation (00:spcnode:dbnode:reloid:fork:ff)
for file in &db.files {
debug!(%file.path, %file.filesize, "importing file");
let len = file.filesize;
ensure!(len % 8192 == 0);
let start_blk: u32 = file.segno * (1024 * 1024 * 1024 / 8192);
let start_key = rel_block_to_key(file.rel_tag, start_blk);
let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32);
self.tasks
.push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new(
*self.timeline.get_shard_identity(),
start_key..end_key,
&file.path,
self.storage.clone(),
)));
// Set relsize for the last segment (00:spcnode:dbnode:reloid:fork:ff)
if let Some(nblocks) = file.nblocks {
let size_key = rel_size_to_key(file.rel_tag);
//debug!("Setting relation size (path={path}, rel_tag={rel_tag}, segno={segno}) to {nblocks}, key {size_key}");
let buf = nblocks.to_le_bytes();
self.tasks
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
size_key,
Bytes::from(buf.to_vec()),
)));
}
}
Ok(())
}
async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> {
let segments = self.storage.listfilesindir(path).await?;
let segments: Vec<(String, u32, usize)> = segments
.into_iter()
.filter_map(|(path, size)| {
let filename = path.object_name()?;
let segno = u32::from_str_radix(filename, 16).ok()?;
Some((filename.to_string(), segno, size))
})
.collect();
// Write SlruDir
let slrudir_key = slru_dir_to_key(kind);
let segnos: HashSet<u32> = segments
.iter()
.map(|(_path, segno, _size)| *segno)
.collect();
let slrudir = SlruSegmentDirectory { segments: segnos };
let slrudir_buf = SlruSegmentDirectory::ser(&slrudir)?;
self.tasks
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
slrudir_key,
Bytes::from(slrudir_buf),
)));
for (segpath, segno, size) in segments {
// SlruSegBlocks for each segment
let p = path.join(&segpath);
let file_size = size;
ensure!(file_size % 8192 == 0);
let nblocks = u32::try_from(file_size / 8192)?;
let start_key = slru_block_to_key(kind, segno, 0);
let end_key = slru_block_to_key(kind, segno, nblocks);
debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment");
self.tasks
.push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new(
*self.timeline.get_shard_identity(),
start_key..end_key,
&p,
self.storage.clone(),
)));
// Followed by SlruSegSize
let segsize_key = slru_segment_size_to_key(kind, segno);
let segsize_buf = nblocks.to_le_bytes();
self.tasks
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
segsize_key,
Bytes::copy_from_slice(&segsize_buf),
)));
}
Ok(())
}
}
//
// dbdir iteration tools
//
struct PgDataDir {
pub dbs: Vec<PgDataDirDb>, // spcnode, dboid, path
}
struct PgDataDirDb {
pub spcnode: u32,
pub dboid: u32,
pub path: RemotePath,
pub files: Vec<PgDataDirDbFile>,
}
struct PgDataDirDbFile {
pub path: RemotePath,
pub rel_tag: RelTag,
pub segno: u32,
pub filesize: usize,
// Cummulative size of the given fork, set only for the last segment of that fork
pub nblocks: Option<usize>,
}
impl PgDataDir {
async fn new(storage: &RemoteStorageWrapper) -> anyhow::Result<Self> {
let datadir_path = storage.pgdata();
// Import ordinary databases, DEFAULTTABLESPACE_OID is smaller than GLOBALTABLESPACE_OID, so import them first
// Traverse database in increasing oid order
let basedir = &datadir_path.join("base");
let db_oids: Vec<_> = storage
.listdir(basedir)
.await?
.into_iter()
.filter_map(|path| path.object_name().and_then(|name| name.parse::<u32>().ok()))
.sorted()
.collect();
debug!(?db_oids, "found databases");
let mut databases = Vec::new();
for dboid in db_oids {
databases.push(
PgDataDirDb::new(
storage,
&basedir.join(dboid.to_string()),
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
&datadir_path,
)
.await?,
);
}
// special case for global catalogs
databases.push(
PgDataDirDb::new(
storage,
&datadir_path.join("global"),
postgres_ffi::pg_constants::GLOBALTABLESPACE_OID,
0,
&datadir_path,
)
.await?,
);
databases.sort_by_key(|db| (db.spcnode, db.dboid));
Ok(Self { dbs: databases })
}
}
impl PgDataDirDb {
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%dboid, %db_path))]
async fn new(
storage: &RemoteStorageWrapper,
db_path: &RemotePath,
spcnode: u32,
dboid: u32,
datadir_path: &RemotePath,
) -> anyhow::Result<Self> {
let mut files: Vec<PgDataDirDbFile> = storage
.listfilesindir(db_path)
.await?
.into_iter()
.filter_map(|(path, size)| {
debug!(%path, %size, "found file in dbdir");
path.object_name().and_then(|name| {
// returns (relnode, forknum, segno)
parse_relfilename(name).ok().map(|x| (size, x))
})
})
.sorted_by_key(|(_, relfilename)| *relfilename)
.map(|(filesize, (relnode, forknum, segno))| {
let rel_tag = RelTag {
spcnode,
dbnode: dboid,
relnode,
forknum,
};
let path = datadir_path.join(rel_tag.to_segfile_name(segno));
assert!(filesize % BLCKSZ as usize == 0); // TODO: this should result in an error
let nblocks = filesize / BLCKSZ as usize;
PgDataDirDbFile {
path,
filesize,
rel_tag,
segno,
nblocks: Some(nblocks), // first non-cummulative sizes
}
})
.collect();
// Set cummulative sizes. Do all of that math here, so that later we could easier
// parallelize over segments and know with which segments we need to write relsize
// entry.
let mut cumulative_nblocks: usize = 0;
let mut prev_rel_tag: Option<RelTag> = None;
for i in 0..files.len() {
if prev_rel_tag == Some(files[i].rel_tag) {
cumulative_nblocks += files[i].nblocks.unwrap();
} else {
cumulative_nblocks = files[i].nblocks.unwrap();
}
files[i].nblocks = if i == files.len() - 1 || files[i + 1].rel_tag != files[i].rel_tag {
Some(cumulative_nblocks)
} else {
None
};
prev_rel_tag = Some(files[i].rel_tag);
}
Ok(PgDataDirDb {
files,
path: db_path.clone(),
spcnode,
dboid,
})
}
}
trait ImportTask {
fn key_range(&self) -> Range<Key>;
fn total_size(&self) -> usize {
// TODO: revisit this
if is_contiguous_range(&self.key_range()) {
contiguous_range_len(&self.key_range()) as usize * 8192
} else {
u32::MAX as usize
}
}
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
ctx: &RequestContext,
) -> anyhow::Result<usize>;
}
struct ImportSingleKeyTask {
key: Key,
buf: Bytes,
}
impl ImportSingleKeyTask {
fn new(key: Key, buf: Bytes) -> Self {
ImportSingleKeyTask { key, buf }
}
}
impl ImportTask for ImportSingleKeyTask {
fn key_range(&self) -> Range<Key> {
singleton_range(self.key)
}
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
layer_writer.put_image(self.key, self.buf, ctx).await?;
Ok(1)
}
}
struct ImportRelBlocksTask {
shard_identity: ShardIdentity,
key_range: Range<Key>,
path: RemotePath,
storage: RemoteStorageWrapper,
}
impl ImportRelBlocksTask {
fn new(
shard_identity: ShardIdentity,
key_range: Range<Key>,
path: &RemotePath,
storage: RemoteStorageWrapper,
) -> Self {
ImportRelBlocksTask {
shard_identity,
key_range,
path: path.clone(),
storage,
}
}
}
impl ImportTask for ImportRelBlocksTask {
fn key_range(&self) -> Range<Key> {
self.key_range.clone()
}
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%self.path))]
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
debug!("Importing relation file");
let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?;
let (rel_tag_end, end_blk) = self.key_range.end.to_rel_block()?;
assert_eq!(rel_tag, rel_tag_end);
let ranges = (start_blk..end_blk)
.enumerate()
.filter_map(|(i, blknum)| {
let key = rel_block_to_key(rel_tag, blknum);
if self.shard_identity.is_key_disposable(&key) {
return None;
}
let file_offset = i.checked_mul(8192).unwrap();
Some((
vec![key],
file_offset,
file_offset.checked_add(8192).unwrap(),
))
})
.coalesce(|(mut acc, acc_start, acc_end), (mut key, start, end)| {
assert_eq!(key.len(), 1);
assert!(!acc.is_empty());
assert!(acc_end > acc_start);
if acc_end == start /* TODO additional max range check here, to limit memory consumption per task to X */ {
acc.push(key.pop().unwrap());
Ok((acc, acc_start, end))
} else {
Err(((acc, acc_start, acc_end), (key, start, end)))
}
});
let mut nimages = 0;
for (keys, range_start, range_end) in ranges {
let range_buf = self
.storage
.get_range(&self.path, range_start.into_u64(), range_end.into_u64())
.await?;
let mut buf = Bytes::from(range_buf);
// TODO: batched writes
for key in keys {
let image = buf.split_to(8192);
layer_writer.put_image(key, image, ctx).await?;
nimages += 1;
}
}
Ok(nimages)
}
}
struct ImportSlruBlocksTask {
shard_identity: ShardIdentity,
key_range: Range<Key>,
path: RemotePath,
storage: RemoteStorageWrapper,
}
impl ImportSlruBlocksTask {
fn new(
shard_identity: ShardIdentity,
key_range: Range<Key>,
path: &RemotePath,
storage: RemoteStorageWrapper,
) -> Self {
ImportSlruBlocksTask {
shard_identity,
key_range,
path: path.clone(),
storage,
}
}
}
impl ImportTask for ImportSlruBlocksTask {
fn key_range(&self) -> Range<Key> {
self.key_range.clone()
}
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
debug!("Importing SLRU segment file {}", self.path);
let buf = self.storage.get(&self.path).await?;
let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?;
let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?;
let mut blknum = start_blk;
let mut nimages = 0;
let mut file_offset = 0;
while blknum < end_blk {
let key = slru_block_to_key(kind, segno, blknum);
assert!(
!self.shard_identity.is_key_disposable(&key),
"SLRU keys need to go into every shard"
);
let buf = &buf[file_offset..(file_offset + 8192)];
file_offset += 8192;
layer_writer
.put_image(key, Bytes::copy_from_slice(buf), ctx)
.await?;
blknum += 1;
nimages += 1;
}
Ok(nimages)
}
}
enum AnyImportTask {
SingleKey(ImportSingleKeyTask),
RelBlocks(ImportRelBlocksTask),
SlruBlocks(ImportSlruBlocksTask),
}
impl ImportTask for AnyImportTask {
fn key_range(&self) -> Range<Key> {
match self {
Self::SingleKey(t) => t.key_range(),
Self::RelBlocks(t) => t.key_range(),
Self::SlruBlocks(t) => t.key_range(),
}
}
/// returns the number of images put into the `layer_writer`
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
match self {
Self::SingleKey(t) => t.doit(layer_writer, ctx).await,
Self::RelBlocks(t) => t.doit(layer_writer, ctx).await,
Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await,
}
}
}
impl From<ImportSingleKeyTask> for AnyImportTask {
fn from(t: ImportSingleKeyTask) -> Self {
Self::SingleKey(t)
}
}
impl From<ImportRelBlocksTask> for AnyImportTask {
fn from(t: ImportRelBlocksTask) -> Self {
Self::RelBlocks(t)
}
}
impl From<ImportSlruBlocksTask> for AnyImportTask {
fn from(t: ImportSlruBlocksTask) -> Self {
Self::SlruBlocks(t)
}
}
struct ChunkProcessingJob {
timeline: Arc<Timeline>,
range: Range<Key>,
tasks: Vec<AnyImportTask>,
pgdata_lsn: Lsn,
}
impl ChunkProcessingJob {
fn new(range: Range<Key>, tasks: Vec<AnyImportTask>, env: &Flow) -> Self {
assert!(env.pgdata_lsn.is_valid());
Self {
timeline: env.timeline.clone(),
range,
tasks,
pgdata_lsn: env.pgdata_lsn,
}
}
async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> {
let mut writer = ImageLayerWriter::new(
self.timeline.conf,
self.timeline.timeline_id,
self.timeline.tenant_shard_id,
&self.range,
self.pgdata_lsn,
ctx,
)
.await?;
let mut nimages = 0;
for task in self.tasks {
nimages += task.doit(&mut writer, ctx).await?;
}
let resident_layer = if nimages > 0 {
let (desc, path) = writer.finish(ctx).await?;
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?
} else {
// dropping the writer cleans up
return Ok(());
};
// this is sharing the same code as create_image_layers
let mut guard = self.timeline.layers.write().await;
guard
.open_mut()?
.track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics);
crate::tenant::timeline::drop_wlock(guard);
// Schedule the layer for upload but don't add barriers such as
// wait for completion or index upload, so we don't inhibit upload parallelism.
// TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?)
// TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level.
self.timeline
.remote_client
.schedule_layer_file_upload(resident_layer)?;
Ok(())
}
}

View File

@@ -0,0 +1,315 @@
use std::{ops::Bound, sync::Arc};
use anyhow::Context;
use bytes::Bytes;
use postgres_ffi::ControlFileData;
use remote_storage::{
Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath,
};
use serde::de::DeserializeOwned;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, instrument};
use utils::lsn::Lsn;
use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf};
use super::{importbucket_format, index_part_format};
pub async fn new(
conf: &'static PageServerConf,
location: &index_part_format::Location,
cancel: CancellationToken,
) -> Result<RemoteStorageWrapper, anyhow::Error> {
// FIXME: we probably want some timeout, and we might be able to assume the max file
// size on S3 is 1GiB (postgres segment size). But the problem is that the individual
// downloaders don't know enough about concurrent downloads to make a guess on the
// expected bandwidth and resulting best timeout.
let timeout = std::time::Duration::from_secs(24 * 60 * 60);
let location_storage = match location {
#[cfg(feature = "testing")]
index_part_format::Location::LocalFs { path } => {
GenericRemoteStorage::LocalFs(remote_storage::LocalFs::new(path.clone(), timeout)?)
}
index_part_format::Location::AwsS3 {
region,
bucket,
key,
} => {
// TODO: think about security implications of letting the client specify the bucket & prefix.
// It's the most flexible right now, but, possibly we want to move bucket name into PS conf
// and force the timeline_id into the prefix?
GenericRemoteStorage::AwsS3(Arc::new(
remote_storage::S3Bucket::new(
&remote_storage::S3Config {
bucket_name: bucket.clone(),
prefix_in_bucket: Some(key.clone()),
bucket_region: region.clone(),
endpoint: conf
.import_pgdata_aws_endpoint_url
.clone()
.map(|url| url.to_string()), // by specifying None here, remote_storage/aws-sdk-rust will infer from env
concurrency_limit: 100.try_into().unwrap(), // TODO: think about this
max_keys_per_list_response: Some(1000), // TODO: think about this
upload_storage_class: None, // irrelevant
},
timeout,
)
.await
.context("setup s3 bucket")?,
))
}
};
let storage_wrapper = RemoteStorageWrapper::new(location_storage, cancel);
Ok(storage_wrapper)
}
/// Wrap [`remote_storage`] APIs to make it look a bit more like a filesystem API
/// such as [`tokio::fs`], which was used in the original implementation of the import code.
#[derive(Clone)]
pub struct RemoteStorageWrapper {
storage: GenericRemoteStorage,
cancel: CancellationToken,
}
impl RemoteStorageWrapper {
pub fn new(storage: GenericRemoteStorage, cancel: CancellationToken) -> Self {
Self { storage, cancel }
}
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
pub async fn listfilesindir(
&self,
path: &RemotePath,
) -> Result<Vec<(RemotePath, usize)>, DownloadError> {
assert!(
path.object_name().is_some(),
"must specify dirname, without trailing slash"
);
let path = path.add_trailing_slash();
let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
|| async {
let Listing { keys, prefixes: _ } = self
.storage
.list(
Some(&path),
remote_storage::ListingMode::WithDelimiter,
None,
&self.cancel,
)
.await?;
let res = keys
.into_iter()
.map(|ListingObject { key, size, .. }| (key, size.into_usize()))
.collect();
Ok(res)
},
&format!("listfilesindir {path:?}"),
&self.cancel,
)
.await;
debug!(?res, "returning");
res
}
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
pub async fn listdir(&self, path: &RemotePath) -> Result<Vec<RemotePath>, DownloadError> {
assert!(
path.object_name().is_some(),
"must specify dirname, without trailing slash"
);
let path = path.add_trailing_slash();
let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
|| async {
let Listing { keys, prefixes } = self
.storage
.list(
Some(&path),
remote_storage::ListingMode::WithDelimiter,
None,
&self.cancel,
)
.await?;
let res = keys
.into_iter()
.map(|ListingObject { key, .. }| key)
.chain(prefixes.into_iter())
.collect();
Ok(res)
},
&format!("listdir {path:?}"),
&self.cancel,
)
.await;
debug!(?res, "returning");
res
}
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
pub async fn get(&self, path: &RemotePath) -> Result<Bytes, DownloadError> {
let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
|| async {
let Download {
download_stream, ..
} = self
.storage
.download(path, &DownloadOpts::default(), &self.cancel)
.await?;
let mut reader = tokio_util::io::StreamReader::new(download_stream);
// XXX optimize this, can we get the capacity hint from somewhere?
let mut buf = Vec::new();
tokio::io::copy_buf(&mut reader, &mut buf).await?;
Ok(Bytes::from(buf))
},
&format!("download {path:?}"),
&self.cancel,
)
.await;
debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done");
res
}
pub async fn get_spec(&self) -> Result<Option<importbucket_format::Spec>, anyhow::Error> {
self.get_json(&RemotePath::from_string("spec.json").unwrap())
.await
.context("get spec")
}
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
pub async fn get_json<T: DeserializeOwned>(
&self,
path: &RemotePath,
) -> Result<Option<T>, DownloadError> {
let buf = match self.get(path).await {
Ok(buf) => buf,
Err(DownloadError::NotFound) => return Ok(None),
Err(err) => return Err(err),
};
let res = serde_json::from_slice(&buf)
.context("serialize")
// TODO: own error type
.map_err(DownloadError::Other)?;
Ok(Some(res))
}
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
pub async fn put_json<T>(&self, path: &RemotePath, value: &T) -> anyhow::Result<()>
where
T: serde::Serialize,
{
let buf = serde_json::to_vec(value)?;
let bytes = Bytes::from(buf);
utils::backoff::retry(
|| async {
let size = bytes.len();
let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
self.storage
.upload_storage_object(bytes, size, path, &self.cancel)
.await
},
remote_storage::TimeoutOrCancel::caused_by_cancel,
1,
u32::MAX,
&format!("put json {path}"),
&self.cancel,
)
.await
.expect("practically infinite retries")
}
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
pub async fn get_range(
&self,
path: &RemotePath,
start_inclusive: u64,
end_exclusive: u64,
) -> Result<Vec<u8>, DownloadError> {
let len = end_exclusive
.checked_sub(start_inclusive)
.unwrap()
.into_usize();
let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
|| async {
let Download {
download_stream, ..
} = self
.storage
.download(
path,
&DownloadOpts {
etag: None,
byte_start: Bound::Included(start_inclusive),
byte_end: Bound::Excluded(end_exclusive)
},
&self.cancel)
.await?;
let mut reader = tokio_util::io::StreamReader::new(download_stream);
let mut buf = Vec::with_capacity(len);
tokio::io::copy_buf(&mut reader, &mut buf).await?;
Ok(buf)
},
&format!("download range len=0x{len:x} [0x{start_inclusive:x},0x{end_exclusive:x}) from {path:?}"),
&self.cancel,
)
.await;
debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done");
res
}
pub fn pgdata(&self) -> RemotePath {
RemotePath::from_string("pgdata").unwrap()
}
pub async fn get_control_file(&self) -> Result<ControlFile, anyhow::Error> {
let control_file_path = self.pgdata().join("global/pg_control");
info!("get control file from {control_file_path}");
let control_file_buf = self.get(&control_file_path).await?;
ControlFile::new(control_file_buf)
}
}
pub struct ControlFile {
control_file_data: ControlFileData,
control_file_buf: Bytes,
}
impl ControlFile {
pub(crate) fn new(control_file_buf: Bytes) -> Result<Self, anyhow::Error> {
// XXX ControlFileData is version-specific, we're always using v14 here. v17 had changes.
let control_file_data = ControlFileData::decode(&control_file_buf)?;
let control_file = ControlFile {
control_file_data,
control_file_buf,
};
control_file.try_pg_version()?; // so that we can offer infallible pg_version()
Ok(control_file)
}
pub(crate) fn base_lsn(&self) -> Lsn {
Lsn(self.control_file_data.checkPoint).align()
}
pub(crate) fn pg_version(&self) -> u32 {
self.try_pg_version()
.expect("prepare() checks that try_pg_version doesn't error")
}
pub(crate) fn control_file_data(&self) -> &ControlFileData {
&self.control_file_data
}
pub(crate) fn control_file_buf(&self) -> &Bytes {
&self.control_file_buf
}
fn try_pg_version(&self) -> anyhow::Result<u32> {
Ok(match self.control_file_data.catalog_version_no {
// thesea are from catversion.h
202107181 => 14,
202209061 => 15,
202307071 => 16,
/* XXX pg17 */
catversion => {
anyhow::bail!("unrecognized catalog version {catversion}")
}
})
}
}

View File

@@ -0,0 +1,20 @@
use serde::{Deserialize, Serialize};
#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
pub struct PgdataStatus {
pub done: bool,
// TODO: remaining fields
}
#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
pub struct ShardStatus {
pub done: bool,
// TODO: remaining fields
}
// TODO: dedupe with fast_import code
#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
pub struct Spec {
pub project_id: String,
pub branch_id: String,
}

View File

@@ -0,0 +1,68 @@
use serde::{Deserialize, Serialize};
#[cfg(feature = "testing")]
use camino::Utf8PathBuf;
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub enum Root {
V1(V1),
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub enum V1 {
InProgress(InProgress),
Done(Done),
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
#[serde(transparent)]
pub struct IdempotencyKey(String);
impl IdempotencyKey {
pub fn new(s: String) -> Self {
Self(s)
}
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct InProgress {
pub idempotency_key: IdempotencyKey,
pub location: Location,
pub started_at: chrono::NaiveDateTime,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct Done {
pub idempotency_key: IdempotencyKey,
pub started_at: chrono::NaiveDateTime,
pub finished_at: chrono::NaiveDateTime,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub enum Location {
#[cfg(feature = "testing")]
LocalFs { path: Utf8PathBuf },
AwsS3 {
region: String,
bucket: String,
key: String,
},
}
impl Root {
pub fn is_done(&self) -> bool {
match self {
Root::V1(v1) => match v1 {
V1::Done(_) => true,
V1::InProgress(_) => false,
},
}
}
pub fn idempotency_key(&self) -> &IdempotencyKey {
match self {
Root::V1(v1) => match v1 {
V1::InProgress(in_progress) => &in_progress.idempotency_key,
V1::Done(done) => &done.idempotency_key,
},
}
}
}

View File

@@ -0,0 +1,119 @@
//! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate.
use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt};
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use tracing::error;
use crate::config::PageServerConf;
use reqwest::Method;
use super::importbucket_format::Spec;
pub struct Client {
base_url: String,
authorization_header: Option<String>,
client: reqwest::Client,
cancel: CancellationToken,
}
pub type Result<T> = std::result::Result<T, Error>;
#[derive(Serialize, Deserialize, Debug)]
struct ImportProgressRequest {
// no fields yet, not sure if there every will be any
}
#[derive(Serialize, Deserialize, Debug)]
struct ImportProgressResponse {
// we don't care
}
impl Client {
pub fn new(conf: &PageServerConf, cancel: CancellationToken) -> anyhow::Result<Self> {
let Some(ref base_url) = conf.import_pgdata_upcall_api else {
anyhow::bail!("import_pgdata_upcall_api is not configured")
};
Ok(Self {
base_url: base_url.to_string(),
client: reqwest::Client::new(),
cancel,
authorization_header: conf
.import_pgdata_upcall_api_token
.as_ref()
.map(|secret_string| secret_string.get_contents())
.map(|jwt| format!("Bearer {jwt}")),
})
}
fn start_request<U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
) -> reqwest::RequestBuilder {
let req = self.client.request(method, uri);
if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
}
}
async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
body: B,
) -> Result<reqwest::Response> {
self.start_request(method, uri)
.json(&body)
.send()
.await
.map_err(Error::ReceiveBody)
}
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
body: B,
) -> Result<reqwest::Response> {
let res = self.request_noerror(method, uri, body).await?;
let response = res.error_from_body().await?;
Ok(response)
}
pub async fn send_progress_once(&self, spec: &Spec) -> Result<()> {
let url = format!(
"{}/projects/{}/branches/{}/import_progress",
self.base_url, spec.project_id, spec.branch_id
);
let ImportProgressResponse {} = self
.request(Method::POST, url, &ImportProgressRequest {})
.await?
.json()
.await
.map_err(Error::ReceiveBody)?;
Ok(())
}
pub async fn send_progress_until_success(&self, spec: &Spec) -> anyhow::Result<()> {
loop {
match self.send_progress_once(spec).await {
Ok(()) => return Ok(()),
Err(Error::Cancelled) => return Err(anyhow::anyhow!("cancelled")),
Err(err) => {
error!(?err, "error sending progress, retrying");
if tokio::time::timeout(
std::time::Duration::from_secs(10),
self.cancel.cancelled(),
)
.await
.is_ok()
{
anyhow::bail!("cancelled while sending early progress update");
}
}
}
}
}
}

View File

@@ -58,7 +58,7 @@ pub(crate) async fn offload_timeline(
}
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Flush).await;
timeline.shutdown(super::ShutdownMode::Reload).await;
// TODO extend guard mechanism above with method
// to make deletions possible while offloading is in progress

View File

@@ -3,7 +3,7 @@ use std::{collections::hash_map::Entry, fs, sync::Arc};
use anyhow::Context;
use camino::Utf8PathBuf;
use tracing::{error, info, info_span};
use utils::{fs_ext, id::TimelineId, lsn::Lsn};
use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard};
use crate::{
context::RequestContext,
@@ -23,14 +23,14 @@ use super::Timeline;
pub struct UninitializedTimeline<'t> {
pub(crate) owning_tenant: &'t Tenant,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
}
impl<'t> UninitializedTimeline<'t> {
pub(crate) fn new(
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
) -> Self {
Self {
owning_tenant,
@@ -87,6 +87,10 @@ impl<'t> UninitializedTimeline<'t> {
}
}
pub(crate) fn finish_creation_myself(&mut self) -> (Arc<Timeline>, TimelineCreateGuard) {
self.raw_timeline.take().expect("already checked")
}
/// Prepares timeline data by loading it from the basebackup archive.
pub(crate) async fn import_basebackup_from_tar(
self,
@@ -167,9 +171,10 @@ pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
/// A guard for timeline creations in process: as long as this object exists, the timeline ID
/// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline.
#[must_use]
pub(crate) struct TimelineCreateGuard<'t> {
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
pub(crate) struct TimelineCreateGuard {
pub(crate) _tenant_gate_guard: GateGuard,
pub(crate) owning_tenant: Arc<Tenant>,
pub(crate) timeline_id: TimelineId,
pub(crate) timeline_path: Utf8PathBuf,
pub(crate) idempotency: CreateTimelineIdempotency,
}
@@ -184,20 +189,27 @@ pub(crate) enum TimelineExclusionError {
},
#[error("Already creating")]
AlreadyCreating,
#[error("Shutting down")]
ShuttingDown,
// e.g. I/O errors, or some failure deep in postgres initdb
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl<'t> TimelineCreateGuard<'t> {
impl TimelineCreateGuard {
pub(crate) fn new(
owning_tenant: &'t Tenant,
owning_tenant: &Arc<Tenant>,
timeline_id: TimelineId,
timeline_path: Utf8PathBuf,
idempotency: CreateTimelineIdempotency,
allow_offloaded: bool,
) -> Result<Self, TimelineExclusionError> {
let _tenant_gate_guard = owning_tenant
.gate
.enter()
.map_err(|_| TimelineExclusionError::ShuttingDown)?;
// Lock order: this is the only place we take both locks. During drop() we only
// lock creating_timelines
let timelines = owning_tenant.timelines.lock().unwrap();
@@ -225,8 +237,12 @@ impl<'t> TimelineCreateGuard<'t> {
return Err(TimelineExclusionError::AlreadyCreating);
}
creating_timelines.insert(timeline_id);
drop(creating_timelines);
drop(timelines_offloaded);
drop(timelines);
Ok(Self {
owning_tenant,
_tenant_gate_guard,
owning_tenant: Arc::clone(owning_tenant),
timeline_id,
timeline_path,
idempotency,
@@ -234,7 +250,7 @@ impl<'t> TimelineCreateGuard<'t> {
}
}
impl Drop for TimelineCreateGuard<'_> {
impl Drop for TimelineCreateGuard {
fn drop(&mut self) {
self.owning_tenant
.timelines_creating

View File

@@ -38,6 +38,7 @@ use storage_broker::BrokerClientChannel;
use tokio::sync::watch;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::postgres_client::PostgresClientProtocol;
use self::connection_manager::ConnectionManagerStatus;
@@ -45,6 +46,7 @@ use super::Timeline;
#[derive(Clone)]
pub struct WalReceiverConf {
pub protocol: PostgresClientProtocol,
/// The timeout on the connection to safekeeper for WAL streaming.
pub wal_connect_timeout: Duration,
/// The timeout to use to determine when the current connection is "stale" and reconnect to the other one.

View File

@@ -36,7 +36,9 @@ use postgres_connection::PgConnectionConfig;
use utils::backoff::{
exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use utils::postgres_client::wal_stream_connection_config;
use utils::postgres_client::{
wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol,
};
use utils::{
id::{NodeId, TenantTimelineId},
lsn::Lsn,
@@ -533,6 +535,7 @@ impl ConnectionManagerState {
let node_id = new_sk.safekeeper_id;
let connect_timeout = self.conf.wal_connect_timeout;
let ingest_batch_size = self.conf.ingest_batch_size;
let protocol = self.conf.protocol;
let timeline = Arc::clone(&self.timeline);
let ctx = ctx.detached_child(
TaskKind::WalReceiverConnectionHandler,
@@ -546,6 +549,7 @@ impl ConnectionManagerState {
let res = super::walreceiver_connection::handle_walreceiver_connection(
timeline,
protocol,
new_sk.wal_source_connconf,
events_sender,
cancellation.clone(),
@@ -984,15 +988,33 @@ impl ConnectionManagerState {
if info.safekeeper_connstr.is_empty() {
return None; // no connection string, ignore sk
}
match wal_stream_connection_config(
self.id,
info.safekeeper_connstr.as_ref(),
match &self.conf.auth_token {
None => None,
Some(x) => Some(x),
let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol {
PostgresClientProtocol::Vanilla => {
(None, None, None)
},
self.conf.availability_zone.as_deref(),
) {
PostgresClientProtocol::Interpreted { .. } => {
let shard_identity = self.timeline.get_shard_identity();
(
Some(shard_identity.number.0),
Some(shard_identity.count.0),
Some(shard_identity.stripe_size.0),
)
}
};
let connection_conf_args = ConnectionConfigArgs {
protocol: self.conf.protocol,
ttid: self.id,
shard_number,
shard_count,
shard_stripe_size,
listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
availability_zone: self.conf.availability_zone.as_deref()
};
match wal_stream_connection_config(connection_conf_args) {
Ok(connstr) => Some((*sk_id, info, connstr)),
Err(e) => {
error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id);
@@ -1096,6 +1118,7 @@ impl ReconnectReason {
mod tests {
use super::*;
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
use url::Host;
fn dummy_broker_sk_timeline(
@@ -1532,6 +1555,7 @@ mod tests {
timeline,
cancel: CancellationToken::new(),
conf: WalReceiverConf {
protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
wal_connect_timeout: Duration::from_secs(1),
lagging_wal_timeout: Duration::from_secs(1),
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),

View File

@@ -22,7 +22,10 @@ use tokio::{select, sync::watch, time};
use tokio_postgres::{replication::ReplicationStream, Client};
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, trace, warn, Instrument};
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord};
use wal_decoder::{
models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords},
wire_format::FromWireFormat,
};
use super::TaskStateUpdate;
use crate::{
@@ -36,7 +39,7 @@ use crate::{
use postgres_backend::is_expected_io_error;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::waldecoder::WalStreamDecoder;
use utils::{id::NodeId, lsn::Lsn};
use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
/// Status of the connection.
@@ -109,6 +112,7 @@ impl From<WalDecodeError> for WalReceiverError {
#[allow(clippy::too_many_arguments)]
pub(super) async fn handle_walreceiver_connection(
timeline: Arc<Timeline>,
protocol: PostgresClientProtocol,
wal_source_connconf: PgConnectionConfig,
events_sender: watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
cancellation: CancellationToken,
@@ -260,6 +264,14 @@ pub(super) async fn handle_walreceiver_connection(
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
let interpreted_proto_config = match protocol {
PostgresClientProtocol::Vanilla => None,
PostgresClientProtocol::Interpreted {
format,
compression,
} => Some((format, compression)),
};
while let Some(replication_message) = {
select! {
_ = cancellation.cancelled() => {
@@ -291,6 +303,15 @@ pub(super) async fn handle_walreceiver_connection(
connection_status.latest_connection_update = now;
connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
}
ReplicationMessage::RawInterpretedWalRecords(raw) => {
connection_status.latest_connection_update = now;
if !raw.data().is_empty() {
connection_status.latest_wal_update = now;
}
connection_status.commit_lsn = Some(Lsn::from(raw.commit_lsn()));
connection_status.streaming_lsn = Some(Lsn::from(raw.streaming_lsn()));
}
&_ => {}
};
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
@@ -298,7 +319,144 @@ pub(super) async fn handle_walreceiver_connection(
return Ok(());
}
async fn commit(
modification: &mut DatadirModification<'_>,
uncommitted: &mut u64,
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
modification.commit(ctx).await?;
*uncommitted = 0;
*filtered = 0;
Ok(())
}
let status_update = match replication_message {
ReplicationMessage::RawInterpretedWalRecords(raw) => {
WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);
let mut uncommitted_records = 0;
let mut filtered_records = 0;
// This is the end LSN of the raw WAL from which the records
// were interpreted.
let streaming_lsn = Lsn::from(raw.streaming_lsn());
let (format, compression) = interpreted_proto_config.unwrap();
let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression)
.await
.with_context(|| {
anyhow::anyhow!(
"Failed to deserialize interpreted records ending at LSN {streaming_lsn}"
)
})?;
let InterpretedWalRecords {
records,
next_record_lsn,
} = batch;
tracing::debug!(
"Received WAL up to {} with next_record_lsn={:?}",
streaming_lsn,
next_record_lsn
);
// We start the modification at 0 because each interpreted record
// advances it to its end LSN. 0 is just an initialization placeholder.
let mut modification = timeline.begin_modification(Lsn(0));
for interpreted in records {
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
{
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
let local_next_record_lsn = interpreted.next_record_lsn;
let ingested = walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await
.with_context(|| {
format!("could not ingest record at {local_next_record_lsn}")
})?;
if !ingested {
tracing::debug!(
"ingest: filtered out record @ LSN {local_next_record_lsn}"
);
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}
uncommitted_records += 1;
// FIXME: this cannot be made pausable_failpoint without fixing the
// failpoint library; in tests, the added amount of debugging will cause us
// to timeout the tests.
fail_point!("walreceiver-after-ingest");
// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.
if uncommitted_records >= ingest_batch_size
|| modification.approx_pending_bytes()
> DatadirModification::MAX_PENDING_BYTES
{
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
}
// Records might have been filtered out on the safekeeper side, but we still
// need to advance last record LSN on all shards. If we've not ingested the latest
// record, then set the LSN of the modification past it. This way all shards
// advance their last record LSN at the same time.
let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) {
Some(lsn) if lsn > modification.get_lsn() => {
modification.set_lsn(lsn).unwrap();
true
}
_ => false,
};
if uncommitted_records > 0 || needs_last_record_lsn_advance {
// Commit any uncommitted records
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
if !caught_up && streaming_lsn >= end_of_wal {
info!("caught up at LSN {streaming_lsn}");
caught_up = true;
}
tracing::debug!(
"Ingested WAL up to {streaming_lsn}. Last record LSN is {}",
timeline.get_last_record_lsn()
);
Some(streaming_lsn)
}
ReplicationMessage::XLogData(xlog_data) => {
// Pass the WAL data to the decoder, and see if we can decode
// more records as a result.
@@ -316,21 +474,6 @@ pub(super) async fn handle_walreceiver_connection(
let mut uncommitted_records = 0;
let mut filtered_records = 0;
async fn commit(
modification: &mut DatadirModification<'_>,
uncommitted: &mut u64,
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
modification.commit(ctx).await?;
*uncommitted = 0;
*filtered = 0;
Ok(())
}
while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? {
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are

View File

@@ -3,6 +3,7 @@ use super::storage_layer::ResidentLayer;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use std::collections::HashSet;
use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;
@@ -14,7 +15,6 @@ use utils::lsn::AtomicLsn;
use std::sync::atomic::AtomicU32;
use utils::lsn::Lsn;
#[cfg(feature = "testing")]
use utils::generation::Generation;
// clippy warns that Uninitialized is much smaller than Initialized, which wastes
@@ -38,6 +38,12 @@ impl UploadQueue {
}
}
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
pub(crate) enum OpType {
MayReorder,
FlushDeletion,
}
/// This keeps track of queued and in-progress tasks.
pub(crate) struct UploadQueueInitialized {
/// Counter to assign task IDs
@@ -88,6 +94,9 @@ pub(crate) struct UploadQueueInitialized {
#[cfg(feature = "testing")]
pub(crate) dangling_files: HashMap<LayerName, Generation>,
/// Ensure we order file operations correctly.
pub(crate) recently_deleted: HashSet<(LayerName, Generation)>,
/// Deletions that are blocked by the tenant configuration
pub(crate) blocked_deletions: Vec<Delete>,
@@ -183,6 +192,7 @@ impl UploadQueue {
queued_operations: VecDeque::new(),
#[cfg(feature = "testing")]
dangling_files: HashMap::new(),
recently_deleted: HashSet::new(),
blocked_deletions: Vec::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
@@ -224,6 +234,7 @@ impl UploadQueue {
queued_operations: VecDeque::new(),
#[cfg(feature = "testing")]
dangling_files: HashMap::new(),
recently_deleted: HashSet::new(),
blocked_deletions: Vec::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
@@ -282,8 +293,8 @@ pub(crate) struct Delete {
#[derive(Debug)]
pub(crate) enum UploadOp {
/// Upload a layer file
UploadLayer(ResidentLayer, LayerFileMetadata),
/// Upload a layer file. The last field indicates the last operation for thie file.
UploadLayer(ResidentLayer, LayerFileMetadata, Option<OpType>),
/// Upload a index_part.json file
UploadMetadata {
@@ -305,11 +316,11 @@ pub(crate) enum UploadOp {
impl std::fmt::Display for UploadOp {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
UploadOp::UploadLayer(layer, metadata) => {
UploadOp::UploadLayer(layer, metadata, mode) => {
write!(
f,
"UploadLayer({}, size={:?}, gen={:?})",
layer, metadata.file_size, metadata.generation
"UploadLayer({}, size={:?}, gen={:?}, mode={:?})",
layer, metadata.file_size, metadata.generation, mode
)
}
UploadOp::UploadMetadata { uploaded, .. } => {

View File

@@ -19,7 +19,7 @@ impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign<A>> {
}
}
impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
impl<const N: usize, A: Alignment> Deref for AlignedSlice<'_, N, A> {
type Target = [u8; N];
fn deref(&self) -> &Self::Target {
@@ -27,13 +27,13 @@ impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
}
}
impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> {
impl<const N: usize, A: Alignment> DerefMut for AlignedSlice<'_, N, A> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.buf
}
}
impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> {
impl<const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'_, N, A> {
fn as_ref(&self) -> &[u8; N] {
self.buf
}

View File

@@ -334,14 +334,32 @@ impl WalIngest {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = get_relsize(modification, vm_rel, ctx).await?;
//
// TODO: analyze the metrics and tighten this up accordingly. This logic
// implicitly assumes that VM pages see explicit WAL writes before
// implicit ClearVmBits, and will otherwise silently drop updates.
let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["relation"])
.inc();
return Ok(());
};
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["new_page"])
.inc();
new_vm_blk = None;
}
}
if let Some(blknum) = old_vm_blk {
if blknum >= vm_size {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["old_page"])
.inc();
old_vm_blk = None;
}
}
@@ -572,7 +590,8 @@ impl WalIngest {
modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
fsm_physical_page_no += 1;
}
let nblocks = get_relsize(modification, rel, ctx).await?;
// TODO: re-examine the None case here wrt. sharding; should we error?
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > fsm_physical_page_no {
// check if something to do: FSM is larger than truncate position
self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
@@ -612,7 +631,8 @@ impl WalIngest {
)?;
vm_page_no += 1;
}
let nblocks = get_relsize(modification, rel, ctx).await?;
// TODO: re-examine the None case here wrt. sharding; should we error?
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > vm_page_no {
// check if something to do: VM is larger than truncate position
self.put_rel_truncation(modification, rel, vm_page_no, ctx)
@@ -1430,24 +1450,27 @@ impl WalIngest {
}
}
/// Returns the size of the relation as of this modification, or None if the relation doesn't exist.
///
/// This is only accurate on shard 0. On other shards, it will return the size up to the highest
/// page number stored in the shard, or None if the shard does not have any pages for it.
async fn get_relsize(
modification: &DatadirModification<'_>,
rel: RelTag,
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
let nblocks = if !modification
) -> Result<Option<BlockNumber>, PageReconstructError> {
if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), ctx)
.await?
{
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.await?
};
Ok(nblocks)
return Ok(None);
}
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.await
.map(Some)
}
#[allow(clippy::bool_assert_comparison)]

View File

@@ -20,7 +20,7 @@
#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
static int logical_replication_max_snap_files = 300;
static int logical_replication_max_snap_files = 10000;
/*
* According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of
@@ -184,7 +184,7 @@ InitLogicalReplicationMonitor(void)
"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
NULL,
&logical_replication_max_snap_files,
300, -1, INT_MAX,
10000, -1, INT_MAX,
PGC_SIGHUP,
0,
NULL, NULL, NULL);

159
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -114,7 +114,6 @@ files = [
[package.dependencies]
aiohappyeyeballs = ">=2.3.0"
aiosignal = ">=1.1.2"
async-timeout = {version = ">=4.0,<6.0", markers = "python_version < \"3.11\""}
attrs = ">=17.3.0"
frozenlist = ">=1.1.1"
multidict = ">=4.5,<7.0"
@@ -219,10 +218,8 @@ files = [
]
[package.dependencies]
exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
idna = ">=2.8"
sniffio = ">=1.1"
typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
[package.extras]
doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
@@ -737,10 +734,7 @@ files = [
[package.dependencies]
jmespath = ">=0.7.1,<2.0.0"
python-dateutil = ">=2.1,<3.0.0"
urllib3 = [
{version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""},
{version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""},
]
urllib3 = {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}
[package.extras]
crt = ["awscrt (==0.19.19)"]
@@ -1069,20 +1063,6 @@ docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"]
ssh = ["paramiko (>=2.4.3)"]
websockets = ["websocket-client (>=1.3.0)"]
[[package]]
name = "exceptiongroup"
version = "1.1.1"
description = "Backport of PEP 654 (exception groups)"
optional = false
python-versions = ">=3.7"
files = [
{file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"},
{file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"},
]
[package.extras]
test = ["pytest (>=6)"]
[[package]]
name = "execnet"
version = "1.9.0"
@@ -1110,7 +1090,6 @@ files = [
[package.dependencies]
click = ">=8.0"
importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""}
itsdangerous = ">=2.0"
Jinja2 = ">=3.0"
Werkzeug = ">=2.2.2"
@@ -1319,25 +1298,6 @@ files = [
{file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
]
[[package]]
name = "importlib-metadata"
version = "4.12.0"
description = "Read metadata from Python packages"
optional = false
python-versions = ">=3.7"
files = [
{file = "importlib_metadata-4.12.0-py3-none-any.whl", hash = "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23"},
{file = "importlib_metadata-4.12.0.tar.gz", hash = "sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670"},
]
[package.dependencies]
zipp = ">=0.5"
[package.extras]
docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"]
perf = ["ipython"]
testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"]
[[package]]
name = "iniconfig"
version = "1.1.1"
@@ -1898,48 +1858,54 @@ files = [
[[package]]
name = "mypy"
version = "1.3.0"
version = "1.13.0"
description = "Optional static typing for Python"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
files = [
{file = "mypy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1eb485cea53f4f5284e5baf92902cd0088b24984f4209e25981cc359d64448d"},
{file = "mypy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c99c3ecf223cf2952638da9cd82793d8f3c0c5fa8b6ae2b2d9ed1e1ff51ba85"},
{file = "mypy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:550a8b3a19bb6589679a7c3c31f64312e7ff482a816c96e0cecec9ad3a7564dd"},
{file = "mypy-1.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cbc07246253b9e3d7d74c9ff948cd0fd7a71afcc2b77c7f0a59c26e9395cb152"},
{file = "mypy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:a22435632710a4fcf8acf86cbd0d69f68ac389a3892cb23fbad176d1cddaf228"},
{file = "mypy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6e33bb8b2613614a33dff70565f4c803f889ebd2f859466e42b46e1df76018dd"},
{file = "mypy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7d23370d2a6b7a71dc65d1266f9a34e4cde9e8e21511322415db4b26f46f6b8c"},
{file = "mypy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:658fe7b674769a0770d4b26cb4d6f005e88a442fe82446f020be8e5f5efb2fae"},
{file = "mypy-1.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e42d29e324cdda61daaec2336c42512e59c7c375340bd202efa1fe0f7b8f8ca"},
{file = "mypy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:d0b6c62206e04061e27009481cb0ec966f7d6172b5b936f3ead3d74f29fe3dcf"},
{file = "mypy-1.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:76ec771e2342f1b558c36d49900dfe81d140361dd0d2df6cd71b3db1be155409"},
{file = "mypy-1.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc95f8386314272bbc817026f8ce8f4f0d2ef7ae44f947c4664efac9adec929"},
{file = "mypy-1.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:faff86aa10c1aa4a10e1a301de160f3d8fc8703b88c7e98de46b531ff1276a9a"},
{file = "mypy-1.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8c5979d0deb27e0f4479bee18ea0f83732a893e81b78e62e2dda3e7e518c92ee"},
{file = "mypy-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c5d2cc54175bab47011b09688b418db71403aefad07cbcd62d44010543fc143f"},
{file = "mypy-1.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:87df44954c31d86df96c8bd6e80dfcd773473e877ac6176a8e29898bfb3501cb"},
{file = "mypy-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473117e310febe632ddf10e745a355714e771ffe534f06db40702775056614c4"},
{file = "mypy-1.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:74bc9b6e0e79808bf8678d7678b2ae3736ea72d56eede3820bd3849823e7f305"},
{file = "mypy-1.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:44797d031a41516fcf5cbfa652265bb994e53e51994c1bd649ffcd0c3a7eccbf"},
{file = "mypy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ddae0f39ca146972ff6bb4399f3b2943884a774b8771ea0a8f50e971f5ea5ba8"},
{file = "mypy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1c4c42c60a8103ead4c1c060ac3cdd3ff01e18fddce6f1016e08939647a0e703"},
{file = "mypy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e86c2c6852f62f8f2b24cb7a613ebe8e0c7dc1402c61d36a609174f63e0ff017"},
{file = "mypy-1.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f9dca1e257d4cc129517779226753dbefb4f2266c4eaad610fc15c6a7e14283e"},
{file = "mypy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:95d8d31a7713510685b05fbb18d6ac287a56c8f6554d88c19e73f724a445448a"},
{file = "mypy-1.3.0-py3-none-any.whl", hash = "sha256:a8763e72d5d9574d45ce5881962bc8e9046bf7b375b0abf031f3e6811732a897"},
{file = "mypy-1.3.0.tar.gz", hash = "sha256:e1f4d16e296f5135624b34e8fb741eb0eadedca90862405b1f1fde2040b9bd11"},
{file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"},
{file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"},
{file = "mypy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b2353a44d2179846a096e25691d54d59904559f4232519d420d64da6828a3a7"},
{file = "mypy-1.13.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0730d1c6a2739d4511dc4253f8274cdd140c55c32dfb0a4cf8b7a43f40abfa6f"},
{file = "mypy-1.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5fc54dbb712ff5e5a0fca797e6e0aa25726c7e72c6a5850cfd2adbc1eb0a372"},
{file = "mypy-1.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:581665e6f3a8a9078f28d5502f4c334c0c8d802ef55ea0e7276a6e409bc0d82d"},
{file = "mypy-1.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3ddb5b9bf82e05cc9a627e84707b528e5c7caaa1c55c69e175abb15a761cec2d"},
{file = "mypy-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20c7ee0bc0d5a9595c46f38beb04201f2620065a93755704e141fcac9f59db2b"},
{file = "mypy-1.13.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3790ded76f0b34bc9c8ba4def8f919dd6a46db0f5a6610fb994fe8efdd447f73"},
{file = "mypy-1.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:51f869f4b6b538229c1d1bcc1dd7d119817206e2bc54e8e374b3dfa202defcca"},
{file = "mypy-1.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5c7051a3461ae84dfb5dd15eff5094640c61c5f22257c8b766794e6dd85e72d5"},
{file = "mypy-1.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:39bb21c69a5d6342f4ce526e4584bc5c197fd20a60d14a8624d8743fffb9472e"},
{file = "mypy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:164f28cb9d6367439031f4c81e84d3ccaa1e19232d9d05d37cb0bd880d3f93c2"},
{file = "mypy-1.13.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4c1bfcdbce96ff5d96fc9b08e3831acb30dc44ab02671eca5953eadad07d6d0"},
{file = "mypy-1.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0affb3a79a256b4183ba09811e3577c5163ed06685e4d4b46429a271ba174d2"},
{file = "mypy-1.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a7b44178c9760ce1a43f544e595d35ed61ac2c3de306599fa59b38a6048e1aa7"},
{file = "mypy-1.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5d5092efb8516d08440e36626f0153b5006d4088c1d663d88bf79625af3d1d62"},
{file = "mypy-1.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2904956dac40ced10931ac967ae63c5089bd498542194b436eb097a9f77bc8"},
{file = "mypy-1.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7bfd8836970d33c2105562650656b6846149374dc8ed77d98424b40b09340ba7"},
{file = "mypy-1.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:9f73dba9ec77acb86457a8fc04b5239822df0c14a082564737833d2963677dbc"},
{file = "mypy-1.13.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:100fac22ce82925f676a734af0db922ecfea991e1d7ec0ceb1e115ebe501301a"},
{file = "mypy-1.13.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7bcb0bb7f42a978bb323a7c88f1081d1b5dee77ca86f4100735a6f541299d8fb"},
{file = "mypy-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bde31fc887c213e223bbfc34328070996061b0833b0a4cfec53745ed61f3519b"},
{file = "mypy-1.13.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:07de989f89786f62b937851295ed62e51774722e5444a27cecca993fc3f9cd74"},
{file = "mypy-1.13.0-cp38-cp38-win_amd64.whl", hash = "sha256:4bde84334fbe19bad704b3f5b78c4abd35ff1026f8ba72b29de70dda0916beb6"},
{file = "mypy-1.13.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0246bcb1b5de7f08f2826451abd947bf656945209b140d16ed317f65a17dc7dc"},
{file = "mypy-1.13.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7f5b7deae912cf8b77e990b9280f170381fdfbddf61b4ef80927edd813163732"},
{file = "mypy-1.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7029881ec6ffb8bc233a4fa364736789582c738217b133f1b55967115288a2bc"},
{file = "mypy-1.13.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3e38b980e5681f28f033f3be86b099a247b13c491f14bb8b1e1e134d23bb599d"},
{file = "mypy-1.13.0-cp39-cp39-win_amd64.whl", hash = "sha256:a6789be98a2017c912ae6ccb77ea553bbaf13d27605d2ca20a76dfbced631b24"},
{file = "mypy-1.13.0-py3-none-any.whl", hash = "sha256:9c250883f9fd81d212e0952c92dbfcc96fc237f4b7c92f56ac81fd48460b3e5a"},
{file = "mypy-1.13.0.tar.gz", hash = "sha256:0291a61b6fbf3e6673e3405cfcc0e7650bebc7939659fdca2702958038bd835e"},
]
[package.dependencies]
mypy-extensions = ">=1.0.0"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typing-extensions = ">=3.10"
typing-extensions = ">=4.6.0"
[package.extras]
dmypy = ["psutil (>=4.0)"]
faster-cache = ["orjson"]
install-types = ["pip"]
python2 = ["typed-ast (>=1.4.0,<2)"]
mypyc = ["setuptools (>=50)"]
reports = ["lxml"]
[[package]]
@@ -2514,11 +2480,9 @@ files = [
[package.dependencies]
colorama = {version = "*", markers = "sys_platform == \"win32\""}
exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=0.12,<2.0"
tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
@@ -2581,10 +2545,7 @@ files = [
]
[package.dependencies]
pytest = [
{version = ">=5.0", markers = "python_version < \"3.10\""},
{version = ">=6.2.4", markers = "python_version >= \"3.10\""},
]
pytest = {version = ">=6.2.4", markers = "python_version >= \"3.10\""}
[[package]]
name = "pytest-repeat"
@@ -3092,17 +3053,6 @@ files = [
{file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
]
[[package]]
name = "tomli"
version = "2.0.1"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.7"
files = [
{file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
]
[[package]]
name = "types-jwcrypto"
version = "1.5.0.20240925"
@@ -3359,16 +3309,6 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -3523,21 +3463,6 @@ idna = ">=2.0"
multidict = ">=4.0"
propcache = ">=0.2.0"
[[package]]
name = "zipp"
version = "3.19.1"
description = "Backport of pathlib-compatible object wrapper for zip files"
optional = false
python-versions = ">=3.8"
files = [
{file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"},
{file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"},
]
[package.extras]
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
[[package]]
name = "zstandard"
version = "0.21.0"
@@ -3598,5 +3523,5 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "8cb9c38d83eec441391c0528ac2fbefde18c734373b2399e07c69382044e8ced"
python-versions = "^3.11"
content-hash = "21debe1116843e5d14bdf37d6e265c68c63a98a64ba04ec8b8a02af2e8d9f486"

View File

@@ -6,6 +6,7 @@ use tokio_postgres::config::SslMode;
use tracing::{info, info_span};
use super::ComputeCredentialKeys;
use crate::auth::IpPattern;
use crate::cache::Cached;
use crate::config::AuthenticationConfig;
use crate::context::RequestContext;
@@ -74,10 +75,10 @@ impl ConsoleRedirectBackend {
ctx: &RequestContext,
auth_config: &'static AuthenticationConfig,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<ConsoleRedirectNodeInfo> {
) -> auth::Result<(ConsoleRedirectNodeInfo, Option<Vec<IpPattern>>)> {
authenticate(ctx, auth_config, &self.console_uri, client)
.await
.map(ConsoleRedirectNodeInfo)
.map(|(node_info, ip_allowlist)| (ConsoleRedirectNodeInfo(node_info), ip_allowlist))
}
}
@@ -102,7 +103,7 @@ async fn authenticate(
auth_config: &'static AuthenticationConfig,
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> {
) -> auth::Result<(NodeInfo, Option<Vec<IpPattern>>)> {
ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect);
// registering waiter can fail if we get unlucky with rng.
@@ -176,9 +177,12 @@ async fn authenticate(
config.password(password.as_ref());
}
Ok(NodeInfo {
config,
aux: db_info.aux,
allow_self_signed_compute: false, // caller may override
})
Ok((
NodeInfo {
config,
aux: db_info.aux,
allow_self_signed_compute: false, // caller may override
},
db_info.allowed_ips,
))
}

View File

@@ -132,6 +132,93 @@ struct JwkSet<'a> {
keys: Vec<&'a RawValue>,
}
/// Given a jwks_url, fetch the JWKS and parse out all the signing JWKs.
/// Returns `None` and log a warning if there are any errors.
async fn fetch_jwks(
client: &reqwest_middleware::ClientWithMiddleware,
jwks_url: url::Url,
) -> Option<jose_jwk::JwkSet> {
let req = client.get(jwks_url.clone());
// TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
let resp = req.send().await.and_then(|r| {
r.error_for_status()
.map_err(reqwest_middleware::Error::Reqwest)
});
let resp = match resp {
Ok(r) => r,
// TODO: should we re-insert JWKs if we want to keep this JWKs URL?
// I expect these failures would be quite sparse.
Err(e) => {
tracing::warn!(url=?jwks_url, error=?e, "could not fetch JWKs");
return None;
}
};
let resp: http::Response<reqwest::Body> = resp.into();
let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE).await {
Ok(bytes) => bytes,
Err(e) => {
tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs");
return None;
}
};
let jwks = match serde_json::from_slice::<JwkSet>(&bytes) {
Ok(jwks) => jwks,
Err(e) => {
tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs");
return None;
}
};
// `jose_jwk::Jwk` is quite large (288 bytes). Let's not pre-allocate for what we don't need.
//
// Even though we limit our responses to 64KiB, we could still receive a payload like
// `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`. Parsing this as `RawValue` uses 468KiB.
// Pre-allocating the corresponding `Vec::<jose_jwk::Jwk>::with_capacity(30000)` uses 8.2MiB.
let mut keys = vec![];
let mut failed = 0;
for key in jwks.keys {
let key = match serde_json::from_str::<jose_jwk::Jwk>(key.get()) {
Ok(key) => key,
Err(e) => {
tracing::debug!(url=?jwks_url, failed=?e, "could not decode JWK");
failed += 1;
continue;
}
};
// if `use` (called `cls` in rust) is specified to be something other than signing,
// we can skip storing it.
if key
.prm
.cls
.as_ref()
.is_some_and(|c| *c != jose_jwk::Class::Signing)
{
continue;
}
keys.push(key);
}
keys.shrink_to_fit();
if failed > 0 {
tracing::warn!(url=?jwks_url, failed, "could not decode JWKs");
}
if keys.is_empty() {
tracing::warn!(url=?jwks_url, "no valid JWKs found inside the response body");
return None;
}
Some(jose_jwk::JwkSet { keys })
}
impl JwkCacheEntryLock {
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
JwkRenewalPermit::acquire_permit(self).await
@@ -166,87 +253,15 @@ impl JwkCacheEntryLock {
// TODO(conrad): run concurrently
// TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
for rule in rules {
let req = client.get(rule.jwks_url.clone());
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
// TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
match req.send().await.and_then(|r| {
r.error_for_status()
.map_err(reqwest_middleware::Error::Reqwest)
}) {
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
// I expect these failures would be quite sparse.
Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
Ok(r) => {
let resp: http::Response<reqwest::Body> = r.into();
let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE)
.await
{
Ok(bytes) => bytes,
Err(e) => {
tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
continue;
}
};
match serde_json::from_slice::<JwkSet>(&bytes) {
Err(e) => {
tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
}
Ok(jwks) => {
// size_of::<&RawValue>() == 16
// size_of::<jose_jwk::Jwk>() == 288
// better to not pre-allocate this as it might be pretty large - especially if it has many
// keys we don't want or need.
// trivial 'attack': `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`
// this would consume 8MiB just like that!
let mut keys = vec![];
let mut failed = 0;
for key in jwks.keys {
match serde_json::from_str::<jose_jwk::Jwk>(key.get()) {
Ok(key) => {
// if `use` (called `cls` in rust) is specified to be something other than signing,
// we can skip storing it.
if key
.prm
.cls
.as_ref()
.is_some_and(|c| *c != jose_jwk::Class::Signing)
{
continue;
}
keys.push(key);
}
Err(e) => {
tracing::debug!(url=?rule.jwks_url, failed=?e, "could not decode JWK");
failed += 1;
}
}
}
keys.shrink_to_fit();
if failed > 0 {
tracing::warn!(url=?rule.jwks_url, failed, "could not decode JWKs");
}
if keys.is_empty() {
tracing::warn!(url=?rule.jwks_url, "no valid JWKs found inside the response body");
continue;
}
let jwks = jose_jwk::JwkSet { keys };
key_sets.insert(
rule.id,
KeySet {
jwks,
audience: rule.audience,
role_names: rule.role_names,
},
);
}
};
}
if let Some(jwks) = fetch_jwks(client, rule.jwks_url).await {
key_sets.insert(
rule.id,
KeySet {
jwks,
audience: rule.audience,
role_names: rule.role_names,
},
);
}
}

View File

@@ -6,7 +6,6 @@ pub mod local;
use std::net::IpAddr;
use std::sync::Arc;
use std::time::Duration;
pub use console_redirect::ConsoleRedirectBackend;
pub(crate) use console_redirect::ConsoleRedirectError;
@@ -30,7 +29,7 @@ use crate::intern::EndpointIdInt;
use crate::metrics::Metrics;
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::proxy::NeonOptions;
use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo};
use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter};
use crate::stream::Stream;
use crate::types::{EndpointCacheKey, EndpointId, RoleName};
use crate::{scram, stream};
@@ -192,21 +191,6 @@ impl MaskedIp {
// This can't be just per IP because that would limit some PaaS that share IP addresses
pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>;
impl RateBucketInfo {
/// All of these are per endpoint-maskedip pair.
/// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
///
/// First bucket: 1000mcpus total per endpoint-ip pair
/// * 4096000 requests per second with 1 hash rounds.
/// * 1000 requests per second with 4096 hash rounds.
/// * 6.8 requests per second with 600000 hash rounds.
pub const DEFAULT_AUTH_SET: [Self; 3] = [
Self::new(1000 * 4096, Duration::from_secs(1)),
Self::new(600 * 4096, Duration::from_secs(60)),
Self::new(300 * 4096, Duration::from_secs(600)),
];
}
impl AuthenticationConfig {
pub(crate) fn check_rate_limit(
&self,

View File

@@ -428,8 +428,9 @@ async fn main() -> anyhow::Result<()> {
)?))),
None => None,
};
let cancellation_handler = Arc::new(CancellationHandler::<
Option<Arc<tokio::sync::Mutex<RedisPublisherClient>>>,
Option<Arc<Mutex<RedisPublisherClient>>>,
>::new(
cancel_map.clone(),
redis_publisher,

View File

@@ -1,7 +1,8 @@
use std::net::SocketAddr;
use std::net::{IpAddr, SocketAddr};
use std::sync::Arc;
use dashmap::DashMap;
use ipnet::{IpNet, Ipv4Net, Ipv6Net};
use pq_proto::CancelKeyData;
use thiserror::Error;
use tokio::net::TcpStream;
@@ -10,8 +11,10 @@ use tokio_postgres::{CancelToken, NoTls};
use tracing::{debug, info};
use uuid::Uuid;
use crate::auth::{check_peer_addr_is_in_list, IpPattern};
use crate::error::ReportableError;
use crate::metrics::{CancellationRequest, CancellationSource, Metrics};
use crate::rate_limiter::LeakyBucketRateLimiter;
use crate::redis::cancellation_publisher::{
CancellationPublisher, CancellationPublisherMut, RedisPublisherClient,
};
@@ -20,6 +23,8 @@ pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>;
pub type CancellationHandlerMain = CancellationHandler<Option<Arc<Mutex<RedisPublisherClient>>>>;
pub(crate) type CancellationHandlerMainInternal = Option<Arc<Mutex<RedisPublisherClient>>>;
type IpSubnetKey = IpNet;
/// Enables serving `CancelRequest`s.
///
/// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances.
@@ -29,14 +34,23 @@ pub struct CancellationHandler<P> {
/// This field used for the monitoring purposes.
/// Represents the source of the cancellation request.
from: CancellationSource,
// rate limiter of cancellation requests
limiter: Arc<std::sync::Mutex<LeakyBucketRateLimiter<IpSubnetKey>>>,
}
#[derive(Debug, Error)]
pub(crate) enum CancelError {
#[error("{0}")]
IO(#[from] std::io::Error),
#[error("{0}")]
Postgres(#[from] tokio_postgres::Error),
#[error("rate limit exceeded")]
RateLimit,
#[error("IP is not allowed")]
IpNotAllowed,
}
impl ReportableError for CancelError {
@@ -47,6 +61,8 @@ impl ReportableError for CancelError {
crate::error::ErrorKind::Postgres
}
CancelError::Postgres(_) => crate::error::ErrorKind::Compute,
CancelError::RateLimit => crate::error::ErrorKind::RateLimit,
CancelError::IpNotAllowed => crate::error::ErrorKind::User,
}
}
}
@@ -79,13 +95,36 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
cancellation_handler: self,
}
}
/// Try to cancel a running query for the corresponding connection.
/// If the cancellation key is not found, it will be published to Redis.
/// check_allowed - if true, check if the IP is allowed to cancel the query
pub(crate) async fn cancel_session(
&self,
key: CancelKeyData,
session_id: Uuid,
peer_addr: &IpAddr,
check_allowed: bool,
) -> Result<(), CancelError> {
// TODO: check for unspecified address is only for backward compatibility, should be removed
if !peer_addr.is_unspecified() {
let subnet_key = match *peer_addr {
IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here
IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()),
};
if !self.limiter.lock().unwrap().check(subnet_key, 1) {
tracing::debug!("Rate limit exceeded. Skipping cancellation message");
Metrics::get()
.proxy
.cancellation_requests_total
.inc(CancellationRequest {
source: self.from,
kind: crate::metrics::CancellationOutcome::RateLimitExceeded,
});
return Err(CancelError::RateLimit);
}
}
// NB: we should immediately release the lock after cloning the token.
let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
tracing::warn!("query cancellation key not found: {key}");
@@ -96,7 +135,13 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
source: self.from,
kind: crate::metrics::CancellationOutcome::NotFound,
});
match self.client.try_publish(key, session_id).await {
if session_id == Uuid::nil() {
// was already published, do not publish it again
return Ok(());
}
match self.client.try_publish(key, session_id, *peer_addr).await {
Ok(()) => {} // do nothing
Err(e) => {
return Err(CancelError::IO(std::io::Error::new(
@@ -107,6 +152,13 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
}
return Ok(());
};
if check_allowed
&& !check_peer_addr_is_in_list(peer_addr, cancel_closure.ip_allowlist.as_slice())
{
return Err(CancelError::IpNotAllowed);
}
Metrics::get()
.proxy
.cancellation_requests_total
@@ -135,13 +187,29 @@ impl CancellationHandler<()> {
map,
client: (),
from,
limiter: Arc::new(std::sync::Mutex::new(
LeakyBucketRateLimiter::<IpSubnetKey>::new_with_shards(
LeakyBucketRateLimiter::<IpSubnetKey>::DEFAULT,
64,
),
)),
}
}
}
impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: CancellationSource) -> Self {
Self { map, client, from }
Self {
map,
client,
from,
limiter: Arc::new(std::sync::Mutex::new(
LeakyBucketRateLimiter::<IpSubnetKey>::new_with_shards(
LeakyBucketRateLimiter::<IpSubnetKey>::DEFAULT,
64,
),
)),
}
}
}
@@ -152,13 +220,19 @@ impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
pub struct CancelClosure {
socket_addr: SocketAddr,
cancel_token: CancelToken,
ip_allowlist: Vec<IpPattern>,
}
impl CancelClosure {
pub(crate) fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self {
pub(crate) fn new(
socket_addr: SocketAddr,
cancel_token: CancelToken,
ip_allowlist: Vec<IpPattern>,
) -> Self {
Self {
socket_addr,
cancel_token,
ip_allowlist,
}
}
/// Cancels the query running on user's compute node.
@@ -168,6 +242,9 @@ impl CancelClosure {
debug!("query was cancelled");
Ok(())
}
pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec<IpPattern>) {
self.ip_allowlist = ip_allowlist;
}
}
/// Helper for registering query cancellation tokens.
@@ -229,6 +306,8 @@ mod tests {
cancel_key: 0,
},
Uuid::new_v4(),
&("127.0.0.1".parse().unwrap()),
true,
)
.await
.unwrap();

View File

@@ -342,7 +342,7 @@ impl ConnCfg {
// NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
// Yet another reason to rework the connection establishing code.
let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token(), vec![]);
let connection = PostgresConnection {
stream,

View File

@@ -1,6 +1,6 @@
use std::sync::Arc;
use futures::TryFutureExt;
use futures::{FutureExt, TryFutureExt};
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, Instrument};
@@ -88,40 +88,37 @@ pub async fn task_main(
crate::metrics::Protocol::Tcp,
&config.region,
);
let span = ctx.span();
let startup = Box::pin(
handle_client(
config,
backend,
&ctx,
cancellation_handler,
socket,
conn_gauge,
)
.instrument(span.clone()),
);
let res = startup.await;
let res = handle_client(
config,
backend,
&ctx,
cancellation_handler,
socket,
conn_gauge,
)
.instrument(ctx.span())
.boxed()
.await;
match res {
Err(e) => {
// todo: log and push to ctx the error kind
ctx.set_error_kind(e.get_error_kind());
error!(parent: &span, "per-client task finished with an error: {e:#}");
error!(parent: &ctx.span(), "per-client task finished with an error: {e:#}");
}
Ok(None) => {
ctx.set_success();
}
Ok(Some(p)) => {
ctx.set_success();
ctx.log_connect();
match p.proxy_pass().instrument(span.clone()).await {
let _disconnect = ctx.log_connect();
match p.proxy_pass().await {
Ok(()) => {}
Err(ErrorSource::Client(e)) => {
error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}");
error!(?session_id, "per-client task finished with an IO error from the client: {e:#}");
}
Err(ErrorSource::Compute(e)) => {
error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}");
error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}");
}
}
}
@@ -156,16 +153,21 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let request_gauge = metrics.connection_requests.guard(proto);
let tls = config.tls_config.as_ref();
let record_handshake_error = !ctx.has_private_peer_addr();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let do_handshake = handshake(ctx, stream, tls, record_handshake_error);
let (mut stream, params) =
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(cancel_key_data) => {
return Ok(cancellation_handler
.cancel_session(cancel_key_data, ctx.session_id())
.cancel_session(
cancel_key_data,
ctx.session_id(),
&ctx.peer_addr(),
config.authentication_config.ip_allowlist_check_enabled,
)
.await
.map(|()| None)?)
}
@@ -174,7 +176,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
ctx.set_db_options(params.clone());
let user_info = match backend
let (user_info, ip_allowlist) = match backend
.authenticate(ctx, &config.authentication_config, &mut stream)
.await
{
@@ -198,6 +200,8 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
.or_else(|e| stream.throw_error(e))
.await?;
node.cancel_closure
.set_ip_allowlist(ip_allowlist.unwrap_or_default());
let session = cancellation_handler.get_session();
prepare_client_connection(&node, &session, &mut stream).await?;
@@ -212,6 +216,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
client: stream,
aux: node.aux.clone(),
compute: node,
session_id: ctx.session_id(),
_req: request_gauge,
_conn: conn_gauge,
_cancel: session,

View File

@@ -8,7 +8,7 @@ use pq_proto::StartupMessageParams;
use smol_str::SmolStr;
use tokio::sync::mpsc;
use tracing::field::display;
use tracing::{debug, info_span, Span};
use tracing::{debug, error, info_span, Span};
use try_lock::TryLock;
use uuid::Uuid;
@@ -272,11 +272,14 @@ impl RequestContext {
this.success = true;
}
pub fn log_connect(&self) {
self.0
.try_lock()
.expect("should not deadlock")
.log_connect();
pub fn log_connect(self) -> DisconnectLogger {
let mut this = self.0.into_inner();
this.log_connect();
// close current span.
this.span = Span::none();
DisconnectLogger(this)
}
pub(crate) fn protocol(&self) -> Protocol {
@@ -412,9 +415,11 @@ impl RequestContextInner {
});
}
if let Some(tx) = self.sender.take() {
tx.send(RequestData::from(&*self))
.inspect_err(|e| debug!("tx send failed: {e}"))
.ok();
// If type changes, this error handling needs to be updated.
let tx: mpsc::UnboundedSender<RequestData> = tx;
if let Err(e) = tx.send(RequestData::from(&*self)) {
error!("log_connect channel send failed: {e}");
}
}
}
@@ -423,9 +428,11 @@ impl RequestContextInner {
// Here we log the length of the session.
self.disconnect_timestamp = Some(Utc::now());
if let Some(tx) = self.disconnect_sender.take() {
tx.send(RequestData::from(&*self))
.inspect_err(|e| debug!("tx send failed: {e}"))
.ok();
// If type changes, this error handling needs to be updated.
let tx: mpsc::UnboundedSender<RequestData> = tx;
if let Err(e) = tx.send(RequestData::from(&*self)) {
error!("log_disconnect channel send failed: {e}");
}
}
}
}
@@ -434,8 +441,14 @@ impl Drop for RequestContextInner {
fn drop(&mut self) {
if self.sender.is_some() {
self.log_connect();
} else {
self.log_disconnect();
}
}
}
pub struct DisconnectLogger(RequestContextInner);
impl Drop for DisconnectLogger {
fn drop(&mut self) {
self.0.log_disconnect();
}
}

Some files were not shown because too many files have changed in this diff Show More